Skip to content

Commit ff8d2c9

Browse files
authored
[Java] Charset encoding handling improvements. (#887)
* [Java] Use StandardCharsets when generating `CharacterEncoding` methods and make alias names case-insensitive. * [Java] Use `putStringWithoutLengthAscii` for ASCII-encoded property when input is a `CharSequence`. * [Java] Use an actual `Charset` instance when invoking the String constructor and `getBytes`. * [Java] Use `ArrayUtil.EMPTY_BYTE_ARRAY` instead of creating `byte[0]` when input string is null or empty. * [Java] Add assertions around Charset usage. * [Java] Add an explicit test for case-insensitive alias and remove extra output. * [Java] Resolve canonical charset name when not a standard charset. * [Java] Do not convert character encoding to upper case, i.e. treat character encoding as case-sensitive.
1 parent 796a448 commit ff8d2c9

File tree

11 files changed

+423
-407
lines changed

11 files changed

+423
-407
lines changed

sbe-tool/src/main/java/uk/co/real_logic/sbe/generation/golang/GolangGenerator.java

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import uk.co.real_logic.sbe.generation.CodeGenerator;
2121
import org.agrona.generation.OutputManager;
2222
import uk.co.real_logic.sbe.generation.Generators;
23+
import uk.co.real_logic.sbe.generation.java.JavaUtil;
2324
import uk.co.real_logic.sbe.ir.*;
2425
import org.agrona.Verify;
2526

@@ -244,29 +245,31 @@ private void generateCharacterEncodingRangeCheck(
244245

245246
if (null != characterEncoding)
246247
{
247-
switch (token.encoding().characterEncoding())
248+
if (JavaUtil.isAsciiEncoding(characterEncoding))
248249
{
249-
case "ASCII":
250-
imports.peek().add("fmt");
251-
sb.append(String.format(
252-
"\tfor idx, ch := range %1$s {\n" +
253-
"\t\tif ch > 127 {\n" +
254-
"\t\t\treturn fmt.Errorf(\"%1$s[%%d]=%%d" +
255-
" failed ASCII validation\", idx, ch)\n" +
256-
"\t\t}\n" +
257-
"\t}\n",
258-
varName));
259-
break;
260-
261-
case "UTF-8":
262-
imports.peek().add("errors");
263-
imports.peek().add("unicode/utf8");
264-
sb.append(String.format(
265-
"\tif !utf8.Valid(%1$s[:]) {\n" +
266-
"\t\treturn errors.New(\"%1$s failed UTF-8 validation\")\n" +
267-
"\t}\n",
268-
varName));
269-
break;
250+
imports.peek().add("fmt");
251+
sb.append(String.format(
252+
"\tfor idx, ch := range %1$s {\n" +
253+
"\t\tif ch > 127 {\n" +
254+
"\t\t\treturn fmt.Errorf(\"%1$s[%%d]=%%d" +
255+
" failed ASCII validation\", idx, ch)\n" +
256+
"\t\t}\n" +
257+
"\t}\n",
258+
varName));
259+
}
260+
else if (JavaUtil.isUtf8Encoding(characterEncoding))
261+
{
262+
imports.peek().add("errors");
263+
imports.peek().add("unicode/utf8");
264+
sb.append(String.format(
265+
"\tif !utf8.Valid(%1$s[:]) {\n" +
266+
"\t\treturn errors.New(\"%1$s failed UTF-8 validation\")\n" +
267+
"\t}\n",
268+
varName));
269+
}
270+
else
271+
{
272+
throw new IllegalArgumentException("Unsupported encoding: " + characterEncoding);
270273
}
271274
}
272275
}
@@ -1836,7 +1839,7 @@ private void generateCompositePropertyElements(
18361839
final String containingTypeName,
18371840
final List<Token> tokens)
18381841
{
1839-
for (int i = 0; i < tokens.size();)
1842+
for (int i = 0; i < tokens.size(); )
18401843
{
18411844
final Token token = tokens.get(i);
18421845
final String propertyName = formatPropertyName(token.name());

sbe-tool/src/main/java/uk/co/real_logic/sbe/generation/java/JavaGenerator.java

Lines changed: 16 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -921,25 +921,16 @@ private void generateDataDecodeMethods(
921921
indent + " }\n\n" +
922922
indent + " final byte[] tmp = new byte[dataLength];\n" +
923923
indent + " buffer.getBytes(limit + headerLength, tmp, 0, dataLength);\n\n" +
924-
indent + " final String value;\n" +
925-
indent + " try\n" +
926-
indent + " {\n" +
927-
indent + " value = new String(tmp, \"%6$s\");\n" +
928-
indent + " }\n" +
929-
indent + " catch (final java.io.UnsupportedEncodingException ex)\n" +
930-
indent + " {\n" +
931-
indent + " throw new RuntimeException(ex);\n" +
932-
indent + " }\n\n" +
933-
indent + " return value;\n" +
924+
indent + " return new String(tmp, %6$s);\n" +
934925
indent + " }\n",
935926
formatPropertyName(propertyName),
936927
generateStringNotPresentCondition(token.version(), indent),
937928
sizeOfLengthField,
938929
PrimitiveType.UINT32 == lengthType ? "(int)" : "",
939930
generateGet(lengthType, "limit", byteOrderStr),
940-
characterEncoding);
931+
charset(characterEncoding));
941932

942-
if (characterEncoding.contains("ASCII"))
933+
if (isAsciiEncoding(characterEncoding))
943934
{
944935
new Formatter(sb).format("\n" +
945936
indent + " public int get%1$s(final Appendable appendable)\n" +
@@ -1050,7 +1041,7 @@ private void generateCharArrayEncodeMethods(
10501041
{
10511042
final PrimitiveType lengthPutType = PrimitiveType.UINT32 == lengthType ? PrimitiveType.INT32 : lengthType;
10521043

1053-
if (characterEncoding.contains("ASCII"))
1044+
if (isAsciiEncoding(characterEncoding))
10541045
{
10551046
new Formatter(sb).format("\n" +
10561047
indent + " public %1$s %2$s(final String value)\n" +
@@ -1099,16 +1090,8 @@ private void generateCharArrayEncodeMethods(
10991090
new Formatter(sb).format("\n" +
11001091
indent + " public %1$s %2$s(final String value)\n" +
11011092
indent + " {\n" +
1102-
indent + " final byte[] bytes;\n" +
1103-
indent + " try\n" +
1104-
indent + " {\n" +
1105-
indent + " bytes = null == value || value.isEmpty() ?" +
1106-
" org.agrona.collections.ArrayUtil.EMPTY_BYTE_ARRAY : value.getBytes(\"%3$s\");\n" +
1107-
indent + " }\n" +
1108-
indent + " catch (final java.io.UnsupportedEncodingException ex)\n" +
1109-
indent + " {\n" +
1110-
indent + " throw new RuntimeException(ex);\n" +
1111-
indent + " }\n\n" +
1093+
indent + " final byte[] bytes = (null == value || value.isEmpty()) ?" +
1094+
" org.agrona.collections.ArrayUtil.EMPTY_BYTE_ARRAY : value.getBytes(%3$s);\n\n" +
11121095
indent + " final int length = bytes.length;\n" +
11131096
indent + " if (length > %4$d)\n" +
11141097
indent + " {\n" +
@@ -1123,7 +1106,7 @@ private void generateCharArrayEncodeMethods(
11231106
indent + " }\n",
11241107
className,
11251108
formatPropertyName(propertyName),
1126-
characterEncoding,
1109+
charset(characterEncoding),
11271110
maxLengthValue,
11281111
sizeOfLengthField,
11291112
generatePut(lengthPutType, "limit", "length", byteOrderStr));
@@ -2042,7 +2025,7 @@ private CharSequence generatePrimitiveArrayPropertyDecode(
20422025
fieldLength,
20432026
charset(encoding.characterEncoding()));
20442027

2045-
if (encoding.characterEncoding().contains("ASCII"))
2028+
if (isAsciiEncoding(encoding.characterEncoding()))
20462029
{
20472030
new Formatter(sb).format("\n" +
20482031
indent + " public int get%1$s(final Appendable value)\n" +
@@ -2240,7 +2223,7 @@ private void generateCharArrayEncodeMethods(
22402223
fieldLength,
22412224
offset);
22422225

2243-
if (encoding.characterEncoding().contains("ASCII"))
2226+
if (isAsciiEncoding(encoding.characterEncoding()))
22442227
{
22452228
new Formatter(sb).format("\n" +
22462229
indent + " public %1$s %2$s(final String src)\n" +
@@ -2274,15 +2257,10 @@ private void generateCharArrayEncodeMethods(
22742257
indent + " throw new IndexOutOfBoundsException(" +
22752258
"\"CharSequence too large for copy: byte length=\" + srcLength);\n" +
22762259
indent + " }\n\n" +
2277-
indent + " for (int i = 0; i < srcLength; ++i)\n" +
2278-
indent + " {\n" +
2279-
indent + " final char charValue = src.charAt(i);\n" +
2280-
indent + " final byte byteValue = charValue > 127 ? (byte)'?' : (byte)charValue;\n" +
2281-
indent + " buffer.putByte(offset + %4$d + i, byteValue);\n" +
2282-
indent + " }\n\n" +
2283-
indent + " for (int i = srcLength; i < length; ++i)\n" +
2260+
indent + " buffer.putStringWithoutLengthAscii(offset + %4$d, src);\n\n" +
2261+
indent + " for (int start = srcLength; start < length; ++start)\n" +
22842262
indent + " {\n" +
2285-
indent + " buffer.putByte(offset + %4$d + i, (byte)0);\n" +
2263+
indent + " buffer.putByte(offset + %4$d + start, (byte)0);\n" +
22862264
indent + " }\n\n" +
22872265
indent + " return this;\n" +
22882266
indent + " }\n",
@@ -2297,7 +2275,8 @@ private void generateCharArrayEncodeMethods(
22972275
indent + " public %s %s(final String src)\n" +
22982276
indent + " {\n" +
22992277
indent + " final int length = %d;\n" +
2300-
indent + " final byte[] bytes = null == src ? new byte[0] : src.getBytes(%s);\n" +
2278+
indent + " final byte[] bytes = (null == src || src.isEmpty()) ?" +
2279+
" org.agrona.collections.ArrayUtil.EMPTY_BYTE_ARRAY : src.getBytes(%s);\n" +
23012280
indent + " if (bytes.length > length)\n" +
23022281
indent + " {\n" +
23032282
indent + " throw new IndexOutOfBoundsException(" +
@@ -2387,7 +2366,7 @@ private static void generateCharacterEncodingMethod(
23872366
sb.append("\n")
23882367
.append(indent).append(" public static String ").append(propName).append("CharacterEncoding()\n")
23892368
.append(indent).append(" {\n")
2390-
.append(indent).append(" return \"").append(characterEncoding).append("\";\n")
2369+
.append(indent).append(" return ").append(charsetName(characterEncoding)).append(";\n")
23912370
.append(indent).append(" }\n");
23922371
}
23932372
}
@@ -3537,7 +3516,7 @@ private void appendDecoderDisplay(
35373516
}
35383517
else
35393518
{
3540-
if (characterEncoding.contains("ASCII") || characterEncoding.contains("ascii"))
3519+
if (isAsciiEncoding(characterEncoding))
35413520
{
35423521
append(sb, indent, "builder.append('\\'');");
35433522
append(sb, indent, formatGetterName(varDataToken.name()) + "(builder);");

sbe-tool/src/main/java/uk/co/real_logic/sbe/generation/java/JavaUtil.java

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,20 +18,19 @@
1818
import org.agrona.Strings;
1919
import uk.co.real_logic.sbe.PrimitiveType;
2020
import uk.co.real_logic.sbe.SbeTool;
21+
import uk.co.real_logic.sbe.ValidationUtil;
2122
import uk.co.real_logic.sbe.generation.Generators;
2223
import uk.co.real_logic.sbe.ir.Token;
23-
import uk.co.real_logic.sbe.ValidationUtil;
2424

2525
import java.io.IOException;
2626
import java.lang.reflect.Field;
27+
import java.lang.reflect.Modifier;
2728
import java.nio.charset.Charset;
2829
import java.nio.charset.StandardCharsets;
2930
import java.util.EnumMap;
3031
import java.util.HashMap;
3132
import java.util.Map;
3233

33-
import static java.lang.reflect.Modifier.STATIC;
34-
3534
/**
3635
* Utilities for mapping between {@link uk.co.real_logic.sbe.ir.Ir} and the Java language.
3736
*/
@@ -96,19 +95,33 @@ public String toString()
9695
/**
9796
* Indexes known charset aliases to the name of the instance in {@link StandardCharsets}.
9897
*/
99-
private static final Map<String, String> STD_CHARSETS = new HashMap<>();
98+
static final HashMap<String, String> STD_CHARSETS = new HashMap<>();
10099

101100
static
102101
{
103102
try
104103
{
105104
for (final Field field : StandardCharsets.class.getDeclaredFields())
106105
{
107-
if (Charset.class.isAssignableFrom(field.getType()) && ((field.getModifiers() & STATIC) == STATIC))
106+
if (Charset.class.isAssignableFrom(field.getType()) && Modifier.isStatic(field.getModifiers()) &&
107+
Modifier.isPublic(field.getModifiers()))
108108
{
109109
final Charset charset = (Charset)field.get(null);
110-
STD_CHARSETS.put(charset.name(), field.getName());
111-
charset.aliases().forEach((alias) -> STD_CHARSETS.put(alias, field.getName()));
110+
final String name = field.getName();
111+
String oldName = STD_CHARSETS.put(charset.name(), name);
112+
if (null != oldName)
113+
{
114+
throw new IllegalStateException("Duplicate charset alias: old=" + oldName + ", new=" + name);
115+
}
116+
for (final String alias : charset.aliases())
117+
{
118+
oldName = STD_CHARSETS.put(alias, name);
119+
if (null != oldName)
120+
{
121+
throw new IllegalStateException("Duplicate charset alias: old=" + oldName + ", new=" +
122+
alias);
123+
}
124+
}
112125
}
113126
}
114127
}
@@ -207,10 +220,52 @@ public static String charset(final String encoding)
207220
}
208221
else
209222
{
210-
return "java.nio.charset.Charset.forName(\"" + encoding + "\")";
223+
final String canonicalName = Charset.isSupported(encoding) ? Charset.forName(encoding).name() : encoding;
224+
return "java.nio.charset.Charset.forName(\"" + canonicalName + "\")";
211225
}
212226
}
213227

228+
/**
229+
* Code to fetch the name of the {@link Charset} given the encoding.
230+
*
231+
* @param encoding as a string name (e.g. UTF-8).
232+
* @return the code to fetch the associated Charset name.
233+
*/
234+
public static String charsetName(final String encoding)
235+
{
236+
final String charsetName = STD_CHARSETS.get(encoding);
237+
if (charsetName != null)
238+
{
239+
return "java.nio.charset.StandardCharsets." + charsetName + ".name()";
240+
}
241+
else
242+
{
243+
return "\"" + (Charset.isSupported(encoding) ? Charset.forName(encoding).name() : encoding) + "\"";
244+
}
245+
}
246+
247+
/**
248+
* Checks if the given encoding represents an ASCII charset.
249+
*
250+
* @param encoding as a string name (e.g. ASCII).
251+
* @return {@code true} if the encoding denotes an ASCII charset.
252+
*/
253+
public static boolean isAsciiEncoding(final String encoding)
254+
{
255+
return "US_ASCII".equals(STD_CHARSETS.get(encoding));
256+
}
257+
258+
/**
259+
* Checks if the given encoding represents a UTF-8 charset.
260+
*
261+
* @param encoding as a string name (e.g. unicode-1-1-utf-8).
262+
* @return {@code true} if the encoding denotes a UTF-8 charset.
263+
*/
264+
public static boolean isUtf8Encoding(final String encoding)
265+
{
266+
return "UTF_8".equals(STD_CHARSETS.get(encoding));
267+
}
268+
214269
/**
215270
* Generate a literal value to be used in code generation.
216271
*

sbe-tool/src/main/java/uk/co/real_logic/sbe/generation/rust/RustGenerator.java

Lines changed: 22 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import uk.co.real_logic.sbe.PrimitiveType;
2121
import uk.co.real_logic.sbe.generation.CodeGenerator;
2222
import uk.co.real_logic.sbe.generation.Generators;
23+
import uk.co.real_logic.sbe.generation.java.JavaUtil;
2324
import uk.co.real_logic.sbe.ir.Encoding;
2425
import uk.co.real_logic.sbe.ir.Ir;
2526
import uk.co.real_logic.sbe.ir.Signal;
@@ -304,20 +305,15 @@ static void generateEncoderVarData(
304305

305306
final String varDataType;
306307
final String toBytesFn;
307-
switch (characterEncoding)
308+
if (JavaUtil.isUtf8Encoding(characterEncoding))
308309
{
309-
case "UTF-8":
310-
{
311-
varDataType = "&str";
312-
toBytesFn = ".as_bytes()";
313-
break;
314-
}
315-
default:
316-
{
317-
varDataType = "&[u8]";
318-
toBytesFn = "";
319-
break;
320-
}
310+
varDataType = "&str";
311+
toBytesFn = ".as_bytes()";
312+
}
313+
else
314+
{
315+
varDataType = "&[u8]";
316+
toBytesFn = "";
321317
}
322318

323319
// function to write slice ... todo - handle character encoding ?
@@ -681,23 +677,20 @@ private static void generatePrimitiveConstantDecoder(
681677
indent(sb, level, "/// characterEncoding: '%s'\n", characterEncoding);
682678
indent(sb, level, "#[inline]\n");
683679

684-
switch (characterEncoding)
680+
if (JavaUtil.isAsciiEncoding(characterEncoding))
685681
{
686-
case "US-ASCII":
687-
{
688-
indent(sb, level, "pub fn %s(&self) -> &'static [u8] {\n",
689-
formatFunctionName(name));
690-
indent(sb, level + 1, "b\"%s\"\n", rawConstValue);
691-
break;
692-
}
693-
case "UTF-8":
694-
{
695-
indent(sb, level, "pub fn %s(&self) -> &'static str {\n", formatFunctionName(name));
696-
indent(sb, level + 1, "\"%s\"\n", rawConstValue);
697-
break;
698-
}
699-
default:
700-
throw new RuntimeException("Unable to handle " + characterEncoding);
682+
indent(sb, level, "pub fn %s(&self) -> &'static [u8] {\n",
683+
formatFunctionName(name));
684+
indent(sb, level + 1, "b\"%s\"\n", rawConstValue);
685+
}
686+
else if (JavaUtil.isUtf8Encoding(characterEncoding))
687+
{
688+
indent(sb, level, "pub fn %s(&self) -> &'static str {\n", formatFunctionName(name));
689+
indent(sb, level + 1, "\"%s\"\n", rawConstValue);
690+
}
691+
else
692+
{
693+
throw new IllegalArgumentException("Unsupported encoding: " + characterEncoding);
701694
}
702695

703696
indent(sb, level, "}\n\n");

0 commit comments

Comments
 (0)