diff --git a/gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java b/gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java index a0939d68b97..14ecce14866 100644 --- a/gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java +++ b/gobblin-api/src/main/java/org/apache/gobblin/compat/hadoop/TextSerializer.java @@ -19,7 +19,6 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; -import java.nio.charset.StandardCharsets; /** @@ -31,20 +30,27 @@ public class TextSerializer { * Serialize a String using the same logic as a Hadoop Text object */ public static void writeStringAsText(DataOutput stream, String str) throws IOException { - byte[] utf8Encoded = str.getBytes(StandardCharsets.UTF_8); - writeVLong(stream, utf8Encoded.length); - stream.write(utf8Encoded); + // TODO: Use writeChars instead of writeBytes to support unicode + for (int i = 0; i < str.length(); i++) { + if (str.charAt(i) > 0x7F) { + throw new IllegalArgumentException("Non-ASCII character detected."); + } + } + writeVLong(stream, str.length()); + stream.writeBytes(str); } /** * Deserialize a Hadoop Text object into a String */ public static String readTextAsString(DataInput in) throws IOException { - int bufLen = (int)readVLong(in); - byte[] buf = new byte[bufLen]; - in.readFully(buf); + int bufLen = (int) readVLong(in); + StringBuilder sb = new StringBuilder(); - return new String(buf, StandardCharsets.UTF_8); + for (int i = 0; i < bufLen; i++) { + sb.append((char) in.readByte()); + } + return sb.toString(); } /** diff --git a/gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java b/gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java index 04ba79f9b66..30906c92222 100644 --- a/gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java +++ b/gobblin-api/src/test/java/org/apache/gobblin/compat/TextSerializerTest.java @@ -33,6 +33,26 @@ public class TextSerializerTest { private static final String[] textsToSerialize = new String[]{"abracadabra", Strings.repeat("longString", 128000)}; + private static final String[] serializationErrorText = new String[]{".߸´ˇ", Strings.repeat("ˀ.¸¯.", 128000)}; + + @Test + public void testSerializeError() throws IOException { + // Use our serializer, verify Hadoop deserializer can read it back + for (String textToSerialize : serializationErrorText) { + ByteArrayOutputStream bOs = new ByteArrayOutputStream(); + DataOutputStream dataOutputStream = new DataOutputStream(bOs); + + try { + TextSerializer.writeStringAsText(dataOutputStream, textToSerialize); + Assert.fail("Expected IOException not thrown"); + } catch (Exception e) { + Assert.assertTrue(e instanceof IllegalArgumentException); + // Expected exception + } finally { + dataOutputStream.close(); + } + } + } @Test public void testSerialize()