Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -167,19 +167,16 @@ void unescpae(TextUnit textUnit) {
}

void unescapeSource(TextUnit textUnit) {
String sourceString = textUnitUtils.getSourceAsString(textUnit);
String unescapedSourceString = unescapeUtils.replaceEscapedQuotes(sourceString);
textUnitUtils.replaceSourceString(textUnit, unescapedSourceString);
// No-op: Okapi's POFilter already fully handles all C-style escape sequences
// (\\, \n, \r, \t, \", etc.) via its own single-pass unescape in toAbstract().
// Any additional unescaping here would double-process and corrupt strings.
// For example, PO \\\" (escaped-backslash + escaped-quote) becomes \" (literal
// backslash + quote) after Okapi's unescape. Applying replaceEscapedQuotes()
// would then strip the literal backslash.
}

void unescapeTarget(TextUnit textUnit) {
TextContainer target = textUnit.getTarget(targetLocale);
if (target != null) {
String targetString = target.toString();
String unescapedTargetString = unescapeUtils.replaceEscapedQuotes(targetString);
TextContainer newTarget = new TextContainer(unescapedTargetString);
textUnit.setTarget(targetLocale, newTarget);
}
// No-op: same reasoning as unescapeSource.
}

boolean isPluralGroupStarting(Event event) {
Expand Down Expand Up @@ -290,13 +287,13 @@ void adaptTextUnitToCLDRForm(ITextUnit textUnit, String cldrPluralForm) {
// source should always be singular form for "one" form,
// this is needed for language with 6 entry like arabic
logger.debug("Set message singular: {}", msgID);
textUnit.setSource(new TextContainer(unescapeUtils.replaceEscapedQuotes(msgID)));
textUnit.setSource(new TextContainer(unescapeUtils.unescape(msgID)));
} else {
// source should always be plural form unless for "one" form,
// this is needed for language with only one entry like
// japanese: [0] --> other
logger.debug("Set message plural: {}", msgIDPlural);
textUnit.setSource(new TextContainer(unescapeUtils.replaceEscapedQuotes(msgIDPlural)));
textUnit.setSource(new TextContainer(unescapeUtils.unescape(msgIDPlural)));
}
}

Expand Down Expand Up @@ -361,10 +358,12 @@ void renameTextUnitWithSourceAndContent(ITextUnit textUnit) {

Property property = textUnit.getProperty(POFilter.PROPERTY_CONTEXT);

StringBuilder newName = new StringBuilder(msgID);
// Unescape msgID for the name (backslash, newline, carriage return, quotes)
StringBuilder newName = new StringBuilder(unescapeUtils.unescape(msgID));

if (property != null) {
newName.append(" --- ").append(property.getValue());
// Also unescape the context value
newName.append(" --- ").append(unescapeUtils.unescape(property.getValue()));
}

if (poPluralForm != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
import net.sf.okapi.common.encoder.IEncoder;

/**
* Encoder to handle escaping \n, \r, double-quotes.
* Encoder to handle escaping backslash, \n, \r, \t, double-quotes.
*
* <p>Follows C-style string escaping as required by the GNU PO file format.
*
* @author jyi
*/
Expand Down Expand Up @@ -50,12 +52,18 @@ public String encode(char value, EncoderContext context) {
String res;

switch (value) {
case '\\':
res = "\\\\";
break;
case '\n':
res = "\\n";
break;
case '\r':
res = "\\r";
break;
case '\t':
res = "\\t";
break;
case '"':
res = "\\\"";
break;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package com.box.l10n.mojito.okapi.filters;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -14,6 +15,7 @@ public class UnescapeUtils {
/** Logger */
static Logger logger = LoggerFactory.getLogger(UnescapeUtils.class);

private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
private static final Pattern ESCAPED_CARIAGE_RETURN = Pattern.compile("\\\\r");
private static final Pattern ESCAPED_LINE_FEED = Pattern.compile("\\\\n");
private static final Pattern ESCAPED_QUOTES = Pattern.compile("\\\\(\"|')");
Expand All @@ -24,16 +26,84 @@ public class UnescapeUtils {
private static final Pattern LINE_FEED = Pattern.compile("\n");

/**
* Unescapes line feed, carriage return, single quote and double quote
* Single-pass pattern for C-style escape sequences used in GNU PO files. Each match is exactly two
* characters long and starts with a backslash, so the three-character input {@code \\n} matches
* {@code \\} first (→ {@code \}), leaving {@code n} as a literal — not {@code \n} (newline).
*
* <p>Covers the same set as Okapi's {@code POFilter.unescape()}: {@code \\[abfnrtv"'\\]}.
*/
private static final Pattern ESCAPE_SEQUENCE = Pattern.compile("\\\\[abfnrtv\"'\\\\]");

/**
 * Decodes the C-style escape sequences of the GNU PO file format in one left-to-right pass.
 *
 * <p>Recognized sequences and their results: {@code \\} → backslash, {@code \n} → line feed,
 * {@code \r} → carriage return, {@code \t} → tab, {@code \"} → double quote, {@code \'} → single
 * quote, {@code \a} → bell, {@code \b} → backspace, {@code \f} → form feed, {@code \v} → vertical
 * tab. Everything else, including a backslash that does not begin one of these sequences, is
 * copied to the result unchanged.
 *
 * <p>Each sequence is replaced where it is found and the replacement is never scanned again. This
 * matters for inputs such as an escaped backslash followed by a literal {@code n}: replacing one
 * kind of sequence after another would turn that into a line feed, whereas a single pass yields a
 * backslash followed by {@code n}.
 *
 * @param text the escaped text
 * @return the text with every recognized escape sequence decoded
 */
public String unescape(String text) {
  return ESCAPE_SEQUENCE
      .matcher(text)
      // quoteReplacement keeps a decoded backslash from being read as replacement syntax
      .replaceAll(found -> Matcher.quoteReplacement(decodeEscapeSequence(found.group())));
}

/**
 * Maps one matched escape sequence to the text it stands for.
 *
 * @param sequence a two-character sequence whose first character is the backslash
 * @return the decoded text, or {@code sequence} itself when the second character is not one of
 *     the handled escape codes
 */
private static String decodeEscapeSequence(String sequence) {
  char code = sequence.charAt(1);
  switch (code) {
    case 'n':
      return "\n";
    case 'r':
      return "\r";
    case 't':
      return "\t";
    case '"':
      return "\"";
    case '\'':
      return "'";
    case '\\':
      return "\\";
    case 'a':
      return "\u0007"; // bell
    case 'b':
      return "\b"; // backspace
    case 'f':
      return "\f"; // form feed
    case 'v':
      return "\u000B"; // vertical tab
    default:
      // Not expected for matches of ESCAPE_SEQUENCE; hand the sequence back untouched.
      return sequence;
  }
}

/**
* Replaces \\\\ with \\
*
* @param text
* @return
*/
public String unescape(String text) {
String unescapedText = replaceEscapedCarriageReturn(text);
unescapedText = replaceEscapedLineFeed(unescapedText);
unescapedText = replaceEscapedQuotes(unescapedText);
return unescapedText;
String replaceEscapedBackslash(String text) {
return ESCAPED_BACKSLASH.matcher(text).replaceAll("\\\\");
}

String replaceEscapedCarriageReturn(String text) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import com.box.l10n.mojito.okapi.asset.AssetPathToFilterConfigMapper;
import com.box.l10n.mojito.okapi.asset.FilterConfigurationMappers;
import com.box.l10n.mojito.okapi.asset.UnsupportedAssetFilterTypeException;
import com.box.l10n.mojito.okapi.filters.UnescapeUtils;
import java.util.Arrays;
import java.util.List;
import org.assertj.core.api.Assertions;
Expand All @@ -25,6 +26,7 @@
AssetPathToFilterConfigMapper.class,
FilterConfigurationMappers.class,
TextUnitUtils.class,
UnescapeUtils.class,
AssetExtractorTest.class
})
@EnableSpringConfigured
Expand Down Expand Up @@ -253,4 +255,133 @@ public void documentNoPartExtraction() throws UnsupportedAssetFilterTypeExceptio
"34a6a48789dd1ff7dff813a8fb627b91-8f1bdae06589d55b62184a76e0e70d0e-1",
"Image in text <br id='p1'/>."));
}

/**
 * Extracts a POT document whose msgid contains escaped backslashes and expects both the text unit
 * name and its source to carry single, literal backslashes.
 */
@Test
public void extractPoWithBackslash() throws UnsupportedAssetFilterTypeException {
  String potFile =
      String.join(
              "\n",
              "msgid \"\"",
              "msgstr \"\"",
              "\"Content-Type: text/plain; charset=utf-8\\n\"",
              "",
              "#. Path with backslash",
              "msgid \"C:\\\\Users\\\\test\"",
              "msgstr \"\"")
          + "\n";

  List<AssetExtractorTextUnit> extracted =
      assetExtractor.getAssetExtractorTextUnitsForAsset("messages.pot", potFile, null, null);

  // Each doubled backslash in the msgid collapses to one backslash after extraction.
  String unescaped = "C:\\Users\\test";
  Assertions.assertThat(extracted)
      .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
      .containsExactly(tuple(unescaped, unescaped));
}

/**
 * Extracts a POT document whose msgid contains an escaped line feed and expects both the text
 * unit name and its source to contain a real line feed character.
 */
@Test
public void extractPoWithNewlineEscape() throws UnsupportedAssetFilterTypeException {
  String potFile =
      String.join(
              "\n",
              "msgid \"\"",
              "msgstr \"\"",
              "\"Content-Type: text/plain; charset=utf-8\\n\"",
              "",
              "#. Multi-line text",
              "msgid \"line1\\nline2\"",
              "msgstr \"\"")
          + "\n";

  List<AssetExtractorTextUnit> extracted =
      assetExtractor.getAssetExtractorTextUnitsForAsset("messages.pot", potFile, null, null);

  // The two-character escape in the msgid becomes an actual line break.
  String unescaped = "line1\nline2";
  Assertions.assertThat(extracted)
      .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
      .containsExactly(tuple(unescaped, unescaped));
}

/**
 * Extracts a POT document whose msgid contains escaped double quotes and expects both the text
 * unit name and its source to contain plain double quotes.
 */
@Test
public void extractPoWithQuoteEscape() throws UnsupportedAssetFilterTypeException {
  String potFile =
      String.join(
              "\n",
              "msgid \"\"",
              "msgstr \"\"",
              "\"Content-Type: text/plain; charset=utf-8\\n\"",
              "",
              "#. Text with quotes",
              "msgid \"say \\\"hello\\\"\"",
              "msgstr \"\"")
          + "\n";

  List<AssetExtractorTextUnit> extracted =
      assetExtractor.getAssetExtractorTextUnitsForAsset("messages.pot", potFile, null, null);

  // The backslashes protecting the quotes are dropped; the quotes themselves remain.
  String unescaped = "say \"hello\"";
  Assertions.assertThat(extracted)
      .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
      .containsExactly(tuple(unescaped, unescaped));
}

/**
 * Extracts a POT document whose msgid mixes escaped backslashes, an escaped line feed and escaped
 * double quotes, and expects all of them to be decoded in both the text unit name and its source.
 */
@Test
public void extractPoWithComplexEscapes() throws UnsupportedAssetFilterTypeException {
  String potFile =
      String.join(
              "\n",
              "msgid \"\"",
              "msgstr \"\"",
              "\"Content-Type: text/plain; charset=utf-8\\n\"",
              "",
              "#. Complex escapes",
              "msgid \"path\\\\to\\\\file\\nwith \\\"quotes\\\"\"",
              "msgstr \"\"")
          + "\n";

  List<AssetExtractorTextUnit> extracted =
      assetExtractor.getAssetExtractorTextUnitsForAsset("messages.pot", potFile, null, null);

  // Every escape kind in the msgid is expected to be decoded at once.
  String unescaped = "path\\to\\file\nwith \"quotes\"";
  Assertions.assertThat(extracted)
      .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
      .containsExactly(tuple(unescaped, unescaped));
}

/**
 * Extracts a POT document whose msgctxt contains an escaped backslash and expects the text unit
 * name to be the msgid joined to the decoded context, while the source is the msgid alone.
 */
@Test
public void extractPoWithContext() throws UnsupportedAssetFilterTypeException {
  String potFile =
      String.join(
              "\n",
              "msgid \"\"",
              "msgstr \"\"",
              "\"Content-Type: text/plain; charset=utf-8\\n\"",
              "",
              "#. Context test",
              "msgctxt \"menu\\\\file\"",
              "msgid \"Open\"",
              "msgstr \"\"")
          + "\n";

  List<AssetExtractorTextUnit> extracted =
      assetExtractor.getAssetExtractorTextUnitsForAsset("messages.pot", potFile, null, null);

  // The context part of the name carries a single backslash; the source is untouched.
  String expectedName = "Open --- menu\\file";
  String expectedSource = "Open";
  Assertions.assertThat(extracted)
      .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
      .containsExactly(tuple(expectedName, expectedSource));
}

/**
 * Extracts a POT document with a realistic validation message that quotes both a forward slash
 * and a backslash, and expects the decoded message as both the text unit name and its source.
 *
 * <p>In the msgid the double quotes and the backslash are each escaped with a leading backslash.
 */
@Test
public void extractPoWithRealisticBackslashMessage() throws UnsupportedAssetFilterTypeException {
  String potFile =
      String.join(
              "\n",
              "msgid \"\"",
              "msgstr \"\"",
              "\"Content-Type: text/plain; charset=utf-8\\n\"",
              "",
              "#. File name validation error",
              "msgid \"You are not able to use \\\"/\\\" or \\\"\\\\\\\" in text files\"",
              "msgstr \"\"")
          + "\n";

  List<AssetExtractorTextUnit> extracted =
      assetExtractor.getAssetExtractorTextUnitsForAsset("messages.pot", potFile, null, null);

  String message = "You are not able to use \"/\" or \"\\\" in text files";
  Assertions.assertThat(extracted)
      .extracting(AssetExtractorTextUnit::getName, AssetExtractorTextUnit::getSource)
      .containsExactly(tuple(message, message));
}
}
Loading