Skip to content

Strip invalid char data from strings on save #329

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
11 changes: 10 additions & 1 deletion src/SIL.LCModel.Core/Text/TsStringSerializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
using System;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
using System.Xml.Schema;
Expand Down Expand Up @@ -173,7 +174,7 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
if (runText != string.Empty && runText.All(char.IsWhiteSpace))
writer.WriteAttributeString("xml", "space", "", "preserve");
// TODO: should we escape quotation marks? this is not necessary but different than the behavior of the C++ implementation
writer.WriteString(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC));
writer.WriteString(StripInvalidXmlChars(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC)));
}

writer.WriteEndElement();
Expand All @@ -187,6 +188,14 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
return xml.ToString();
}

/// <summary>
/// Matches each UTF-16 code unit that cannot be written as XML 1.0 character data:
/// C0 control characters other than tab, line feed and carriage return, the
/// non-characters U+FFFE and U+FFFF, and surrogate code units that are not part of a
/// well-formed high/low pair.
/// </summary>
private static readonly Regex InvalidXmlRegex = new Regex(
	@"[\x00-\x08\x0B\x0C\x0E-\x1F\uFFFE\uFFFF]" +
	// A high surrogate that is not immediately followed by a low surrogate.
	@"|[\uD800-\uDBFF](?![\uDC00-\uDFFF])" +
	// A low surrogate that is not immediately preceded by a high surrogate.
	@"|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]",
	RegexOptions.Compiled);

/// <summary>
/// Removes characters that are not allowed in XML from the given text.
/// Documented here: https://en.wikipedia.org/wiki/Valid_characters_in_XML
/// </summary>
/// <param name="text">The text to clean. May be null or empty.</param>
/// <returns>
/// The text with every invalid character removed. Well-formed surrogate pairs
/// (characters outside the Basic Multilingual Plane) are kept intact. A null or empty
/// input is returned unchanged.
/// </returns>
public static string StripInvalidXmlChars(string text)
{
	// Regex.Replace throws on a null input; there is nothing to strip in that case.
	if (string.IsNullOrEmpty(text))
		return text;

	return InvalidXmlRegex.Replace(text, string.Empty);
}

#endregion

#region Serialization Helper Methods
Expand Down
2 changes: 1 addition & 1 deletion src/SIL.LCModel/DomainImpl/Strings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -900,7 +900,7 @@ protected override void ToXml(XmlWriter writer, ILgWritingSystemFactory wsf, int

writer.WriteStartElement("AUni");
writer.WriteAttributeString("ws", m_object.Services.WritingSystemManager.Get(ws).Id);
text = Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC);
text = TsStringSerializer.StripInvalidXmlChars(Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC));
writer.WriteString(text);
writer.WriteEndElement();
}
Expand Down
23 changes: 23 additions & 0 deletions tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,29 @@ public void SerializeTsStringToXml_Simple()
Assert.That(StripNewLines(xml), Is.EqualTo("<Str><Run ws=\"en\">This is a test!</Run></Str>"));
}

/// <summary>
/// Tests that SerializeTsStringToXml drops a control character (U+0002) embedded in the
/// run text, so the serialized output matches that of the clean string.
/// </summary>
[Test]
public void SerializeTsStringToXml_StripsInvalidControlCharacter()
{
	const string expectedXml = "<Str><Run ws=\"en\">This is a test!</Run></Str>";
	ITsString input = TsStringUtils.MakeString("This is a te\u0002st!", EnWS);

	string serialized = TsStringSerializer.SerializeTsStringToXml(input, WritingSystemManager);

	Assert.That(StripNewLines(serialized), Is.EqualTo(expectedXml));
}

/// <summary>
/// Tests that SerializeTsStringToXml leaves valid text untouched: each case is expected
/// to appear verbatim inside the serialized run element.
/// </summary>
[Test]
[TestCase("This is a test!")]
[TestCase(" 𐰉 (dǒng)")] // Nushu script
[TestCase("𠔤野 (Nishino)")] // Japanese Kanji
[TestCase("𠮷野家 (Yóu yě jiā)")] // Historic Chinese
[TestCase("🦊")] // emoji
[TestCase("\u200B\u200D\u200E\uDA00\uDC01")]
public void SerializeTsStringToXml_DoesNotStripValidCharacters(string word)
{
	string expectedXml = $"<Str><Run ws=\"en\">{word}</Run></Str>";
	ITsString input = TsStringUtils.MakeString(word, EnWS);

	string serialized = TsStringSerializer.SerializeTsStringToXml(input, WritingSystemManager);

	Assert.That(StripNewLines(serialized), Is.EqualTo(expectedXml));
}

///--------------------------------------------------------------------------------------
/// <summary>
/// Tests the method SerializeTsStringToXml with a MultiString. This should
Expand Down
30 changes: 30 additions & 0 deletions tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
// (http://www.gnu.org/licenses/lgpl-2.1.html)

using System;
using System.IO;
using System.Linq;
using System.Text;
using NUnit.Framework;
using SIL.LCModel.Core.KernelInterfaces;
using SIL.LCModel.Core.Text;
Expand Down Expand Up @@ -283,6 +285,34 @@ public void AppendAlternativesTest()
Assert.AreEqual("Saltillo Mexico", Cache.LangProject.FieldWorkLocation.get_String(english.Handle).Text);
Assert.AreEqual("Saltillo Mejico", Cache.LangProject.FieldWorkLocation.get_String(spanish.Handle).Text);
}

/// <summary>
/// Tests that a plain string stored in MainCountry is written as a single AUni element
/// carrying the writing system id and the text.
/// </summary>
[Test]
public void ToXml_WorksAsExpected()
{
	var analysisWs = Cache.LangProject.CurrentAnalysisWritingSystems.First();
	var country = TsStringUtils.MakeString("Mexico", analysisWs.Handle);
	Cache.LangProject.MainCountry.set_String(analysisWs.Handle, country);

	var serialized = ToXml(Cache.LangProject.MainCountry);

	Assert.AreEqual("<AUni ws=\"en\">Mexico</AUni>", serialized, "XML does not contain expected string element.");
}

/// <summary>
/// Tests that a control character (U+0002) in the stored string is absent from the
/// AUni element that is written out.
/// </summary>
[Test]
public void ToXml_WithControlCharacterWorks()
{
	var analysisWs = Cache.LangProject.CurrentAnalysisWritingSystems.First();
	var withControlChar = TsStringUtils.MakeString("te\u0002st", analysisWs.Handle);
	Cache.LangProject.MainCountry.set_String(analysisWs.Handle, withControlChar);

	var serialized = ToXml(Cache.LangProject.MainCountry);

	Assert.AreEqual("<AUni ws=\"en\">test</AUni>", serialized, "XML does not contain expected string element.");
}

/// <summary>
/// Serializes the given multi-string through an XML writer over an in-memory stream and
/// returns the written bytes decoded as UTF-8.
/// </summary>
/// <param name="multiString">The multi-string to serialize; it is cast to MultiAccessor.</param>
/// <returns>The XML text that was written to the stream.</returns>
private string ToXml(ITsMultiString multiString)
{
	using (var buffer = new MemoryStream())
	using (var writer = XmlServices.CreateWriter(buffer))
	{
		var accessor = (MultiAccessor)multiString;
		accessor.ToXMLString(writer);

		// Flush so everything the writer has buffered reaches the stream before it is read.
		writer.Flush();
		return Encoding.UTF8.GetString(buffer.ToArray());
	}
}
}

/// ----------------------------------------------------------------------------------------
Expand Down
Loading