diff --git a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs
index 56f0eac7..173b6a7a 100644
--- a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs
+++ b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs
@@ -11,6 +11,7 @@
 using System;
 using System.Linq;
 using System.Text;
+using System.Text.RegularExpressions;
 using System.Xml;
 using System.Xml.Linq;
 using System.Xml.Schema;
@@ -173,7 +174,7 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
 					if (runText != string.Empty && runText.All(char.IsWhiteSpace))
 						writer.WriteAttributeString("xml", "space", "", "preserve");
 					// TODO: should we escape quotation marks? this is not necessary but different than the behavior of the C++ implementation
-					writer.WriteString(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC));
+					writer.WriteString(Normalizer.Normalize(StripInvalidXmlChars(runText), Normalizer.UNormalizationMode.UNORM_NFC));
 				}
 
 				writer.WriteEndElement();
@@ -187,6 +188,31 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
 			return xml.ToString();
 		}
 
+		/// <summary>
+		/// Matches UTF-16 code units that cannot appear in XML 1.0 character data: C0 control
+		/// characters other than tab, line feed and carriage return, the noncharacters U+FFFE and
+		/// U+FFFF, and surrogates that are not part of a high/low pair.
+		/// </summary>
+		private static readonly Regex InvalidXmlRegex = new Regex(
+			@"[\x00-\x08\x0B\x0C\x0E-\x1F\uFFFE\uFFFF]" +
+			@"|[\uD800-\uDBFF](?![\uDC00-\uDFFF])" +
+			@"|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]",
+			RegexOptions.Compiled);
+
+		/// <summary>
+		/// Removes every character that is not allowed in XML 1.0 from <paramref name="text"/>.
+		/// Strip before normalizing: removing a character can bring a base character and a
+		/// combining mark together, so text that was normalized first may no longer be NFC.
+		/// </summary>
+		/// <param name="text">The text to clean; may be null or empty.</param>
+		/// <returns>The text without the invalid characters; null or empty input is returned unchanged.</returns>
+		public static string StripInvalidXmlChars(string text)
+		{
+			// Regex.Replace throws ArgumentNullException for null input, so hand null/empty straight back.
+			if (string.IsNullOrEmpty(text))
+				return text;
+			// Valid characters are documented here: https://en.wikipedia.org/wiki/Valid_characters_in_XML
+			return InvalidXmlRegex.Replace(text, string.Empty);
+		}
+
 		#endregion
 
 		#region Serialization Helper Methods
diff --git a/src/SIL.LCModel/DomainImpl/Strings.cs b/src/SIL.LCModel/DomainImpl/Strings.cs
index 2220b9c0..c554f5e9 100644
--- a/src/SIL.LCModel/DomainImpl/Strings.cs
+++ b/src/SIL.LCModel/DomainImpl/Strings.cs
@@ -900,7 +900,7 @@ protected override void ToXml(XmlWriter writer, ILgWritingSystemFactory wsf, int
 
 			writer.WriteStartElement("AUni");
 			writer.WriteAttributeString("ws", m_object.Services.WritingSystemManager.Get(ws).Id);
-			text = Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC);
+			text = Normalizer.Normalize(TsStringSerializer.StripInvalidXmlChars(text), Normalizer.UNormalizationMode.UNORM_NFC);
 			writer.WriteString(text);
 			writer.WriteEndElement();
 		}
diff --git a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs
index fadb7179..374a43ab 100644
--- a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs
+++ b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs
@@ -42,6 +42,29 @@ public void SerializeTsStringToXml_Simple()
 			Assert.That(StripNewLines(xml), Is.EqualTo("This is a test!"));
 		}
 
+		[Test]
+		public void SerializeTsStringToXml_StripsInvalidControlCharacter()
+		{
+			ITsString tss = TsStringUtils.MakeString("This is a te\u0002st!", EnWS);
+			string xml = TsStringSerializer.SerializeTsStringToXml(tss, WritingSystemManager);
+			Assert.That(StripNewLines(xml), Is.EqualTo("This is a test!"));
+		}
+
+		[Test]
+		[TestCase("This is a test!")]
+		[TestCase(" 𐰉 (dǒng)")]//Nushu script
+		[TestCase("𠔤野 (Nishino)")]//Japanese Kanji
+		[TestCase("𠮷野家 (Yóu yě jiā)")]//Historic Chinese
+		[TestCase("🦊")]//emoji
+		[TestCase("\u200B\u200D\u200E\uDA00\uDC01")]
+		public void SerializeTsStringToXml_DoesNotStripValidCharacters(string word)
+		{
+			ITsString tss = TsStringUtils.MakeString(word, EnWS);
+			string xml = TsStringSerializer.SerializeTsStringToXml(tss, WritingSystemManager);
+			Assert.That(StripNewLines(xml),
+				Is.EqualTo($"{word}"));
+		}
+
 		///--------------------------------------------------------------------------------------
 		///
 		/// Tests the method SerializeTsStringToXml with a MultiString. This should
diff --git a/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs b/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs
index 55a16db6..ee6360b8 100644
--- a/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs
+++ b/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs
@@ -3,7 +3,9 @@
 // (http://www.gnu.org/licenses/lgpl-2.1.html)
 
 using System;
+using System.IO;
 using System.Linq;
+using System.Text;
 using NUnit.Framework;
 using SIL.LCModel.Core.KernelInterfaces;
 using SIL.LCModel.Core.Text;
@@ -283,6 +285,34 @@ public void AppendAlternativesTest()
 			Assert.AreEqual("Saltillo Mexico", Cache.LangProject.FieldWorkLocation.get_String(english.Handle).Text);
 			Assert.AreEqual("Saltillo Mejico", Cache.LangProject.FieldWorkLocation.get_String(spanish.Handle).Text);
 		}
+
+		[Test]
+		public void ToXml_WorksAsExpected()
+		{
+			var english = Cache.LangProject.CurrentAnalysisWritingSystems.First();
+			Cache.LangProject.MainCountry.set_String(english.Handle, TsStringUtils.MakeString("Mexico", english.Handle));
+			var xml = ToXml(Cache.LangProject.MainCountry);
+			Assert.AreEqual("Mexico", xml, "XML does not contain expected string element.");
+		}
+
+		[Test]
+		public void ToXml_WithControlCharacterWorks()
+		{
+			var english = Cache.LangProject.CurrentAnalysisWritingSystems.First();
+			var tsString = TsStringUtils.MakeString("te\u0002st", english.Handle);
+			Cache.LangProject.MainCountry.set_String(english.Handle, tsString);
+			var xml = ToXml(Cache.LangProject.MainCountry);
+			Assert.AreEqual("test", xml, "XML does not contain expected string element.");
+		}
+
+		private string ToXml(ITsMultiString multiString)
+		{
+			using var ms = new MemoryStream();
+			using var xmlWriter = XmlServices.CreateWriter(ms);
+			((MultiAccessor)multiString).ToXMLString(xmlWriter);
+			xmlWriter.Flush();
+			return Encoding.UTF8.GetString(ms.ToArray());
+		}
 	}
 
 	/// ----------------------------------------------------------------------------------------