diff --git a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs
index 56f0eac7..173b6a7a 100644
--- a/src/SIL.LCModel.Core/Text/TsStringSerializer.cs
+++ b/src/SIL.LCModel.Core/Text/TsStringSerializer.cs
@@ -11,6 +11,7 @@
using System;
using System.Linq;
using System.Text;
+using System.Text.RegularExpressions;
using System.Xml;
using System.Xml.Linq;
using System.Xml.Schema;
@@ -173,7 +174,7 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
if (runText != string.Empty && runText.All(char.IsWhiteSpace))
writer.WriteAttributeString("xml", "space", "", "preserve");
// TODO: should we escape quotation marks? this is not necessary but different than the behavior of the C++ implementation
- writer.WriteString(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC));
+ writer.WriteString(StripInvalidXmlChars(Normalizer.Normalize(runText, Normalizer.UNormalizationMode.UNORM_NFC)));
}
writer.WriteEndElement();
@@ -187,6 +188,14 @@ public static string SerializeTsStringToXml(ITsString tss, ILgWritingSystemFacto
return xml.ToString();
}
+ private static readonly Regex InvalidXmlRegex = new Regex(@"[\x00-\x08\x0B\x0C\x0E-\x1F\uFFFE\uFFFF]", RegexOptions.Compiled);
+ /// <summary>Returns <paramref name="text"/> with the characters XML 1.0 forbids removed: C0 controls other than tab, LF and CR, plus U+FFFE and U+FFFF. Null or empty input is returned unchanged.</summary>
+ public static string StripInvalidXmlChars(string text)
+ {
+     // Valid ranges: https://en.wikipedia.org/wiki/Valid_characters_in_XML. The pattern does not match surrogate code units, so supplementary-plane characters pass through intact.
+     return string.IsNullOrEmpty(text) ? text : InvalidXmlRegex.Replace(text, string.Empty); // guard: Regex.Replace throws ArgumentNullException on null input
+ }
+
#endregion
#region Serialization Helper Methods
diff --git a/src/SIL.LCModel/DomainImpl/Strings.cs b/src/SIL.LCModel/DomainImpl/Strings.cs
index 2220b9c0..c554f5e9 100644
--- a/src/SIL.LCModel/DomainImpl/Strings.cs
+++ b/src/SIL.LCModel/DomainImpl/Strings.cs
@@ -900,7 +900,7 @@ protected override void ToXml(XmlWriter writer, ILgWritingSystemFactory wsf, int
writer.WriteStartElement("AUni");
writer.WriteAttributeString("ws", m_object.Services.WritingSystemManager.Get(ws).Id);
- text = Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC);
+ text = TsStringSerializer.StripInvalidXmlChars(Normalizer.Normalize(text, Normalizer.UNormalizationMode.UNORM_NFC));
writer.WriteString(text);
writer.WriteEndElement();
}
diff --git a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs
index fadb7179..374a43ab 100644
--- a/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs
+++ b/tests/SIL.LCModel.Core.Tests/Text/TsStringSerializerTests.cs
@@ -42,6 +42,29 @@ public void SerializeTsStringToXml_Simple()
Assert.That(StripNewLines(xml), Is.EqualTo("This is a test!"));
}
+ [Test]
+ public void SerializeTsStringToXml_StripsInvalidControlCharacter()
+ {
+     // Input carries U+0002 between "te" and "st". NOTE(review): the expected literal has no element markup although the serializer writes elements - confirm it is complete.
+     var serialized = TsStringSerializer.SerializeTsStringToXml(TsStringUtils.MakeString("This is a te\u0002st!", EnWS), WritingSystemManager);
+     Assert.That(StripNewLines(serialized), Is.EqualTo("This is a test!"));
+ }
+
+ [Test]
+ [TestCase("This is a test!")]
+ [TestCase(" 𐰉 (dǒng)")] // Nushu script
+ [TestCase("𠔤野 (Nishino)")] // Japanese Kanji
+ [TestCase("𠮷野家 (Yóu yě jiā)")] // Historic Chinese
+ [TestCase("🦊")] // emoji
+ [TestCase("\u200B\u200D\u200E\uDA00\uDC01")] // zero-width space, zero-width joiner, left-to-right mark, then a well-formed surrogate pair
+ public void SerializeTsStringToXml_DoesNotStripValidCharacters(string word)
+ {
+     // Every case must survive serialization unchanged. NOTE(review): the expected value is the bare word with no element markup - confirm the literal is complete.
+     ITsString source = TsStringUtils.MakeString(word, EnWS);
+     var serialized = TsStringSerializer.SerializeTsStringToXml(source, WritingSystemManager);
+     Assert.That(StripNewLines(serialized), Is.EqualTo($"{word}"));
+ }
+
///--------------------------------------------------------------------------------------
///
/// Tests the method SerializeTsStringToXml with a MultiString. This should
diff --git a/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs b/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs
index 55a16db6..ee6360b8 100644
--- a/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs
+++ b/tests/SIL.LCModel.Tests/DomainImpl/StringsTests.cs
@@ -3,7 +3,9 @@
// (http://www.gnu.org/licenses/lgpl-2.1.html)
using System;
+using System.IO;
using System.Linq;
+using System.Text;
using NUnit.Framework;
using SIL.LCModel.Core.KernelInterfaces;
using SIL.LCModel.Core.Text;
@@ -283,6 +285,34 @@ public void AppendAlternativesTest()
Assert.AreEqual("Saltillo Mexico", Cache.LangProject.FieldWorkLocation.get_String(english.Handle).Text);
Assert.AreEqual("Saltillo Mejico", Cache.LangProject.FieldWorkLocation.get_String(spanish.Handle).Text);
}
+
+ [Test]
+ public void ToXml_WorksAsExpected()
+ {
+     var wsHandle = Cache.LangProject.CurrentAnalysisWritingSystems.First().Handle;
+     Cache.LangProject.MainCountry.set_String(wsHandle, TsStringUtils.MakeString("Mexico", wsHandle));
+     // NOTE(review): the expected literal has no <AUni> wrapper although ToXml in Strings.cs writes one - confirm the literal is complete.
+     Assert.AreEqual("Mexico", ToXml(Cache.LangProject.MainCountry), "XML does not contain expected string element.");
+ }
+
+ [Test]
+ public void ToXml_WithControlCharacterWorks()
+ {
+     var wsHandle = Cache.LangProject.CurrentAnalysisWritingSystems.First().Handle;
+     // The stored value embeds U+0002 between "te" and "st"; the expected output omits it.
+     Cache.LangProject.MainCountry.set_String(wsHandle, TsStringUtils.MakeString("te\u0002st", wsHandle));
+     // NOTE(review): the expected literal has no <AUni> wrapper although ToXml in Strings.cs writes one - confirm the literal is complete.
+     Assert.AreEqual("test", ToXml(Cache.LangProject.MainCountry), "XML does not contain expected string element.");
+ }
+
+ private string ToXml(ITsMultiString multiString) // test helper: serializes via MultiAccessor.ToXMLString and returns the written bytes decoded as UTF-8
+ {
+     using var buffer = new MemoryStream(); // in-memory target for the XML writer
+     using var writer = XmlServices.CreateWriter(buffer);
+     ((MultiAccessor)multiString).ToXMLString(writer);
+     writer.Flush(); // flush before reading the stream back, so pending writer output is included
+     return Encoding.UTF8.GetString(buffer.ToArray());
+ }
}
/// ----------------------------------------------------------------------------------------