@@ -532,12 +532,17 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
532532 IcuToolErrorCode status (" icuexportdata: computeDecompositions" );
533533 const Normalizer2* mainNormalizer;
534534 const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance (status);
535+ FILE* f = NULL ;
536+ std::vector<uint32_t > nonRecursive32;
537+ LocalUMutableCPTriePointer nonRecursiveBuilder (umutablecptrie_open (0 , 0 , status));
538+
535539 if (uprv_strcmp (basename, " nfkd" ) == 0 ) {
536540 mainNormalizer = Normalizer2::getNFKDInstance (status);
537541 } else if (uprv_strcmp (basename, " uts46d" ) == 0 ) {
538542 mainNormalizer = Normalizer2::getInstance (NULL , " uts46" , UNORM2_COMPOSE, status);
539543 } else {
540544 mainNormalizer = nfdNormalizer;
545+ f = prepareOutputFile (" decompositionex" );
541546 }
542547
543548 // Max length as of Unicode 14 is 4 for NFD. For NFKD the max
@@ -546,6 +551,8 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
546551 const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8 ;
547552 const int32_t DECOMPOSITION_BUFFER_SIZE = 20 ;
548553 UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
554+ const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2 ;
555+ UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];
549556
550557 // Iterate over all scalar values excluding Hangul syllables.
551558 //
@@ -625,6 +632,54 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
625632 if (src == dst) {
626633 continue ;
627634 }
635+ // ICU4X hard-codes ANGSTROM SIGN
636+ if (c != 0x212B ) {
637+ UnicodeString raw;
638+ if (!nfdNormalizer->getRawDecomposition (c, raw)) {
639+ // We're always supposed to have a non-recursive decomposition
640+ // if we had a recursive one.
641+ status.set (U_INTERNAL_PROGRAM_ERROR);
642+ handleError (status, basename);
643+ }
644+ // In addition to actual differences, put the whole range that contains characters
645+ // with oxia into the non-recursive trie in order to catch cases where characters
646+ // with oxia have singleton decompositions to corresponding characters with tonos.
647+ // This way, the run-time decision to fall through can be done on the range
648+ // without checking for individual characters inside the range.
649+ if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB )) {
650+ int32_t rawLen = raw.toUTF32 (rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status);
651+ if (!rawLen) {
652+ status.set (U_INTERNAL_PROGRAM_ERROR);
653+ handleError (status, basename);
654+ }
655+ if (rawLen == 1 ) {
656+ if (c >= 0xFFFF ) {
657+ status.set (U_INTERNAL_PROGRAM_ERROR);
658+ handleError (status, basename);
659+ }
660+ uint32_t shifted = uint32_t (rawUtf32[0 ]) << 16 ;
661+ umutablecptrie_set (nonRecursiveBuilder.getAlias (), c, shifted, status);
662+ } else if (rawUtf32[0 ] <= 0xFFFF && rawUtf32[1 ] <= 0xFFFF ) {
663+ if (!rawUtf32[0 ] || !rawUtf32[1 ]) {
664+ status.set (U_INTERNAL_PROGRAM_ERROR);
665+ handleError (status, basename);
666+ }
667+ uint32_t bmpPair = uint32_t (rawUtf32[0 ]) << 16 | uint32_t (rawUtf32[1 ]);
668+ umutablecptrie_set (nonRecursiveBuilder.getAlias (), c, bmpPair, status);
669+ } else {
670+ // Let's add 1 to index to make it always non-zero to distinguish
671+ // it from the default zero.
672+ uint32_t index = nonRecursive32.size () + 1 ;
673+ nonRecursive32.push_back (uint32_t (rawUtf32[0 ]));
674+ nonRecursive32.push_back (uint32_t (rawUtf32[1 ]));
675+ if (index > 0xFFFF ) {
676+ status.set (U_INTERNAL_PROGRAM_ERROR);
677+ handleError (status, basename);
678+ }
679+ umutablecptrie_set (nonRecursiveBuilder.getAlias (), c, index, status);
680+ }
681+ }
682+ }
628683 }
629684 if (startsWithNonStarter && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F )) {
630685 // A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
@@ -769,6 +824,21 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
769824 if (storage16.size () + storage32.size () > 0xFFF ) {
770825 status.set (U_INTERNAL_PROGRAM_ERROR);
771826 }
827+ if (f) {
828+ usrc_writeArray (f, " scalars32 = [\n " , nonRecursive32.data (), 32 , nonRecursive32.size (), " " , " \n ]\n " );
829+
830+ LocalUCPTriePointer utrie (umutablecptrie_buildImmutable (
831+ nonRecursiveBuilder.getAlias (),
832+ trieType,
833+ UCPTRIE_VALUE_BITS_32,
834+ status));
835+ handleError (status, basename);
836+
837+ fprintf (f, " [trie]\n " );
838+ usrc_writeUCPTrie (f, " trie" , utrie.getAlias (), UPRV_TARGET_SYNTAX_TOML);
839+
840+ fclose (f);
841+ }
772842 handleError (status, basename);
773843}
774844
0 commit comments