Skip to content

Commit 8e4af76

Browse files
hsivonen authored and sffc committed
ICU-22087 Export a non-recursive canonical decomposition supplement for ICU4X
1 parent 6cb4fd5 commit 8e4af76

File tree

1 file changed

+70
-0
lines changed

1 file changed

+70
-0
lines changed

icu4c/source/tools/icuexportdata/icuexportdata.cpp

Lines changed: 70 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -532,12 +532,17 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
532532
IcuToolErrorCode status("icuexportdata: computeDecompositions");
533533
const Normalizer2* mainNormalizer;
534534
const Normalizer2* nfdNormalizer = Normalizer2::getNFDInstance(status);
535+
FILE* f = NULL;
536+
std::vector<uint32_t> nonRecursive32;
537+
LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status));
538+
535539
if (uprv_strcmp(basename, "nfkd") == 0) {
536540
mainNormalizer = Normalizer2::getNFKDInstance(status);
537541
} else if (uprv_strcmp(basename, "uts46d") == 0) {
538542
mainNormalizer = Normalizer2::getInstance(NULL, "uts46", UNORM2_COMPOSE, status);
539543
} else {
540544
mainNormalizer = nfdNormalizer;
545+
f = prepareOutputFile("decompositionex");
541546
}
542547

543548
// Max length as of Unicode 14 is 4 for NFD. For NFKD the max
@@ -546,6 +551,8 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
546551
const int32_t LONGEST_ENCODABLE_LENGTH_32 = 8;
547552
const int32_t DECOMPOSITION_BUFFER_SIZE = 20;
548553
UChar32 utf32[DECOMPOSITION_BUFFER_SIZE];
554+
const int32_t RAW_DECOMPOSITION_BUFFER_SIZE = 2;
555+
UChar32 rawUtf32[RAW_DECOMPOSITION_BUFFER_SIZE];
549556

550557
// Iterate over all scalar values excluding Hangul syllables.
551558
//
@@ -625,6 +632,54 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
625632
if (src == dst) {
626633
continue;
627634
}
635+
// ICU4X hard-codes ANGSTROM SIGN
636+
if (c != 0x212B) {
637+
UnicodeString raw;
638+
if (!nfdNormalizer->getRawDecomposition(c, raw)) {
639+
// We're always supposed to have a non-recursive decomposition
640+
// if we had a recursive one.
641+
status.set(U_INTERNAL_PROGRAM_ERROR);
642+
handleError(status, basename);
643+
}
644+
// In addition to actual difference, put the whole range that contains characters
645+
// with oxia into the non-recursive trie in order to catch cases where characters
646+
// with oxia have singleton decompositions to corresponding characters with tonos.
647+
// This way, the run-time decision to fall through can be done on the range
648+
// without checking for individual characters inside the range.
649+
if (raw != dst || (c >= 0x1F71 && c <= 0x1FFB)) {
650+
int32_t rawLen = raw.toUTF32(rawUtf32, RAW_DECOMPOSITION_BUFFER_SIZE, status);
651+
if (!rawLen) {
652+
status.set(U_INTERNAL_PROGRAM_ERROR);
653+
handleError(status, basename);
654+
}
655+
if (rawLen == 1) {
656+
if (c >= 0xFFFF) {
657+
status.set(U_INTERNAL_PROGRAM_ERROR);
658+
handleError(status, basename);
659+
}
660+
uint32_t shifted = uint32_t(rawUtf32[0]) << 16;
661+
umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, shifted, status);
662+
} else if (rawUtf32[0] <= 0xFFFF && rawUtf32[1] <= 0xFFFF) {
663+
if (!rawUtf32[0] || !rawUtf32[1]) {
664+
status.set(U_INTERNAL_PROGRAM_ERROR);
665+
handleError(status, basename);
666+
}
667+
uint32_t bmpPair = uint32_t(rawUtf32[0]) << 16 | uint32_t(rawUtf32[1]);
668+
umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, bmpPair, status);
669+
} else {
670+
// Let's add 1 to index to make it always non-zero to distinguish
671+
// it from the default zero.
672+
uint32_t index = nonRecursive32.size() + 1;
673+
nonRecursive32.push_back(uint32_t(rawUtf32[0]));
674+
nonRecursive32.push_back(uint32_t(rawUtf32[1]));
675+
if (index > 0xFFFF) {
676+
status.set(U_INTERNAL_PROGRAM_ERROR);
677+
handleError(status, basename);
678+
}
679+
umutablecptrie_set(nonRecursiveBuilder.getAlias(), c, index, status);
680+
}
681+
}
682+
}
628683
}
629684
if (startsWithNonStarter && !(c == 0x0340 || c == 0x0341 || c == 0x0343 || c == 0x0344 || c == 0x0F73 || c == 0x0F75 || c == 0x0F81 || c == 0xFF9E || c == 0xFF9F)) {
630685
// A character whose decomposition starts with a non-starter and isn't the same as the character itself and isn't already hard-coded into ICU4X.
@@ -769,6 +824,21 @@ void computeDecompositions(const char* basename, const USet* backwardCombiningSt
769824
if (storage16.size() + storage32.size() > 0xFFF) {
770825
status.set(U_INTERNAL_PROGRAM_ERROR);
771826
}
827+
if (f) {
828+
usrc_writeArray(f, "scalars32 = [\n ", nonRecursive32.data(), 32, nonRecursive32.size(), " ", "\n]\n");
829+
830+
LocalUCPTriePointer utrie(umutablecptrie_buildImmutable(
831+
nonRecursiveBuilder.getAlias(),
832+
trieType,
833+
UCPTRIE_VALUE_BITS_32,
834+
status));
835+
handleError(status, basename);
836+
837+
fprintf(f, "[trie]\n");
838+
usrc_writeUCPTrie(f, "trie", utrie.getAlias(), UPRV_TARGET_SYNTAX_TOML);
839+
840+
fclose(f);
841+
}
772842
handleError(status, basename);
773843
}
774844

0 commit comments

Comments
 (0)