Skip to content

Commit 1e09555

Browse files
authored
perltest: add support for hex modifier (#529)
* pcre2test: tighten \N{U+hh...} support When \N{U+hh...} was added it was meant to support all Unicode characters that can be encoded by pcre2test and Perl, but its use outside what is officially considered valid can be confusing, so print a warning for those cases. * perltest: add support for hex modifier The use of \xhh can be ambiguous when used together with the utf modifier, so allow for describing code points individually in the pattern using hex, with the same syntax that is already supported by pcre2test.
1 parent 03be4d2 commit 1e09555

File tree

13 files changed

+135
-35
lines changed

13 files changed

+135
-35
lines changed

doc/pcre2test.1

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.TH PCRE2TEST 1 "04 October 2024" "PCRE 10.45"
1+
.TH PCRE2TEST 1 "16 October 2024" "PCRE 10.45"
22
.SH NAME
33
pcre2test - a program for testing Perl-compatible regular expressions.
44
.SH SYNOPSIS
@@ -511,7 +511,9 @@ means of encoding non-printing characters in a visible way:
511511
.sp
512512
Invoking \eN{U+hh...} or \ex{hh...} doesn't require the use of the \fButf\fP
513513
modifier on the pattern. It is always recognized. There may be any number of
514-
hexadecimal digits inside the braces; invalid values provoke error messages.
514+
hexadecimal digits inside the braces; invalid values provoke error messages,
515+
but when using \eN{U+hh...} with some invalid Unicode characters they will
516+
be accepted with a warning instead.
515517
.P
516518
Note that even in UTF-8 mode, \exhh (and depending on how large, \eddd)
517519
describe one byte rather than one character; this makes it possible to
@@ -526,7 +528,7 @@ When testing the 16-bit library, not in UTF-16 mode, all 4-digit \ex{hhhh}
526528
values are accepted. This makes it possible to construct invalid UTF-16
527529
sequences for testing purposes.
528530
.P
529-
When testing the 32-bit library, not In UTF-32 mode, all 4 to 8-digit \ex{...}
531+
When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \ex{...}
530532
values are accepted. This makes it possible to construct invalid UTF-32
531533
sequences for testing purposes.
532534
.P
@@ -2243,6 +2245,6 @@ Cambridge, England.
22432245
.rs
22442246
.sp
22452247
.nf
2246-
Last updated: 04 October 2024
2248+
Last updated: 16 October 2024
22472249
Copyright (c) 1997-2024 University of Cambridge.
22482250
.fi

perltest.sh

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ fi
8585
# aftertext interpreted as "print $' afterwards"
8686
# afteralltext ignored
8787
# dupnames ignored (Perl always allows)
88+
# hex preprocess pattern with embedded octets
8889
# jitstack ignored
8990
# mark show mark information
9091
# no_auto_possess ignored
@@ -244,9 +245,9 @@ for (;;)
244245
245246
# Split the pattern from the modifiers and adjust them as necessary.
246247
247-
$pattern =~ /^\s*((.).*\2)(.*)$/s;
248-
$pat = $1;
249-
$del = $2;
248+
$pattern =~ /^\s*(.)(.*)\1(.*)$/s;
249+
$del = $1;
250+
$pat = $2;
250251
$mod = "$3,$extra_modifiers";
251252
$mod =~ s/^,\s*//;
252253
@@ -286,6 +287,34 @@ for (;;)
286287
287288
$mod =~ s/no_auto_possess,?//;
288289
290+
# The "hex" modifier instructs us to preprocess the pattern
291+
292+
if ($mod =~ s/hex,?//)
293+
{
294+
my $t = "";
295+
296+
# find either 2 digit hex octets, optionally surrounded by spaces, to
297+
# add as code points or quoted strings that will be copied verbatim
298+
299+
while ($pat =~ /\s*(?:(\p{ahex}{2})|(['"])([^\2]+?)\2)\s*/g)
300+
{
301+
if (defined $1)
302+
{
303+
no utf8;
304+
$t .= chr(hex($1));
305+
use if $utf8, "utf8";
306+
}
307+
else
308+
{
309+
$t .= $3;
310+
}
311+
}
312+
no utf8;
313+
utf8::decode($t) if $utf8;
314+
use if $utf8, "utf8";
315+
$pat = $t;
316+
}
317+
289318
# Use no_start_optimize (disable PCRE2 start-up optimization) to disable Perl
290319
# optimization by inserting (??{""}) at the start of the pattern. We may
291320
# also encounter -no_start_optimize from a #pattern setting.
@@ -297,7 +326,8 @@ for (;;)
297326
# Add back retained modifiers and check that the pattern is valid.
298327
299328
$mod =~ s/,//g;
300-
$pattern = "$pat$mod";
329+
330+
$pattern = "$del$pat$del$mod";
301331
302332
eval "\$_ =~ ${pattern}";
303333
if ($@)

src/pcre2_compile.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1940,7 +1940,7 @@ else
19401940
cc = *ptr++;
19411941
if (c == 0 && cc == CHAR_0) continue; /* Leading zeroes */
19421942
#if PCRE2_CODE_UNIT_WIDTH == 32
1943-
if (c >= 0x20000000l) { overflow = TRUE; break; }
1943+
if (c >= 0x20000000u) { overflow = TRUE; break; }
19441944
#endif
19451945
c = (c << 3) + (cc - CHAR_0);
19461946
#if PCRE2_CODE_UNIT_WIDTH == 8

src/pcre2test.c

Lines changed: 37 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,7 @@ static modstruct modlist[] = {
713713
{ "global", MOD_PNDP, MOD_CTL, CTL_GLOBAL, PO(control) },
714714
{ "heap_limit", MOD_CTM, MOD_INT, 0, MO(heap_limit) },
715715
{ "heapframes_size", MOD_PND, MOD_CTL, CTL2_HEAPFRAMES_SIZE, PO(control2) },
716-
{ "hex", MOD_PAT, MOD_CTL, CTL_HEXPAT, PO(control) },
716+
{ "hex", MOD_PATP, MOD_CTL, CTL_HEXPAT, PO(control) },
717717
{ "info", MOD_PAT, MOD_CTL, CTL_INFO, PO(control) },
718718
{ "jit", MOD_PAT, MOD_IND, 7, PO(jit) },
719719
{ "jitfast", MOD_PAT, MOD_CTL, CTL_JITFAST, PO(control) },
@@ -7264,7 +7264,7 @@ while ((c = *p++) != 0)
72647264
c = 0;
72657265
for (pt++; isdigit(*pt) && *pt < '8'; ++i, pt++)
72667266
{
7267-
if (c >= 0x20000000l)
7267+
if (c >= 0x20000000u)
72687268
{
72697269
fprintf(outfile, "** \\o{ escape too large\n");
72707270
return PR_OK;
@@ -7397,20 +7397,38 @@ while ((c = *p++) != 0)
73977397
"and therefore cannot be encoded as UTF-8\n", c);
73987398
return PR_OK;
73997399
}
7400+
else if (encoding == FORCE_UTF && c > MAX_UTF_CODE_POINT)
7401+
fprintf(outfile, "** Warning: character \\N{U+%x} is greater than "
7402+
"0x%x and should not be encoded as UTF-8\n",
7403+
c, MAX_UTF_CODE_POINT);
74007404
q8 += ord2utf8(c, q8);
74017405
}
74027406
}
74037407
#endif
74047408
#ifdef SUPPORT_PCRE2_16
74057409
if (test_mode == PCRE16_MODE)
74067410
{
7407-
if (encoding == FORCE_UTF || utf)
7411+
/* Unlike the 8-bit code, there are no forced raw suggestions for the
7412+
16-bit mode, so assume raw unless utf is preferred */
7413+
7414+
if (!(encoding == FORCE_UTF || utf))
74087415
{
7409-
if (c > 0x10ffffu)
7416+
if (c > 0xffffu)
7417+
{
7418+
fprintf(outfile, "** Character \\x{%x} is greater than 0xffff "
7419+
"and UTF-16 mode is not enabled.\n", c);
7420+
fprintf(outfile, "** Truncation will probably give the wrong "
7421+
"result.\n");
7422+
}
7423+
*q16++ = (uint16_t)c;
7424+
}
7425+
else
7426+
{
7427+
if (c > MAX_UTF_CODE_POINT)
74107428
{
74117429
fprintf(outfile, "** Failed: character \\N{U+%x} is greater than "
7412-
"0x10ffff and therefore cannot be encoded as "
7413-
"UTF-16\n", c);
7430+
"0x%x and therefore cannot be encoded as UTF-16\n",
7431+
c, MAX_UTF_CODE_POINT);
74147432
return PR_OK;
74157433
}
74167434
else if (c >= 0x10000u)
@@ -7419,24 +7437,25 @@ while ((c = *p++) != 0)
74197437
*q16++ = 0xD800 | (c >> 10);
74207438
*q16++ = 0xDC00 | (c & 0x3ff);
74217439
}
7422-
else *q16++ = c;
7423-
}
7424-
else
7425-
{
7426-
if (c > 0xffffu)
7440+
else
74277441
{
7428-
fprintf(outfile, "** Character \\x{%x} is greater than 0xffff "
7429-
"and UTF-16 mode is not enabled.\n", c);
7430-
fprintf(outfile, "** Truncation will probably give the wrong "
7431-
"result.\n");
7442+
if (encoding == FORCE_UTF && 0xe000u > c && c >= 0xd800u)
7443+
fprintf(outfile, "** Warning: character \\N{U+%x} is a surrogate "
7444+
"and should not be encoded as UTF-16\n", c);
7445+
*q16++ = c;
74327446
}
7433-
7434-
*q16++ = (uint16_t)c;
74357447
}
74367448
}
74377449
#endif
74387450
#ifdef SUPPORT_PCRE2_32
7439-
if (test_mode == PCRE32_MODE) *q32++ = c;
7451+
if (test_mode == PCRE32_MODE)
7452+
{
7453+
if (encoding == FORCE_UTF && c > MAX_UTF_CODE_POINT)
7454+
fprintf(outfile, "** Warning: character \\N{U+%x} is greater than "
7455+
"0x%x and should not be encoded as UTF-32\n",
7456+
c, MAX_UTF_CODE_POINT);
7457+
*q32++ = c;
7458+
}
74407459
#endif
74417460
}
74427461

testdata/testinput1

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6709,4 +6709,7 @@ $/x
67096709
\= Expect no match
67106710
.a.b.c.
67116711

6712+
/65 00 64/hex
6713+
e\0d
6714+
67126715
# End of testinput1

testdata/testinput11

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -356,12 +356,18 @@
356356
# We can use pcre2test's utf8_input modifier to create wide pattern characters,
357357
# even though this test is run when UTF is not supported.
358358

359+
/a\x{d800}b/utf8_input
360+
a���b
361+
a\x{d800}b
362+
a\o{154000}b
363+
\= Expect warning unless 32bit
364+
a\N{U+d800}b
365+
359366
/a\x{ffff}b/utf8_input
360367
a￿b
361368
a\x{ffff}b
362369
a\o{177777}b
363-
\= Expect no match
364-
a\N{U+ffff}z
370+
a\N{U+ffff}b
365371

366372
/ab������z/utf8_input
367373
ab������z

testdata/testinput4

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2908,4 +2908,9 @@
29082908
/\p{ ^ L u }/
29092909
AbCd
29102910

2911+
# hex
2912+
2913+
/c3 b1/hex,utf
2914+
\N{U+00F1}
2915+
29112916
# End of testinput4

testdata/testinput9

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
a\443b
1313

1414
/fd bf bf bf bf bf/I,hex
15+
\= Expect warning
1516
\N{U+7fffffff}
1617
\= Expect no match # error message (too big char)
1718
\x{7fffffff}

testdata/testoutput1

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10580,4 +10580,8 @@ No match
1058010580
.a.b.c.
1058110581
No match
1058210582

10583+
/65 00 64/hex
10584+
e\0d
10585+
0: e\x00d
10586+
1058310587
# End of testinput1

testdata/testoutput11-16

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -646,16 +646,27 @@ Subject length lower bound = 1
646646
# We can use pcre2test's utf8_input modifier to create wide pattern characters,
647647
# even though this test is run when UTF is not supported.
648648

649+
/a\x{d800}b/utf8_input
650+
a���b
651+
0: a\x{d800}b
652+
a\x{d800}b
653+
0: a\x{d800}b
654+
a\o{154000}b
655+
0: a\x{d800}b
656+
\= Expect warning unless 32bit
657+
a\N{U+d800}b
658+
** Warning: character \N{U+d800} is a surrogate and should not be encoded as UTF-16
659+
0: a\x{d800}b
660+
649661
/a\x{ffff}b/utf8_input
650662
a￿b
651663
0: a\x{ffff}b
652664
a\x{ffff}b
653665
0: a\x{ffff}b
654666
a\o{177777}b
655667
0: a\x{ffff}b
656-
\= Expect no match
657-
a\N{U+ffff}z
658-
No match
668+
a\N{U+ffff}b
669+
0: a\x{ffff}b
659670

660671
/ab������z/utf8_input
661672
** Failed: character value greater than 0xffff cannot be converted to 16-bit in non-UTF mode

0 commit comments

Comments
 (0)