perltest: add support for hex modifier (#529)

carenas · web-flow · commit 1e09555d6950 · 2024-10-17T16:42:31.000+01:00
* pcre2test: tighten \N{U+hh...} support

When \N{U+hh...} was added, it was meant to support all Unicode
characters that can be encoded by pcre2test and Perl, but its
use outside what is officially considered valid can be confusing,
so print a warning for those cases.

* perltest: add support for hex modifier

The use of \xhh can be ambiguous when used together with the utf modifier,
so allow for describing code points individually in the pattern using hex,
with the same syntax that is already supported by pcre2test.
diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
@@ -1,4 +1,4 @@
-.TH PCRE2TEST 1 "04 October 2024" "PCRE 10.45"
+.TH PCRE2TEST 1 "16 October 2024" "PCRE 10.45"
 .SH NAME
 pcre2test - a program for testing Perl-compatible regular expressions.
 .SH SYNOPSIS
@@ -511,7 +511,9 @@ means of encoding non-printing characters in a visible way:
 .sp
 Invoking \eN{U+hh...} or \ex{hh...} doesn't require the use of the \fButf\fP
 modifier on the pattern. It is always recognized. There may be any number of
-hexadecimal digits inside the braces; invalid values provoke error messages.
+hexadecimal digits inside the braces; invalid values provoke error messages,
+but when \eN{U+hh...} is used with some invalid Unicode characters, they are
+accepted with a warning instead.
 .P
 Note that even in UTF-8 mode, \exhh (and depending of how large, \eddd)
 describe one byte rather than one character; this makes it possible to
@@ -526,7 +528,7 @@ When testing te 16-bit library, not in UTF-16 mode, all 4-digit \ex{hhhh}
 values are accepted. This makes it possible to construct invalid UTF-16
 sequences for testing purposes.
 .P
-When testing the 32-bit library, not In UTF-32 mode, all 4 to 8-digit \ex{...}
+When testing the 32-bit library, not in UTF-32 mode, all 4 to 8-digit \ex{...}
 values are accepted. This makes it possible to construct invalid UTF-32
 sequences for testing purposes.
 .P
@@ -2243,6 +2245,6 @@ Cambridge, England.
 .rs
 .sp
 .nf
-Last updated: 04 October 2024
+Last updated: 16 October 2024
 Copyright (c) 1997-2024 University of Cambridge.
 .fi
diff --git a/perltest.sh b/perltest.sh
@@ -85,6 +85,7 @@ fi
 #   aftertext          interpreted as "print $' afterwards"
 #   afteralltext       ignored
 #   dupnames           ignored (Perl always allows)
+#   hex                preprocess pattern with embedded octets
 #   jitstack           ignored
 #   mark               show mark information
 #   no_auto_possess    ignored
@@ -244,9 +245,9 @@ for (;;)
 
   # Split the pattern from the modifiers and adjust them as necessary.
 
-  $pattern =~ /^\s*((.).*\2)(.*)$/s;
-  $pat = $1;
-  $del = $2;
+  $pattern =~ /^\s*(.)(.*)\1(.*)$/s;
+  $del = $1;
+  $pat = $2;
   $mod = "$3,$extra_modifiers";
   $mod =~ s/^,\s*//;
 
@@ -286,6 +287,34 @@ for (;;)
 
   $mod =~ s/no_auto_possess,?//;
 
+  # The "hex" modifier instructs us to preprocess the pattern
+
+  if ($mod =~ s/hex,?//)
+    {
+    my $t = "";
+
+    # find either 2 digit hex octets, optionally surrounded by spaces, to
+    # add as code points or quoted strings that will be copied verbatim
+
+    while ($pat =~ /\s*(?:(\p{ahex}{2})|(['"])([^\2]+?)\2)\s*/g)
+      {
+      if (defined $1)
+        {
+        no utf8;
+        $t .= chr(hex($1));
+        use if $utf8, "utf8";
+        }
+      else
+        {
+        $t .= $3;
+        }
+      }
+    no utf8;
+    utf8::decode($t) if $utf8;
+    use if $utf8, "utf8";
+    $pat = $t;
+    }
+
   # Use no_start_optimize (disable PCRE2 start-up optimization) to disable Perl
   # optimization by inserting (??{""}) at the start of the pattern. We may
   # also encounter -no_start_optimize from a #pattern setting.
@@ -297,7 +326,8 @@ for (;;)
   # Add back retained modifiers and check that the pattern is valid.
 
   $mod =~ s/,//g;
-  $pattern = "$pat$mod";
+
+  $pattern = "$del$pat$del$mod";
 
   eval "\$_ =~ ${pattern}";
   if ($@)
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
@@ -1940,7 +1940,7 @@ else
       cc = *ptr++;
       if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
 #if PCRE2_CODE_UNIT_WIDTH == 32
-      if (c >= 0x20000000l) { overflow = TRUE; break; }
+      if (c >= 0x20000000u) { overflow = TRUE; break; }
 #endif
       c = (c << 3) + (cc - CHAR_0);
 #if PCRE2_CODE_UNIT_WIDTH == 8
diff --git a/src/pcre2test.c b/src/pcre2test.c
@@ -713,7 +713,7 @@ static modstruct modlist[] = {
   { "global",                      MOD_PNDP, MOD_CTL, CTL_GLOBAL,                 PO(control) },
   { "heap_limit",                  MOD_CTM,  MOD_INT, 0,                          MO(heap_limit) },
   { "heapframes_size",             MOD_PND,  MOD_CTL, CTL2_HEAPFRAMES_SIZE,       PO(control2) },
-  { "hex",                         MOD_PAT,  MOD_CTL, CTL_HEXPAT,                 PO(control) },
+  { "hex",                         MOD_PATP, MOD_CTL, CTL_HEXPAT,                 PO(control) },
   { "info",                        MOD_PAT,  MOD_CTL, CTL_INFO,                   PO(control) },
   { "jit",                         MOD_PAT,  MOD_IND, 7,                          PO(jit) },
   { "jitfast",                     MOD_PAT,  MOD_CTL, CTL_JITFAST,                PO(control) },
@@ -7264,7 +7264,7 @@ while ((c = *p++) != 0)
       c = 0;
       for (pt++; isdigit(*pt) && *pt < '8'; ++i, pt++)
         {
-        if (c >= 0x20000000l)
+        if (c >= 0x20000000u)
           {
           fprintf(outfile, "** \\o{ escape too large\n");
           return PR_OK;
@@ -7397,20 +7397,38 @@ while ((c = *p++) != 0)
                          "and therefore cannot be encoded as UTF-8\n", c);
         return PR_OK;
         }
+      else if (encoding == FORCE_UTF && c > MAX_UTF_CODE_POINT)
+        fprintf(outfile, "** Warning: character \\N{U+%x} is greater than "
+                         "0x%x and should not be encoded as UTF-8\n",
+                         c, MAX_UTF_CODE_POINT);
       q8 += ord2utf8(c, q8);
       }
     }
 #endif
 #ifdef SUPPORT_PCRE2_16
   if (test_mode == PCRE16_MODE)
     {
-    if (encoding == FORCE_UTF || utf)
+    /* Unlike the 8-bit code, there are no forced raw suggestions for the
+    16-bit mode, so assume raw unless utf is preferred */
+
+    if (!(encoding == FORCE_UTF || utf))
       {
-      if (c > 0x10ffffu)
+      if (c > 0xffffu)
+        {
+        fprintf(outfile, "** Character \\x{%x} is greater than 0xffff "
+          "and UTF-16 mode is not enabled.\n", c);
+        fprintf(outfile, "** Truncation will probably give the wrong "
+          "result.\n");
+        }
+      *q16++ = (uint16_t)c;
+      }
+    else
+      {
+      if (c > MAX_UTF_CODE_POINT)
         {
         fprintf(outfile, "** Failed: character \\N{U+%x} is greater than "
-                         "0x10ffff and therefore cannot be encoded as "
-                         "UTF-16\n", c);
+                         "0x%x and therefore cannot be encoded as UTF-16\n",
+                c, MAX_UTF_CODE_POINT);
         return PR_OK;
         }
       else if (c >= 0x10000u)
@@ -7419,24 +7437,25 @@ while ((c = *p++) != 0)
         *q16++ = 0xD800 | (c >> 10);
         *q16++ = 0xDC00 | (c & 0x3ff);
         }
-      else *q16++ = c;
-      }
-    else
-      {
-      if (c > 0xffffu)
+      else
         {
-        fprintf(outfile, "** Character \\x{%x} is greater than 0xffff "
-          "and UTF-16 mode is not enabled.\n", c);
-        fprintf(outfile, "** Truncation will probably give the wrong "
-          "result.\n");
+        if (encoding == FORCE_UTF && 0xe000u > c && c >= 0xd800u)
+          fprintf(outfile, "** Warning: character \\N{U+%x} is a surrogate "
+                           "and should not be encoded as UTF-16\n", c);
+        *q16++ = c;
         }
-
-      *q16++ = (uint16_t)c;
       }
     }
 #endif
 #ifdef SUPPORT_PCRE2_32
-  if (test_mode == PCRE32_MODE) *q32++ = c;
+  if (test_mode == PCRE32_MODE)
+    {
+    if (encoding == FORCE_UTF && c > MAX_UTF_CODE_POINT)
+      fprintf(outfile, "** Warning: character \\N{U+%x} is greater than "
+                       "0x%x and should not be encoded as UTF-32\n",
+                       c, MAX_UTF_CODE_POINT);
+    *q32++ = c;
+    }
 #endif
   }
 
diff --git a/testdata/testinput1 b/testdata/testinput1
@@ -6709,4 +6709,7 @@ $/x
 \= Expect no match
     .a.b.c.
 
+/65 00 64/hex
+    e\0d
+
 # End of testinput1 
diff --git a/testdata/testinput11 b/testdata/testinput11
@@ -356,12 +356,18 @@
 # We can use pcre2test's utf8_input modifier to create wide pattern characters,
 # even though this test is run when UTF is not supported.
 
+/a\x{d800}b/utf8_input
+    a���b
+    a\x{d800}b
+    a\o{154000}b
+\= Expect warning unless 32bit
+    a\N{U+d800}b
+
 /a\x{ffff}b/utf8_input
     a￿b
     a\x{ffff}b
     a\o{177777}b
-\= Expect no match
-    a\N{U+ffff}z
+    a\N{U+ffff}b
 
 /ab������z/utf8_input
     ab������z
diff --git a/testdata/testinput4 b/testdata/testinput4
@@ -2908,4 +2908,9 @@
 /\p{  ^ L u }/
     AbCd
 
+# hex
+
+/c3 b1/hex,utf
+    \N{U+00F1}
+
 # End of testinput4
diff --git a/testdata/testinput9 b/testdata/testinput9
@@ -12,6 +12,7 @@
     a\443b
 
 /fd bf bf bf bf bf/I,hex
+\= Expect warning
     \N{U+7fffffff}
 \= Expect no match # error message (too big char)
     \x{7fffffff}
diff --git a/testdata/testoutput1 b/testdata/testoutput1
@@ -10580,4 +10580,8 @@ No match
     .a.b.c.
 No match
 
+/65 00 64/hex
+    e\0d
+ 0: e\x00d
+
 # End of testinput1 
diff --git a/testdata/testoutput11-16 b/testdata/testoutput11-16
@@ -646,16 +646,27 @@ Subject length lower bound = 1
 # We can use pcre2test's utf8_input modifier to create wide pattern characters,
 # even though this test is run when UTF is not supported.
 
+/a\x{d800}b/utf8_input
+    a���b
+ 0: a\x{d800}b
+    a\x{d800}b
+ 0: a\x{d800}b
+    a\o{154000}b
+ 0: a\x{d800}b
+\= Expect warning unless 32bit
+    a\N{U+d800}b
+** Warning: character \N{U+d800} is a surrogate and should not be encoded as UTF-16
+ 0: a\x{d800}b
+
 /a\x{ffff}b/utf8_input
     a￿b
  0: a\x{ffff}b
     a\x{ffff}b
  0: a\x{ffff}b
     a\o{177777}b
  0: a\x{ffff}b
-\= Expect no match
-    a\N{U+ffff}z
-No match
+    a\N{U+ffff}b
+ 0: a\x{ffff}b
 
 /ab������z/utf8_input
 ** Failed: character value greater than 0xffff cannot be converted to 16-bit in non-UTF mode
diff --git a/testdata/testoutput11-32 b/testdata/testoutput11-32
@@ -649,16 +649,26 @@ Subject length lower bound = 1
 # We can use pcre2test's utf8_input modifier to create wide pattern characters,
 # even though this test is run when UTF is not supported.
 
+/a\x{d800}b/utf8_input
+    a���b
+ 0: a\x{d800}b
+    a\x{d800}b
+ 0: a\x{d800}b
+    a\o{154000}b
+ 0: a\x{d800}b
+\= Expect warning unless 32bit
+    a\N{U+d800}b
+ 0: a\x{d800}b
+
 /a\x{ffff}b/utf8_input
     a￿b
  0: a\x{ffff}b
     a\x{ffff}b
  0: a\x{ffff}b
     a\o{177777}b
  0: a\x{ffff}b
-\= Expect no match
-    a\N{U+ffff}z
-No match
+    a\N{U+ffff}b
+ 0: a\x{ffff}b
 
 /ab������z/utf8_input
     ab������z
@@ -668,6 +678,7 @@ No match
     ab\o{17777777777}z
  0: ab\x{7fffffff}z
     ab\N{U+7fffffff}z
+** Warning: character \N{U+7fffffff} is greater than 0x10ffff and should not be encoded as UTF-32
  0: ab\x{7fffffff}z
 
 /ab�������z/utf8_input
diff --git a/testdata/testoutput4 b/testdata/testoutput4
@@ -4656,4 +4656,10 @@ No match
     AbCd
  0: b
 
+# hex
+
+/c3 b1/hex,utf
+    \N{U+00F1}
+ 0: \x{f1}
+
 # End of testinput4
diff --git a/testdata/testoutput9 b/testdata/testoutput9
@@ -26,7 +26,9 @@ Capture group count = 0
 First code unit = \xfd
 Last code unit = \xbf
 Subject length lower bound = 6
+\= Expect warning
     \N{U+7fffffff}
+** Warning: character \N{U+7fffffff} is greater than 0x10ffff and should not be encoded as UTF-8
  0: \xfd\xbf\xbf\xbf\xbf\xbf
 \= Expect no match # error message (too big char)
     \x{7fffffff}