Skip to content

Commit 295f94c

Browse files
authored
perltest: add support for locale modifier (#534)
Use a similar syntax to pcre2test to set a per pattern locale, and teach pcre2test to recognize the modifier as perl compatible. While at it, update tests and fix a recent regresion that wasn't covered by them.
1 parent 998d2e0 commit 295f94c

File tree

10 files changed

+178
-78
lines changed

10 files changed

+178
-78
lines changed

perltest.sh

Lines changed: 68 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,19 @@
1515
# a script to Perl through a pipe. See comments below about the data for the
1616
# Perl script. If the next argument of this script is "-utf8", a suitable
1717
# prefix for the Perl script is set up.
18-
19-
# If the next argument of this script is -locale, it must be followed by the
20-
# name of a locale, which is then set when running the tests. Setting a locale
21-
# implies -utf8. For example:
2218
#
23-
# ./perltest.sh -locale tr_TR.utf8 some-file
19+
# A similar process is used to indicate the desire to set a specific locale
20+
# tables per pattern in a similar way to pcre2test through a locale modifier,
21+
# by using the -locale argument. This can be optionally combined with the
22+
# previous arguments; for example, to process an UTF-8 test file in Turkish,
23+
# add the locale=tr_TR.utf8 modifier to the pattern and -locale to perltest,
24+
# or invoke something like (the specific names of the locale might vary):
25+
#
26+
# ./perltest.sh -utf8 -locale=tr_TR.utf8 some-file
27+
#
28+
# If the -locale argument has no setting, a suitable default locale is used
29+
# when possible and reported at startup, it can be always overriden using the
30+
# locale modifier for each pattern.
2431
#
2532
# The remaining arguments of this script, if any, are passed to Perl. They are
2633
# an input file and an output file. If there is one argument, the output is
@@ -33,7 +40,7 @@
3340

3441
perl=perl
3542
perlarg=""
36-
prefix=''
43+
prefix=""
3744
spc=""
3845

3946
if [ $# -gt 0 -a "$1" = "-perl" ] ; then
@@ -53,27 +60,37 @@ if [ $# -gt 0 -a "$1" = "-w" ] ; then
5360
fi
5461

5562
if [ $# -gt 0 -a "$1" = "-utf8" ] ; then
56-
prefix="use utf8; require Encode;"
63+
default_locale="C.utf8"
64+
prefix="\
65+
use utf8;\
66+
require Encode;"
5767
perlarg="$perlarg$spc-CSD"
58-
5968
shift
6069
fi
6170

62-
if [ $# -gt 0 -a "$1" = "-locale" ] ; then
63-
if [ $# -lt 2 ] ; then
64-
echo "perltest.sh: Missing locale name - abandoned"
65-
exit 1
71+
if [ $# -gt 0 ] ; then
72+
case "$1" in
73+
-locale=*)
74+
default_locale=${1#-locale=}
75+
;;
76+
-locale)
77+
default_locale=${default_locale:-C}
78+
;;
79+
*)
80+
skip=1
81+
esac
82+
if [ -z "$skip" ] ; then
83+
prefix="\
84+
use POSIX qw(locale_h);\
85+
use locale qw(:ctype);\
86+
\
87+
\$default_locale = setlocale(LC_CTYPE, \"$default_locale\");\
88+
if (!defined(\$default_locale))\
89+
{ die \"perltest: Failed to set locale \\\"$default_locale\\\"\\\n\"; }\
90+
print \"Locale: \$default_locale\\\n\";\
91+
$prefix"
92+
shift
6693
fi
67-
prefix="use utf8;\
68-
use POSIX qw(locale_h);\
69-
use locale;\
70-
\$loc=setlocale(LC_ALL, \"$2\");\
71-
if (\"\$loc\" eq \"\")\
72-
{ die \"perltest.sh: Failed to set locale \\\"$2\\\" - abandoned\\n\";}\
73-
print \"Locale: \$loc\\n\";\
74-
require Encode;"
75-
shift
76-
shift
7794
fi
7895

7996

@@ -87,6 +104,7 @@ fi
87104
# dupnames ignored (Perl always allows)
88105
# hex preprocess pattern with embedded octets
89106
# jitstack ignored
107+
# locale use a specific locale tables
90108
# mark show mark information
91109
# no_auto_possess ignored
92110
# no_start_optimize insert (??{""}) at pattern start (disables optimizing)
@@ -146,7 +164,7 @@ else
146164
{
147165
foreach $c (split(//, $_[0]))
148166
{
149-
if (ord $c >= 32 && ord $c < 127) { $t .= $c; }
167+
if ($c =~ /^[[:print:]]$/) { $t .= $c; }
150168
else { $t .= sprintf("\\x%02x", ord $c); }
151169
}
152170
}
@@ -190,6 +208,12 @@ $default_show_mark = 0;
190208
NEXT_RE:
191209
for (;;)
192210
{
211+
if (defined $locale && defined $default_locale)
212+
{
213+
setlocale(LC_CTYPE, $default_locale);
214+
undef $locale;
215+
}
216+
193217
printf " re> " if $interact;
194218
last if ! ($_ = <$infile>);
195219
printf $outfile "$_" if ! $interact;
@@ -263,10 +287,6 @@ for (;;)
263287
264288
$mod =~ s/allaftertext,?//;
265289
266-
# Detect utf
267-
268-
$utf8 = $mod =~ s/utf,?//;
269-
270290
# Remove "dupnames".
271291
272292
$mod =~ s/dupnames,?//;
@@ -275,6 +295,19 @@ for (;;)
275295
276296
$mod =~ s/jitstack=\d+,?//;
277297
298+
# The "locale" modifier indicates which locale to use
299+
if ($mod =~ /locale=([^,]+),?/)
300+
{
301+
die "perltest: missing -locale cmdline flag" unless defined &setlocale;
302+
$locale = setlocale(LC_CTYPE, $1);
303+
if (!defined $locale)
304+
{
305+
print "** Failed to set locale '$1'\n";
306+
next NEXT_RE;
307+
}
308+
}
309+
$mod =~ s/locale=[^,]*,?//; # Remove it; "locale=" Ignored
310+
278311
# The "mark" modifier requests checking of MARK data */
279312
280313
$show_mark = $default_show_mark | ($mod =~ s/mark,?//);
@@ -283,11 +316,16 @@ for (;;)
283316
284317
$mod =~ s/ucp,?/u/;
285318
319+
# Detect utf
320+
321+
$utf8 = $mod =~ s/utf,?//;
322+
286323
# Remove "no_auto_possess".
287324
288325
$mod =~ s/no_auto_possess,?//;
289326
290-
# The "hex" modifier instructs us to preprocess the pattern
327+
# The "hex" modifier instructs us to preprocess a pattern with embedded
328+
# octets formatted as two digit hexadecimals
291329
292330
if ($mod =~ s/hex,?//)
293331
{
@@ -321,12 +359,11 @@ for (;;)
321359
322360
$mod =~ s/-no_start_optimize,?//;
323361
324-
if ($mod =~ s/no_start_optimize,?//) { $pat =~ s/$del/$del(??{""})/; }
362+
if ($mod =~ s/no_start_optimize,?//) { $pat = '(??{""})' . $pat; }
325363
326364
# Add back retained modifiers and check that the pattern is valid.
327365
328366
$mod =~ s/,//g;
329-
330367
$pattern = "$del$pat$del$mod";
331368
332369
eval "\$_ =~ ${pattern}";
@@ -419,7 +456,7 @@ for (;;)
419456
420457
if ($@)
421458
{
422-
printf $outfile "Error: $@\n";
459+
printf $outfile "Error: $@";
423460
next NEXT_RE;
424461
}
425462
elsif (scalar(@subs) == 0)

src/pcre2test.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -720,7 +720,7 @@ static modstruct modlist[] = {
720720
{ "jitstack", MOD_PNDP, MOD_INT, 0, PO(jitstack) },
721721
{ "jitverify", MOD_PAT, MOD_CTL, CTL_JITVERIFY, PO(control) },
722722
{ "literal", MOD_PAT, MOD_OPT, PCRE2_LITERAL, PO(options) },
723-
{ "locale", MOD_PAT, MOD_STR, LOCALESIZE, PO(locale) },
723+
{ "locale", MOD_PATP, MOD_STR, LOCALESIZE, PO(locale) },
724724
{ "mark", MOD_PNDP, MOD_CTL, CTL_MARK, PO(control) },
725725
{ "match_invalid_utf", MOD_PAT, MOD_OPT, PCRE2_MATCH_INVALID_UTF, PO(options) },
726726
{ "match_limit", MOD_CTM, MOD_INT, 0, MO(match_limit) },

testdata/testinput1

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5087,6 +5087,15 @@ name)/mark
50875087
\= Expect no match
50885088
D
50895089

5090+
/(*COMMIT)ABC/no_start_optimize
5091+
ABC
5092+
\= Expect no match
5093+
DEFABC
5094+
5095+
/(*COMMIT)ABC/
5096+
ABC
5097+
DEFABC
5098+
50905099
# This should fail, as the skip causes a bump to offset 3 (the skip).
50915100

50925101
/A(*MARK:A)A+(*SKIP)(B|Z) | AC/x,mark

testdata/testinput3

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# This set of tests checks local-specific features, using the "fr_FR" locale.
2-
# It is not Perl-compatible. When run via RunTest, the locale is edited to
2+
# It is almost Perl-compatible. When run via RunTest, the locale is edited to
33
# be whichever of "fr_FR", "french", or "fr" is found to exist. There is
44
# different version of this file called wintestinput3 for use on Windows,
55
# where the locale is called "french" and the tests are run using
@@ -14,10 +14,6 @@
1414
/^[\w]+/locale=fr_FR
1515
�cole
1616

17-
/^[\w]+/
18-
\= Expect no match
19-
�cole
20-
2117
/^[\W]+/
2218
�cole
2319

@@ -80,6 +76,14 @@
8076
\= Expect no match
8177
\x9c
8278

79+
/�/i
80+
\xff
81+
\= Expect no match
82+
y
83+
84+
/(.)\1/i
85+
\xfe\xde
86+
8387
/\W+/
8488
>>>\xaa<<<
8589
>>>\xba<<<

testdata/testoutput1

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8174,6 +8174,19 @@ MK: B
81748174
D
81758175
No match, mark = B
81768176

8177+
/(*COMMIT)ABC/no_start_optimize
8178+
ABC
8179+
0: ABC
8180+
\= Expect no match
8181+
DEFABC
8182+
No match
8183+
8184+
/(*COMMIT)ABC/
8185+
ABC
8186+
0: ABC
8187+
DEFABC
8188+
0: ABC
8189+
81778190
# This should fail, as the skip causes a bump to offset 3 (the skip).
81788191

81798192
/A(*MARK:A)A+(*SKIP)(B|Z) | AC/x,mark

testdata/testoutput3

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# This set of tests checks local-specific features, using the "fr_FR" locale.
2-
# It is not Perl-compatible. When run via RunTest, the locale is edited to
2+
# It is almost Perl-compatible. When run via RunTest, the locale is edited to
33
# be whichever of "fr_FR", "french", or "fr" is found to exist. There is
44
# different version of this file called wintestinput3 for use on Windows,
55
# where the locale is called "french" and the tests are run using
@@ -16,11 +16,6 @@ No match
1616
�cole
1717
0: �cole
1818

19-
/^[\w]+/
20-
\= Expect no match
21-
�cole
22-
No match
23-
2419
/^[\W]+/
2520
�cole
2621
0: \xc9
@@ -115,6 +110,18 @@ No match
115110
\x9c
116111
No match
117112

113+
/�/i
114+
\xff
115+
0: �
116+
\= Expect no match
117+
y
118+
No match
119+
120+
/(.)\1/i
121+
\xfe\xde
122+
0: ��
123+
1: �
124+
118125
/\W+/
119126
>>>\xaa<<<
120127
0: >>>

testdata/testoutput3A

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# This set of tests checks local-specific features, using the "fr_FR" locale.
2-
# It is not Perl-compatible. When run via RunTest, the locale is edited to
2+
# It is almost Perl-compatible. When run via RunTest, the locale is edited to
33
# be whichever of "fr_FR", "french", or "fr" is found to exist. There is
44
# different version of this file called wintestinput3 for use on Windows,
55
# where the locale is called "french" and the tests are run using
@@ -16,11 +16,6 @@ No match
1616
�cole
1717
0: �cole
1818

19-
/^[\w]+/
20-
\= Expect no match
21-
�cole
22-
No match
23-
2419
/^[\W]+/
2520
�cole
2621
0: \xc9
@@ -115,6 +110,18 @@ No match
115110
\x9c
116111
No match
117112

113+
/�/i
114+
\xff
115+
0: �
116+
\= Expect no match
117+
y
118+
No match
119+
120+
/(.)\1/i
121+
\xfe\xde
122+
0: ��
123+
1: �
124+
118125
/\W+/
119126
>>>\xaa<<<
120127
0: >>>

testdata/testoutput3B

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# This set of tests checks local-specific features, using the "fr_FR" locale.
2-
# It is not Perl-compatible. When run via RunTest, the locale is edited to
2+
# It is almost Perl-compatible. When run via RunTest, the locale is edited to
33
# be whichever of "fr_FR", "french", or "fr" is found to exist. There is
44
# different version of this file called wintestinput3 for use on Windows,
55
# where the locale is called "french" and the tests are run using
@@ -16,11 +16,6 @@ No match
1616
�cole
1717
0: �cole
1818

19-
/^[\w]+/
20-
\= Expect no match
21-
�cole
22-
No match
23-
2419
/^[\W]+/
2520
�cole
2621
0: \xc9
@@ -115,6 +110,18 @@ No match
115110
\x9c
116111
No match
117112

113+
/�/i
114+
\xff
115+
0: �
116+
\= Expect no match
117+
y
118+
No match
119+
120+
/(.)\1/i
121+
\xfe\xde
122+
0: ��
123+
1: �
124+
118125
/\W+/
119126
>>>\xaa<<<
120127
0: >>>

0 commit comments

Comments
 (0)