Perl-Critic · karenetheridge · Mar 21, 2014 · Mar 21, 2014 · Nov 2, 2014 · Nov 6, 2014
diff --git a/Changes b/Changes
@@ -1,4 +1,25 @@
 Revision history for Perl extension PPI
+	- Fix 1.218 regression where packages, subs, and words after
+	  labels like /^x\d+/ would parse as x operator (GitHub #122)
+	  (MOREGAN)
+
+1.221_01
+	Summary:
+	- support Perl 5.12 "package NAMESPACE VERSION BLOCK" syntax
+
+	Details:
+	- support Perl 5.12 "package NAMESPACE VERSION BLOCK" syntax
+	  (RT #67831, GitHub #70) (BDFOY, MOREGAN)
+	- Prevent package names like 'x' from being parsed as operators
+	  (GitHub #75) (MOREGAN)
+	- Prevent package names like 'v10' from being parsed as version
+	  strings (GitHub #76) (MOREGAN)
+	- Prevent sub names like 'v10' from being parsed as version
+	  strings (RT #74527, GitHub #65) (JAE, MOREGAN)
+	- Prevent 'use' and 'no' package names from being parsed as
+	  operators (MOREGAN)
+	- Prevent left side of fat comma from parsing as operator
+	  operators (MOREGAN)
 
 1.220 Tue 11 Nov 2014
 	Summary:

diff --git a/Makefile.PL b/Makefile.PL
@@ -40,6 +40,7 @@ test_requires 'Test::More'       => '0.86';
 test_requires 'Test::NoWarnings' => '0.084';
 test_requires 'Test::Object'     => '0.07';
 test_requires 'Test::SubCalls'   => '1.07';
+test_requires 'Test::Deep';
 
 # Force the existence of the weaken function
 # (which some distributions annoyingly don't have)

diff --git a/lib/PPI/Cache.pm b/lib/PPI/Cache.pm
@@ -268,7 +268,7 @@ sub _md5hex {
 	my $it     = _SCALAR($_[0])
 		? PPI::Util::md5hex(${$_[0]})
 		: $_[0];
-	return (defined $it and ! ref $it and $it =~ /^[a-f0-9]{32}\z/si)
+	return (defined $it and ! ref $it and $it =~ /^[[:xdigit:]]{32}\z/s)
 		? lc $it
 		: undef;
 }

diff --git a/lib/PPI/Lexer.pm b/lib/PPI/Lexer.pm
@@ -440,6 +440,28 @@ sub _statement {
 
 	# Is it a token in our known classes list
 	my $class = $STATEMENT_CLASSES{$Token->content};
+	if ( $class ) {
+		# Is the next significant token a =>
+		# Read ahead to the next significant token
+		my $Next;
+		while ( $Next = $self->_get_token ) {
+			if ( !$Next->significant ) {
+				push @{$self->{delayed}}, $Next;
+				next;
+			}
+
+			last if
+				!$Next->isa( 'PPI::Token::Operator' ) or $Next->content ne '=>';
+
+			# Got the next token
+			# Is an ordinary expression
+			$self->_rollback( $Next );
+			return 'PPI::Statement';
+		}
+
+		# Rollback and continue
+		$self->_rollback( $Next );
+	}
 
 	# Handle potential barewords for subscripts
 	if ( $Parent->isa('PPI::Structure::Subscript') ) {
@@ -533,8 +555,16 @@ sub _statement {
 			}
 
 			# Found the next significant token.
+			if (
+				$Next->isa('PPI::Token::Operator')
+				and
+				$Next->content eq '=>'
+			) {
+				# Is an ordinary expression
+				$self->_rollback( $Next );
+				return 'PPI::Statement';
 			# Is it a v6 use?
-			if ( $Next->content eq 'v6' ) {
+			} elsif ( $Next->content eq 'v6' ) {
 				$self->_rollback( $Next );
 				return 'PPI::Statement::Include::Perl6';
 			} else {
@@ -696,22 +726,23 @@ sub _continues {
 		return '';
 	}
 
-	# Alrighty then, there are only five implied end statement types,
-	# ::Scheduled blocks, ::Sub declarations, ::Compound, ::Given, and ::When
-	# statements.
-	unless ( ref($Statement) =~ /\b(?:Scheduled|Sub|Compound|Given|When)$/ ) {
-		return 1;
-	}
-
-	# Of these five, ::Scheduled, ::Sub, ::Given, and ::When follow the same
-	# simple rule and can be handled first.
+	# Alrighty then, there are six implied-end statement types:
+	# ::Scheduled blocks, ::Sub declarations, ::Compound, ::Given, ::When,
+	# and ::Package statements.
+	return 1
+		if ref $Statement !~ /\b(?:Scheduled|Sub|Compound|Given|When|Package)$/;
+
+	# Of these six, ::Scheduled, ::Sub, ::Given, and ::When follow the same
+	# simple rule and can be handled first.  The block form of ::Package
+	# follows the rule, too.  (The non-block form of ::Package
+	# requires a statement terminator, and thus doesn't need to have
+	# an implied end detected.)
 	my @part      = $Statement->schildren;
 	my $LastChild = $part[-1];
-	unless ( $Statement->isa('PPI::Statement::Compound') ) {
-		# If the last significant element of the statement is a block,
-		# then a scheduled statement is done, no questions asked.
-		return ! $LastChild->isa('PPI::Structure::Block');
-	}
+	# If the last significant element of the statement is a block,
+	# then an implied-end statement is done, no questions asked.
+	return !$LastChild->isa('PPI::Structure::Block')
+		if !$Statement->isa('PPI::Statement::Compound');
 
 	# Now we get to compound statements, which kind of suck (to lex).
 	# However, of them all, the 'if' type, which includes unless, are
@@ -1118,6 +1149,16 @@ sub _curly {
 					and return 'PPI::Structure::Subscript';
 			}
 		}
+
+		# Are we the last argument of sub?
+		# E.g.: 'sub foo {}', 'sub foo ($) {}'
+		return 'PPI::Structure::Block' if $Parent->isa('PPI::Statement::Sub');
+
+		# Are we the second or third argument of package?
+		# E.g.: 'package Foo {}' or 'package Foo v1.2.3 {}'
+		return 'PPI::Structure::Block'
+			if $Parent->isa('PPI::Statement::Package');
+
 		if ( $CURLY_CLASSES{$content} ) {
 			# Known type
 			return $CURLY_CLASSES{$content};

diff --git a/lib/PPI/Statement/Package.pm b/lib/PPI/Statement/Package.pm
@@ -47,6 +47,9 @@ BEGIN {
 	@ISA     = 'PPI::Statement';
 }
 
+# Lexer clues
+sub __LEXER__normal() { '' }
+
 =pod
 
 =head2 namespace

diff --git a/lib/PPI/Statement/Sub.pm b/lib/PPI/Statement/Sub.pm
@@ -165,17 +165,17 @@ Returns true if it is a special reserved subroutine, or false if not.
 sub reserved {
 	my $self = shift;
 	my $name = $self->name or return '';
+	# perlsub is silent on whether reserveds can contain:
+	# - underscores;
+	# we allow them due to existing practice like CLONE_SKIP and __SUB__.
+	# - numbers; we allow them by PPI tradition.
 	$name eq uc $name;
 }
 
 1;
 
 =pod
 
-=head1 TO DO
-
-- Write unit tests for this package
-
 =head1 SUPPORT
 
 See the L<support section|PPI/SUPPORT> in the main module.

diff --git a/lib/PPI/Token.pm b/lib/PPI/Token.pm
@@ -172,9 +172,9 @@ sub content {
 sub insert_before {
 	my $self    = shift;
 	my $Element = _INSTANCE(shift, 'PPI::Element')  or return undef;
-	if ( $Element->isa('PPI::Structure') ) {
-		return $self->__insert_before($Element);
-	} elsif ( $Element->isa('PPI::Token') ) {
+	if ( $Element->isa('PPI::Statement')
+	  || $Element->isa('PPI::Structure')
+	  || $Element->isa('PPI::Token') ) {
 		return $self->__insert_before($Element);
 	}
 	'';
@@ -184,9 +184,9 @@ sub insert_before {
 sub insert_after {
 	my $self    = shift;
 	my $Element = _INSTANCE(shift, 'PPI::Element') or return undef;
-	if ( $Element->isa('PPI::Structure') ) {
-		return $self->__insert_after($Element);
-	} elsif ( $Element->isa('PPI::Token') ) {
+	if ( $Element->isa('PPI::Statement')
+	  || $Element->isa('PPI::Structure')
+	  || $Element->isa('PPI::Token') ) {
 		return $self->__insert_after($Element);
 	}
 	'';

diff --git a/lib/PPI/Token/HereDoc.pm b/lib/PPI/Token/HereDoc.pm
@@ -201,14 +201,10 @@ sub __TOKENIZER__on_char {
 		return undef;
 	}
 
-	# Define $line outside of the loop, so that if we encounter the
-	# end of the file, we have access to the last line still.
-	my $line;
-
 	# Suck in the HEREDOC
-	$token->{_heredoc} = [];
+	$token->{_heredoc} = \my @heredoc;
 	my $terminator = $token->{_terminator} . "\n";
-	while ( defined($line = $t->_get_line) ) {
+	while ( defined( my $line = $t->_get_line ) ) {
 		if ( $line eq $terminator ) {
 			# Keep the actual termination line for consistency
 			# when we are re-assembling the file
@@ -219,29 +215,30 @@ sub __TOKENIZER__on_char {
 		}
 
 		# Add the line
-		push @{$token->{_heredoc}}, $line;
+		push @heredoc, $line;
 	}
 
 	# End of file.
 	# Error: Didn't reach end of here-doc before end of file.
-	# $line might be undef if we get NO lines.
-	if ( defined $line and $line eq $token->{_terminator} ) {
-		# If the last line matches the terminator
-		# but is missing the newline, we want to allow
-		# it anyway (like perl itself does). In this case
-		# perl would normally throw a warning, but we will
-		# also ignore that as well.
-		pop @{$token->{_heredoc}};
-		$token->{_terminator_line} = $line;
-	} else {
-		# The HereDoc was not properly terminated.
-		$token->{_terminator_line} = undef;
 
-		# Trim off the trailing whitespace
-		if ( defined $token->{_heredoc}->[-1] and $t->{source_eof_chop} ) {
-			chop $token->{_heredoc}->[-1];
+	# If the here-doc block is not empty, look at the last line to determine if
+	# the here-doc terminator is missing a newline (which Perl would fail to
+	# compile but is easy to detect) or if the here-doc block was just not
+	# terminated at all (which Perl would fail to compile as well).
+	$token->{_terminator_line} = undef;
+	if ( @heredoc and defined $heredoc[-1] ) {
+		# See PPI::Tokenizer, the algorithm there adds a space at the end of the
+		# document that we need to make sure we remove.
+		if ( $t->{source_eof_chop} ) {
+			chop $heredoc[-1];
 			$t->{source_eof_chop} = '';
 		}
+
+		# Check if the last line of the file matches the terminator without
+		# newline at the end. If so, remove it from the content and set it as
+		# the terminator line.
+		$token->{_terminator_line} = pop @heredoc
+		  if $heredoc[-1] eq $token->{_terminator};
 	}
 
 	# Set a hint for PPI::Document->serialize so it can

diff --git a/lib/PPI/Token/Number/Hex.pm b/lib/PPI/Token/Number/Hex.pm
@@ -76,7 +76,7 @@ sub __TOKENIZER__on_char {
 	# Allow underscores straight through
 	return 1 if $char eq '_';
 
-	if ( $char =~ /[\da-f]/i ) {
+	if ( $char =~ /[[:xdigit:]]/ ) {
 		return 1;
 	}
 

diff --git a/lib/PPI/Token/Number/Version.pm b/lib/PPI/Token/Number/Version.pm
@@ -102,17 +102,23 @@ sub __TOKENIZER__on_char {
 sub __TOKENIZER__commit {
 	my $t = $_[1];
 
-	# Get the rest of the line
+	# Capture the rest of the token
 	pos $t->{line} = $t->{line_cursor};
-	if ( $t->{line} !~ m/\G(v\d+(?:\.\d+)*)/gc ) {
+	if ( $t->{line} !~ m/\G(v\d+(?:\.\d+)+|v\d+\b)/gc ) {
 		# This was not a v-string after all (it's a word)
 		return PPI::Token::Word->__TOKENIZER__commit($t);
 	}
 
+	my $content = $1;
+
+	# If there are no periods this could be a word starting with v\d
+	# Forced to be a word. Done.
+	return PPI::Token::Word->__TOKENIZER__commit($t)
+		if $content !~ /\./ and $t->__current_token_is_forced_word;
+
 	# This is a v-string
-	my $vstring = $1;
-	$t->{line_cursor} += length($vstring);
-	$t->_new_token('Number::Version', $vstring);
+	$t->{line_cursor} += length $content;
+	$t->_new_token( 'Number::Version', $content );
 	$t->_finalize_token->__TOKENIZER__on_char($t);
 }
 

diff --git a/lib/PPI/Token/QuoteLike/Words.pm b/lib/PPI/Token/QuoteLike/Words.pm
@@ -42,20 +42,27 @@ BEGIN {
 
 =head2 literal
 
-Returns the words contained.  Note that this method does not check the
+Returns the words contained as a list.  Note that this method does not check the
 context that the token is in; it always returns the list and not merely
 the last element if the token is in scalar context.
 
 =cut
 
 sub literal {
-	my $self    = shift;
-	my $section = $self->{sections}->[0];
-	return split ' ', substr(
-		$self->{content},
-		$section->{position},
-		$section->{size},
-	);
+	my ( $self ) = @_;
+
+	my $content = $self->_section_content(0);
+	return if !defined $content;
+
+	# Undo backslash escaping of '\', the left delimiter,
+	# and the right delimiter.  The right delimiter will
+	# only exist with paired delimiters: qw() qw[] qw<> qw{}.
+	my ( $left, $right ) = ( $self->_delimiters, '', '' );
+	$content =~ s/\\([\Q$left$right\\\E])/$1/g;
+
+	my @words = split ' ', $content;
+
+	return @words;
 }
 
 1;

diff --git a/lib/PPI/Token/Unknown.pm b/lib/PPI/Token/Unknown.pm
@@ -90,7 +90,7 @@ sub __TOKENIZER__on_char {
 				} elsif (
 					$p0->isa('PPI::Token::Structure')
 					and
-					$p0->content =~ /^(?:\)|\])$/
+					$p0->content =~ /^(?:\)|\]|\})$/
 				) {
 					$_class = 'Operator';
 				} else {

diff --git a/lib/PPI/Token/Whitespace.pm b/lib/PPI/Token/Whitespace.pm
@@ -203,7 +203,8 @@ sub __TOKENIZER__on_line_start {
 
 sub __TOKENIZER__on_char {
 	my $t    = $_[1];
-	my $char = ord substr $t->{line}, $t->{line_cursor}, 1;
+	my $c = substr $t->{line}, $t->{line_cursor}, 1;
+	my $char = ord $c;
 
 	# Do we definitely know what something is?
 	return $COMMITMAP[$char]->__TOKENIZER__commit($t) if $COMMITMAP[$char];
@@ -407,9 +408,9 @@ sub __TOKENIZER__on_char {
 		}
 
 	} elsif ( $char >= 128 ) { # Outside ASCII
-		return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $t =~ /\w/;
-		return 'Whitespace' if $t =~ /\s/;
-        }
+		return 'PPI::Token::Word'->__TOKENIZER__commit($t) if $c =~ /\w/;
+		return 'Whitespace' if $c =~ /\s/;
+	}
 
 
 	# All the whitespaces are covered, so what to do