Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lib/PPI/Normal.pm
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ sub process {
&{"$function"}( $self->{Document} );
}

# Reset token offsets as they are not valid anymore
for my $token ($self->{Document}->tokens) {
$token->{_byte_start} = -1;
}

# Create the normalized Document object
my $Normalized = PPI::Document::Normalized->new(
Document => $self->{Document},
Expand Down
57 changes: 56 additions & 1 deletion lib/PPI/Token.pm
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ use Params::Util qw{_INSTANCE};
use PPI::Element ();
use PPI::Exception ();

require bytes;

use vars qw{$VERSION @ISA};
BEGIN {
$VERSION = '1.220';
Expand Down Expand Up @@ -83,7 +85,10 @@ use PPI::Token::Unknown ();
# Constructor and Related

# Construct a new Token.
#
# Parameters:
#   $_[0] - class to bless into
#   $_[1] - token content; stringified, defaults to the empty string
#   $_[2] - zero-based absolute byte offset of the token's first byte
#           within the Document, or -1 (the default) when unknown
#
# The diff artifact that left the superseded one-statement bless in
# place alongside its replacement is removed here; only the intended
# constructor remains.
sub new {
	bless {
		content     => (defined $_[1] ? "$_[1]" : ''),
		_byte_start => (defined $_[2] ? $_[2] : -1),
	}, $_[0];
}

sub set_class {
Expand Down Expand Up @@ -158,6 +163,56 @@ The C<length> method returns the length of the string in a Token.
sub length { CORE::length($_[0]->{content}) }


=pod

=head2 byte_span

Returns an arrayref with zero-based offsets of the first and last bytes of that
Token.

Offsets are absolute byte positions within a Document, meaning the very first
byte of the first token is always at position zero and the last byte of the
last token is always at position I<document size in bytes - 1>.

Example:

my $Document = PPI::Document->new( \'my $var = 42;' );
[ map($_->byte_span, $Document->tokens) ];

will produce the following:

[
[0, 2], # my
[3, 3], # whitespace
[4, 7], # $var
[8, 8], # whitespace
[9, 9], # =
[10, 10], # whitespace
[11, 12], # 42
[13, 13], # ;
]

Returns C<undef> for tokens with unknown position (e.g. tokens not attached to
a Document).

For some token types computing a byte span is not supported. Currently there's
only one unsupported type: L<PPI::Token::HereDoc>.
Tokens of that type still contribute to the total size of the Document but do
not have a span of their own (meaning this method will return C<undef>).

Normalising a Document invalidates offsets of all tokens, making this method
return C<undef>.

B<NOTE>: as the method name suggests, offsets are calculated in bytes, not
characters.

=cut

# Return the inclusive [first byte, last byte] span of this token, or
# undef when the token's position is unknown.
#
# Tokens that were never positioned by the tokenizer (or whose offsets
# were invalidated, e.g. by normalization) either lack the _byte_start
# key entirely or carry the sentinel value -1. The original code only
# checked for -1: a missing key numified undef to 0, passing the guard
# and yielding a garbage span of [undef, length-1] plus uninitialized
# warnings. Guard against both cases explicitly.
sub byte_span {
	my $start = $_[0]->{_byte_start};
	return undef if !defined $start or $start < 0;
	return [ $start, $start + bytes::length($_[0]->{content}) - 1 ];
}



Expand Down
18 changes: 17 additions & 1 deletion lib/PPI/Token/HereDoc.pm
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ BEGIN {
@ISA = 'PPI::Token';
}


require bytes;



Expand Down Expand Up @@ -214,6 +214,16 @@ sub __TOKENIZER__on_char {
# when we are re-assembling the file
$token->{_terminator_line} = $line;

# Actual content and terminator are not included when
# computing a HereDoc's byte length so we need to stash
# it so that we can manually fixup offsets later
#
# A line may contain multiple heredocs and we need them
# all so we're adding the length, not overwriting it
$t->{__current_heredoc_byte_length} +=
bytes::length(join("", @heredoc)) +
bytes::length($line);

# The HereDoc is now fully parsed
return $t->_finalize_token->__TOKENIZER__on_char( $t );
}
Expand Down Expand Up @@ -252,6 +262,12 @@ sub __TOKENIZER__on_char {
$t->_finalize_token->__TOKENIZER__on_char( $t );
}

# override byte_span() from the parent class because
# for heredocs byte offsets are undefined
# HereDoc tokens have no contiguous byte range in the source file
# (their content and terminator live on later lines), so the parent
# class's byte_span is overridden to always report "unknown position".
sub byte_span { return undef }

1;

=pod
Expand Down
19 changes: 17 additions & 2 deletions lib/PPI/Tokenizer.pm
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ BEGIN {
$VERSION = '1.220';
}

require bytes;

# The x operator cannot follow most Perl operators, implying that
# anything beginning with x following an operator is a word.
# These are the exceptions.
Expand Down Expand Up @@ -146,6 +148,11 @@ sub new {
class => 'PPI::Token::BOM',
zone => 'PPI::Token::Whitespace',

# Bookkeeping needed to track byte offsets
file_byte_cursor => 0,
__total_heredoc_byte_length => 0,
__current_heredoc_byte_length => 0,

# Output token buffer
tokens => [],
token_cursor => 0,
Expand Down Expand Up @@ -464,6 +471,8 @@ sub _fill_line {
$self->{line_length} = length $line;
$self->{line_count}++;

$self->{__total_heredoc_byte_length} += $self->{__current_heredoc_byte_length};

1;
}

Expand Down Expand Up @@ -600,10 +609,16 @@ sub _process_next_char {
# Returns the resulting parse class as a convenience.
sub _finalize_token {
my $self = shift;
return $self->{class} unless defined $self->{token};

defined(my $tok = $self->{token}) or return $self->{class};

# Include heredoc content and terminators
$tok->{_byte_start} = $self->{file_byte_cursor} + $self->{__total_heredoc_byte_length};

$self->{file_byte_cursor} += bytes::length($tok->{content});

# Add the token to the token buffer
push @{ $self->{tokens} }, $self->{token};
push @{ $self->{tokens} }, $tok;
$self->{token} = undef;

# Return the parse class to that of the zone we are in
Expand Down
2 changes: 2 additions & 0 deletions t/08_regression.t
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ SCOPE: {
# Check the regexp matches what we would expect (specifically
# the fine details about the sections.
my $expected = {
_byte_start => 0,
_sections => 2,
braced => 1,
content => 's {foo} <bar>i',
Expand Down Expand Up @@ -99,6 +100,7 @@ SCOPE: {

# Check the internal details as before
my $expected = {
_byte_start => 0,
_sections => 2,
_error => "No second section of regexp, or does not start with a balanced character",
braced => 1,
Expand Down
Loading