Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lib/PPI/Normal.pm
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,11 @@ sub process {
&{"$function"}( $self->{Document} );
}

# Reset token offsets as they are not valid anymore
for my $token ($self->{Document}->tokens) {
$token->{_byte_start} = -1;
}

# Create the normalized Document object
my $Normalized = PPI::Document::Normalized->new(
Document => $self->{Document},
Expand Down
57 changes: 56 additions & 1 deletion lib/PPI/Token.pm
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ use Params::Util qw{_INSTANCE};
use PPI::Element ();
use PPI::Exception ();

require bytes;

use vars qw{$VERSION @ISA};
BEGIN {
$VERSION = '1.220';
Expand Down Expand Up @@ -83,7 +85,10 @@ use PPI::Token::Unknown ();
# Constructor and Related

# Construct a new Token.
#
# Parameters:
#   $_[0] - class to bless into
#   $_[1] - token content; stringified, defaults to the empty string
#   $_[2] - zero-based absolute byte offset of the token's first byte
#           within the Document, or -1 (the default) when unknown
#
# The diff artifact that left the superseded one-statement bless in
# place alongside its replacement is removed here; only the intended
# constructor remains.
sub new {
	bless {
		content     => (defined $_[1] ? "$_[1]" : ''),
		_byte_start => (defined $_[2] ? $_[2] : -1),
	}, $_[0];
}

sub set_class {
Expand Down Expand Up @@ -158,6 +163,56 @@ The C<length> method returns the length of the string in a Token.
sub length { CORE::length($_[0]->{content}) }


=pod

=head2 byte_span

Returns an arrayref with zero-based offsets of the first and last bytes of that
Token.

Offsets are absolute byte positions within a Document, meaning the very first
byte of the first token is always at position zero and the last byte of the
last token is always at position I<document size in bytes - 1>.

Example:

my $Document = PPI::Document->new( \'my $var = 42;' );
[ map($_->byte_span, $Document->tokens) ];

will produce the following:

[
[0, 2], # my
[3, 3], # whitespace
[4, 7], # $var
[8, 8], # whitespace
[9, 9], # =
[10, 10], # whitespace
[11, 12], # 42
[13, 13], # ;
]

Returns C<undef> for tokens with unknown position (e.g. tokens not attached to
a Document).

For some token types computing a byte span is not supported. Currently there's
only one unsupported type: L<PPI::Token::HereDoc>.
Tokens of that type still contribute to the total size of the Document but do
not have a span of their own (meaning this method will return C<undef>).

Normalising a Document invalidates offsets of all tokens, making this method
return C<undef>.

B<NOTE>: as the method name suggests, offsets are calculated in bytes, not
characters.

=cut

# Return the inclusive [first byte, last byte] span of this token, or
# undef when the token's position is unknown.
#
# Tokens that were never positioned by the tokenizer (or whose offsets
# were invalidated, e.g. by normalization) either lack the _byte_start
# key entirely or carry the sentinel value -1. The original code only
# checked for -1: a missing key numified undef to 0, passing the guard
# and yielding a garbage span of [undef, length-1] plus uninitialized
# warnings. Guard against both cases explicitly.
sub byte_span {
	my $start = $_[0]->{_byte_start};
	return undef if !defined $start or $start < 0;
	return [ $start, $start + bytes::length($_[0]->{content}) - 1 ];
}



Expand Down
18 changes: 17 additions & 1 deletion lib/PPI/Token/HereDoc.pm
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ BEGIN {
@ISA = 'PPI::Token';
}


require bytes;



Expand Down Expand Up @@ -214,6 +214,16 @@ sub __TOKENIZER__on_char {
# when we are re-assembling the file
$token->{_terminator_line} = $line;

# Actual content and terminator are not included when
# computing a HereDoc's byte length so we need to stash
# it so that we can manually fixup offsets later
#
# A line may contain multiple heredocs and we need them
# all so we're adding the length, not overwriting it
$t->{__current_heredoc_byte_length} +=
bytes::length(join("", @heredoc)) +
bytes::length($line);

# The HereDoc is now fully parsed
return $t->_finalize_token->__TOKENIZER__on_char( $t );
}
Expand Down Expand Up @@ -252,6 +262,12 @@ sub __TOKENIZER__on_char {
$t->_finalize_token->__TOKENIZER__on_char( $t );
}

# override byte_span() from the parent class because
# for heredocs byte offsets are undefined
# HereDoc tokens have no contiguous byte range in the source file
# (their content and terminator live on later lines), so the parent
# class's byte_span is overridden to always report "unknown position".
sub byte_span { return undef }

1;

=pod
Expand Down
19 changes: 17 additions & 2 deletions lib/PPI/Tokenizer.pm
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ BEGIN {
$VERSION = '1.220';
}

require bytes;

# The x operator cannot follow most Perl operators, implying that
# anything beginning with x following an operator is a word.
# These are the exceptions.
Expand Down Expand Up @@ -146,6 +148,11 @@ sub new {
class => 'PPI::Token::BOM',
zone => 'PPI::Token::Whitespace',

# Bookkeeping needed to track byte offsets
file_byte_cursor => 0,
__total_heredoc_byte_length => 0,
__current_heredoc_byte_length => 0,

# Output token buffer
tokens => [],
token_cursor => 0,
Expand Down Expand Up @@ -464,6 +471,8 @@ sub _fill_line {
$self->{line_length} = length $line;
$self->{line_count}++;

$self->{__total_heredoc_byte_length} += $self->{__current_heredoc_byte_length};

1;
}

Expand Down Expand Up @@ -600,10 +609,16 @@ sub _process_next_char {
# Returns the resulting parse class as a convenience.
sub _finalize_token {
my $self = shift;
return $self->{class} unless defined $self->{token};

defined(my $tok = $self->{token}) or return $self->{class};

# Include heredoc content and terminators
$tok->{_byte_start} = $self->{file_byte_cursor} + $self->{__total_heredoc_byte_length};

$self->{file_byte_cursor} += bytes::length($tok->{content});

# Add the token to the token buffer
push @{ $self->{tokens} }, $self->{token};
push @{ $self->{tokens} }, $tok;
$self->{token} = undef;

# Return the parse class to that of the zone we are in
Expand Down
2 changes: 2 additions & 0 deletions t/08_regression.t
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ SCOPE: {
# Check the regexp matches what we would expect (specifically
# the fine details about the sections.
my $expected = {
_byte_start => 0,
_sections => 2,
braced => 1,
content => 's {foo} <bar>i',
Expand Down Expand Up @@ -99,6 +100,7 @@ SCOPE: {

# Check the internal details as before
my $expected = {
_byte_start => 0,
_sections => 2,
_error => "No second section of regexp, or does not start with a balanced character",
braced => 1,
Expand Down
Loading