| Filename | /Users/timbo/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/PPIx/Regexp/Lexer.pm |
| Statements | Executed 59 statements in 3.10ms |
| Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 3.13ms | 24.1ms | PPIx::Regexp::Lexer::BEGIN@61 |
| 1 | 1 | 1 | 1.02ms | 1.21ms | PPIx::Regexp::Lexer::BEGIN@44 |
| 1 | 1 | 1 | 281µs | 384µs | PPIx::Regexp::Lexer::BEGIN@58 |
| 1 | 1 | 1 | 257µs | 350µs | PPIx::Regexp::Lexer::BEGIN@55 |
| 1 | 1 | 1 | 250µs | 372µs | PPIx::Regexp::Lexer::BEGIN@49 |
| 1 | 1 | 1 | 246µs | 372µs | PPIx::Regexp::Lexer::BEGIN@46 |
| 1 | 1 | 1 | 227µs | 318µs | PPIx::Regexp::Lexer::BEGIN@51 |
| 1 | 1 | 1 | 193µs | 286µs | PPIx::Regexp::Lexer::BEGIN@48 |
| 1 | 1 | 1 | 192µs | 299µs | PPIx::Regexp::Lexer::BEGIN@47 |
| 1 | 1 | 1 | 184µs | 317µs | PPIx::Regexp::Lexer::BEGIN@53 |
| 1 | 1 | 1 | 178µs | 270µs | PPIx::Regexp::Lexer::BEGIN@52 |
| 1 | 1 | 1 | 162µs | 259µs | PPIx::Regexp::Lexer::BEGIN@54 |
| 1 | 1 | 1 | 152µs | 632µs | PPIx::Regexp::Lexer::BEGIN@60 |
| 1 | 1 | 1 | 138µs | 236µs | PPIx::Regexp::Lexer::BEGIN@59 |
| 1 | 1 | 1 | 133µs | 230µs | PPIx::Regexp::Lexer::BEGIN@50 |
| 1 | 1 | 1 | 131µs | 230µs | PPIx::Regexp::Lexer::BEGIN@45 |
| 1 | 1 | 1 | 130µs | 228µs | PPIx::Regexp::Lexer::BEGIN@57 |
| 1 | 1 | 1 | 129µs | 221µs | PPIx::Regexp::Lexer::BEGIN@43 |
| 1 | 1 | 1 | 114µs | 204µs | PPIx::Regexp::Lexer::BEGIN@56 |
| 1 | 1 | 1 | 12µs | 24µs | PPIx::Regexp::Lexer::BEGIN@36 |
| 1 | 1 | 1 | 8µs | 30µs | PPIx::Regexp::Lexer::BEGIN@62 |
| 1 | 1 | 1 | 8µs | 12µs | PPIx::Regexp::Lexer::BEGIN@37 |
| 1 | 1 | 1 | 7µs | 29µs | PPIx::Regexp::Lexer::BEGIN@41 |
| 1 | 1 | 1 | 7µs | 27µs | PPIx::Regexp::Lexer::BEGIN@42 |
| 1 | 1 | 1 | 7µs | 468µs | PPIx::Regexp::Lexer::BEGIN@39 |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_curly |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_finalize |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_get_delimited |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_get_token |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_in_regex_set |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_make_node |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_recover_curly |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_recover_curly_quantifiers |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_regex_set |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_round |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_square |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::_unget_token |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::errstr |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::failures |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::lex |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Lexer::new |
| Line | State ments |
Time on line |
Calls | Time in subs |
Code |
|---|---|---|---|---|---|
| 1 | =head1 NAME | ||||
| 2 | |||||
| 3 | PPIx::Regexp::Lexer - Assemble tokenizer output. | ||||
| 4 | |||||
| 5 | =head1 SYNOPSIS | ||||
| 6 | |||||
| 7 | use PPIx::Regexp::Lexer; | ||||
| 8 | use PPIx::Regexp::Dumper; | ||||
| 9 | my $lex = PPIx::Regexp::Lexer->new('qr{foo}smx'); | ||||
| 10 | my $dmp = PPIx::Regexp::Dumper->new( $lex ); | ||||
| 11 | $dmp->print(); | ||||
| 12 | |||||
| 13 | =head1 INHERITANCE | ||||
| 14 | |||||
| 15 | C<PPIx::Regexp::Lexer> is a | ||||
| 16 | L<PPIx::Regexp::Support|PPIx::Regexp::Support>. | ||||
| 17 | |||||
| 18 | C<PPIx::Regexp::Lexer> has no descendants. | ||||
| 19 | |||||
| 20 | =head1 DESCRIPTION | ||||
| 21 | |||||
| 22 | This class takes the token stream generated by | ||||
| 23 | L<PPIx::Regexp::Tokenizer|PPIx::Regexp::Tokenizer> and generates the | ||||
| 24 | parse tree. | ||||
| 25 | |||||
| 26 | =head1 METHODS | ||||
| 27 | |||||
| 28 | This class provides the following public methods. Methods not documented | ||||
| 29 | here are private, and unsupported in the sense that the author reserves | ||||
| 30 | the right to change or remove them without notice. | ||||
| 31 | |||||
| 32 | =cut | ||||
| 33 | |||||
| 34 | package PPIx::Regexp::Lexer; | ||||
| 35 | |||||
| 36 | 2 | 21µs | 2 | 36µs | # spent 24µs (12+12) within PPIx::Regexp::Lexer::BEGIN@36 which was called:
# once (12µs+12µs) by PPIx::Regexp::BEGIN@90 at line 36 # spent 24µs making 1 call to PPIx::Regexp::Lexer::BEGIN@36
# spent 12µs making 1 call to strict::import |
| 37 | 2 | 24µs | 2 | 17µs | # spent 12µs (8+5) within PPIx::Regexp::Lexer::BEGIN@37 which was called:
# once (8µs+5µs) by PPIx::Regexp::BEGIN@90 at line 37 # spent 12µs making 1 call to PPIx::Regexp::Lexer::BEGIN@37
# spent 5µs making 1 call to warnings::import |
| 38 | |||||
| 39 | 2 | 23µs | 2 | 929µs | # spent 468µs (7+461) within PPIx::Regexp::Lexer::BEGIN@39 which was called:
# once (7µs+461µs) by PPIx::Regexp::BEGIN@90 at line 39 # spent 468µs making 1 call to PPIx::Regexp::Lexer::BEGIN@39
# spent 461µs making 1 call to base::import |
| 40 | |||||
| 41 | 2 | 21µs | 2 | 50µs | # spent 29µs (7+21) within PPIx::Regexp::Lexer::BEGIN@41 which was called:
# once (7µs+21µs) by PPIx::Regexp::BEGIN@90 at line 41 # spent 29µs making 1 call to PPIx::Regexp::Lexer::BEGIN@41
# spent 21µs making 1 call to Exporter::import |
| 42 | 2 | 18µs | 2 | 48µs | # spent 27µs (7+20) within PPIx::Regexp::Lexer::BEGIN@42 which was called:
# once (7µs+20µs) by PPIx::Regexp::BEGIN@90 at line 42 # spent 27µs making 1 call to PPIx::Regexp::Lexer::BEGIN@42
# spent 20µs making 1 call to Exporter::import |
| 43 | 2 | 90µs | 1 | 221µs | # spent 221µs (129+92) within PPIx::Regexp::Lexer::BEGIN@43 which was called:
# once (129µs+92µs) by PPIx::Regexp::BEGIN@90 at line 43 # spent 221µs making 1 call to PPIx::Regexp::Lexer::BEGIN@43 |
| 44 | 2 | 97µs | 1 | 1.21ms | # spent 1.21ms (1.02+195µs) within PPIx::Regexp::Lexer::BEGIN@44 which was called:
# once (1.02ms+195µs) by PPIx::Regexp::BEGIN@90 at line 44 # spent 1.21ms making 1 call to PPIx::Regexp::Lexer::BEGIN@44 |
| 45 | 2 | 91µs | 1 | 230µs | # spent 230µs (131+99) within PPIx::Regexp::Lexer::BEGIN@45 which was called:
# once (131µs+99µs) by PPIx::Regexp::BEGIN@90 at line 45 # spent 230µs making 1 call to PPIx::Regexp::Lexer::BEGIN@45 |
| 46 | 2 | 83µs | 1 | 372µs | # spent 372µs (246+125) within PPIx::Regexp::Lexer::BEGIN@46 which was called:
# once (246µs+125µs) by PPIx::Regexp::BEGIN@90 at line 46 # spent 372µs making 1 call to PPIx::Regexp::Lexer::BEGIN@46 |
| 47 | 2 | 92µs | 1 | 299µs | # spent 299µs (192+107) within PPIx::Regexp::Lexer::BEGIN@47 which was called:
# once (192µs+107µs) by PPIx::Regexp::BEGIN@90 at line 47 # spent 299µs making 1 call to PPIx::Regexp::Lexer::BEGIN@47 |
| 48 | 2 | 89µs | 1 | 286µs | # spent 286µs (193+93) within PPIx::Regexp::Lexer::BEGIN@48 which was called:
# once (193µs+93µs) by PPIx::Regexp::BEGIN@90 at line 48 # spent 286µs making 1 call to PPIx::Regexp::Lexer::BEGIN@48 |
| 49 | 2 | 91µs | 1 | 372µs | # spent 372µs (250+123) within PPIx::Regexp::Lexer::BEGIN@49 which was called:
# once (250µs+123µs) by PPIx::Regexp::BEGIN@90 at line 49 # spent 372µs making 1 call to PPIx::Regexp::Lexer::BEGIN@49 |
| 50 | 2 | 92µs | 1 | 230µs | # spent 230µs (133+97) within PPIx::Regexp::Lexer::BEGIN@50 which was called:
# once (133µs+97µs) by PPIx::Regexp::BEGIN@90 at line 50 # spent 230µs making 1 call to PPIx::Regexp::Lexer::BEGIN@50 |
| 51 | 2 | 84µs | 1 | 318µs | # spent 318µs (227+91) within PPIx::Regexp::Lexer::BEGIN@51 which was called:
# once (227µs+91µs) by PPIx::Regexp::BEGIN@90 at line 51 # spent 318µs making 1 call to PPIx::Regexp::Lexer::BEGIN@51 |
| 52 | 2 | 85µs | 1 | 270µs | # spent 270µs (178+91) within PPIx::Regexp::Lexer::BEGIN@52 which was called:
# once (178µs+91µs) by PPIx::Regexp::BEGIN@90 at line 52 # spent 270µs making 1 call to PPIx::Regexp::Lexer::BEGIN@52 |
| 53 | 2 | 92µs | 1 | 317µs | # spent 317µs (184+133) within PPIx::Regexp::Lexer::BEGIN@53 which was called:
# once (184µs+133µs) by PPIx::Regexp::BEGIN@90 at line 53 # spent 317µs making 1 call to PPIx::Regexp::Lexer::BEGIN@53 |
| 54 | 2 | 82µs | 1 | 259µs | # spent 259µs (162+97) within PPIx::Regexp::Lexer::BEGIN@54 which was called:
# once (162µs+97µs) by PPIx::Regexp::BEGIN@90 at line 54 # spent 259µs making 1 call to PPIx::Regexp::Lexer::BEGIN@54 |
| 55 | 2 | 83µs | 1 | 350µs | # spent 350µs (257+93) within PPIx::Regexp::Lexer::BEGIN@55 which was called:
# once (257µs+93µs) by PPIx::Regexp::BEGIN@90 at line 55 # spent 350µs making 1 call to PPIx::Regexp::Lexer::BEGIN@55 |
| 56 | 2 | 74µs | 1 | 204µs | # spent 204µs (114+91) within PPIx::Regexp::Lexer::BEGIN@56 which was called:
# once (114µs+91µs) by PPIx::Regexp::BEGIN@90 at line 56 # spent 204µs making 1 call to PPIx::Regexp::Lexer::BEGIN@56 |
| 57 | 2 | 85µs | 1 | 228µs | # spent 228µs (130+97) within PPIx::Regexp::Lexer::BEGIN@57 which was called:
# once (130µs+97µs) by PPIx::Regexp::BEGIN@90 at line 57 # spent 228µs making 1 call to PPIx::Regexp::Lexer::BEGIN@57 |
| 58 | 2 | 86µs | 1 | 384µs | # spent 384µs (281+103) within PPIx::Regexp::Lexer::BEGIN@58 which was called:
# once (281µs+103µs) by PPIx::Regexp::BEGIN@90 at line 58 # spent 384µs making 1 call to PPIx::Regexp::Lexer::BEGIN@58 |
| 59 | 2 | 97µs | 1 | 236µs | # spent 236µs (138+98) within PPIx::Regexp::Lexer::BEGIN@59 which was called:
# once (138µs+98µs) by PPIx::Regexp::BEGIN@90 at line 59 # spent 236µs making 1 call to PPIx::Regexp::Lexer::BEGIN@59 |
| 60 | 2 | 105µs | 1 | 632µs | # spent 632µs (152+480) within PPIx::Regexp::Lexer::BEGIN@60 which was called:
# once (152µs+480µs) by PPIx::Regexp::BEGIN@90 at line 60 # spent 632µs making 1 call to PPIx::Regexp::Lexer::BEGIN@60 |
| 61 | 2 | 82µs | 1 | 24.1ms | # spent 24.1ms (3.13+20.9) within PPIx::Regexp::Lexer::BEGIN@61 which was called:
# once (3.13ms+20.9ms) by PPIx::Regexp::BEGIN@90 at line 61 # spent 24.1ms making 1 call to PPIx::Regexp::Lexer::BEGIN@61 |
| 62 | 2 | 1.30ms | 2 | 52µs | # spent 30µs (8+22) within PPIx::Regexp::Lexer::BEGIN@62 which was called:
# once (8µs+22µs) by PPIx::Regexp::BEGIN@90 at line 62 # spent 30µs making 1 call to PPIx::Regexp::Lexer::BEGIN@62
# spent 22µs making 1 call to Exporter::import |
| 63 | |||||
| 64 | 1 | 700ns | our $VERSION = '0.036'; | ||
| 65 | |||||
| 66 | =head2 new | ||||
| 67 | |||||
| 68 | This method instantiates the lexer. It takes as its argument either a | ||||
| 69 | L<PPIx::Regexp::Tokenizer|PPIx::Regexp::Tokenizer> or the text to be | ||||
| 70 | parsed. In the latter case the tokenizer is instantiated from the text. | ||||
| 71 | |||||
| 72 | Any optional name/value pairs after the first argument are passed to the | ||||
| 73 | tokenizer, which interprets them or not as the case may be. | ||||
| 74 | |||||
| 75 | =cut | ||||
| 76 | |||||
| 77 | { | ||||
| 78 | |||||
| 79 | 2 | 700ns | my $errstr; | ||
| 80 | |||||
| 81 | sub new { | ||||
| 82 | my ( $class, $tokenizer, %args ) = @_; | ||||
| 83 | ref $class and $class = ref $class; | ||||
| 84 | |||||
| 85 | __instance( $tokenizer, 'PPIx::Regexp::Tokenizer' ) | ||||
| 86 | or $tokenizer = PPIx::Regexp::Tokenizer->new( $tokenizer, %args ) | ||||
| 87 | or do { | ||||
| 88 | $errstr = PPIx::Regexp::Tokenizer->errstr(); | ||||
| 89 | return; | ||||
| 90 | }; | ||||
| 91 | |||||
| 92 | my $self = { | ||||
| 93 | deferred => [], # Deferred tokens | ||||
| 94 | failures => 0, | ||||
| 95 | tokenizer => $tokenizer, | ||||
| 96 | }; | ||||
| 97 | |||||
| 98 | bless $self, $class; | ||||
| 99 | return $self; | ||||
| 100 | } | ||||
| 101 | |||||
| 102 | sub errstr { | ||||
| 103 | return $errstr; | ||||
| 104 | } | ||||
| 105 | |||||
| 106 | } | ||||
| 107 | |||||
| 108 | =head2 errstr | ||||
| 109 | |||||
| 110 | This method returns the error string from the last attempt to | ||||
| 111 | instantiate a C<PPIx::Regexp::Lexer>. If the last attempt succeeded, the | ||||
| 112 | error will be C<undef>. | ||||
| 113 | |||||
| 114 | =cut | ||||
| 115 | |||||
| 116 | # Defined above | ||||
| 117 | |||||
| 118 | =head2 failures | ||||
| 119 | |||||
| 120 | print $lexer->failures(), " parse failures\n"; | ||||
| 121 | |||||
| 122 | This method returns the number of parse failures encountered. A | ||||
| 123 | parse failure is either a tokenization failure (see | ||||
| 124 | L<< PPIx::Regexp::Tokenizer->failures()|PPIx::Regexp::Tokenizer/failures >>) | ||||
| 125 | or a structural error. | ||||
| 126 | |||||
| 127 | =cut | ||||
| 128 | |||||
| 129 | sub failures { | ||||
| 130 | my ( $self ) = @_; | ||||
| 131 | return $self->{failures}; | ||||
| 132 | } | ||||
| 133 | |||||
| 134 | =head2 lex | ||||
| 135 | |||||
| 136 | This method lexes the tokens in the text, and returns the lexed list of | ||||
| 137 | elements. | ||||
| 138 | |||||
| 139 | =cut | ||||
| 140 | |||||
| 141 | sub lex { | ||||
| 142 | my ( $self ) = @_; | ||||
| 143 | |||||
| 144 | my @content; | ||||
| 145 | $self->{failures} = 0; | ||||
| 146 | |||||
| 147 | # Accept everything up to the first delimiter. | ||||
| 148 | { | ||||
| 149 | my $token = $self->_get_token() | ||||
| 150 | or return $self->_finalize( @content ); | ||||
| 151 | $token->isa( 'PPIx::Regexp::Token::Delimiter' ) or do { | ||||
| 152 | push @content, $token; | ||||
| 153 | redo; | ||||
| 154 | }; | ||||
| 155 | $self->_unget_token( $token ); | ||||
| 156 | } | ||||
| 157 | |||||
| 158 | # Accept the first delimited structure. | ||||
| 159 | push @content, ( my $regexp = $self->_get_delimited( | ||||
| 160 | 'PPIx::Regexp::Structure::Regexp' ) ); | ||||
| 161 | |||||
| 162 | # If we are a substitution ... | ||||
| 163 | if ( $content[0]->content() eq 's' ) { | ||||
| 164 | |||||
| 165 | # Accept any insignificant stuff. | ||||
| 166 | while ( my $token = $self->_get_token() ) { | ||||
| 167 | if ( $token->significant() ) { | ||||
| 168 | $self->_unget_token( $token ); | ||||
| 169 | last; | ||||
| 170 | } else { | ||||
| 171 | push @content, $token; | ||||
| 172 | } | ||||
| 173 | } | ||||
| 174 | |||||
| 175 | # Figure out if we should expect an opening bracket. | ||||
| 176 | my $expect_open_bracket = $self->close_bracket( | ||||
| 177 | $regexp->start( 0 ) ) || 0; | ||||
| 178 | |||||
| 179 | # Accept the next delimited structure. | ||||
| 180 | push @content, $self->_get_delimited( | ||||
| 181 | 'PPIx::Regexp::Structure::Replacement', | ||||
| 182 | $expect_open_bracket, | ||||
| 183 | ); | ||||
| 184 | } | ||||
| 185 | |||||
| 186 | # Accept the modifiers (we hope!) plus any trailing white space. | ||||
| 187 | while ( my $token = $self->_get_token() ) { | ||||
| 188 | push @content, $token; | ||||
| 189 | } | ||||
| 190 | |||||
| 191 | # Let all the elements finalize themselves, recording any additional | ||||
| 192 | # errors as they do so. | ||||
| 193 | $self->_finalize( @content ); | ||||
| 194 | |||||
| 195 | # If we found a regular expression (and we should have done so) ... | ||||
| 196 | if ( $regexp ) { | ||||
| 197 | |||||
| 198 | # Retrieve the maximum capture group. | ||||
| 199 | my $max_capture = $regexp->max_capture_number(); | ||||
| 200 | |||||
| 201 | # Hashify the known capture names | ||||
| 202 | my $capture_name = { | ||||
| 203 | map { $_ => 1 } $regexp->capture_names(), | ||||
| 204 | }; | ||||
| 205 | |||||
| 206 | # For all the backreferences found | ||||
| 207 | foreach my $elem ( @{ $regexp->find( | ||||
| 208 | 'PPIx::Regexp::Token::Backreference' ) || [] } ) { | ||||
| 209 | # Rebless them as needed, recording any errors found. | ||||
| 210 | $self->{failures} += | ||||
| 211 | $elem->__PPIX_LEXER__rebless( | ||||
| 212 | capture_name => $capture_name, | ||||
| 213 | max_capture => $max_capture, | ||||
| 214 | ); | ||||
| 215 | } | ||||
| 216 | } | ||||
| 217 | |||||
| 218 | return @content; | ||||
| 219 | |||||
| 220 | } | ||||
| 221 | |||||
| 222 | # Finalize the content array, updating the parse failures count as we | ||||
| 223 | # go. | ||||
| 224 | sub _finalize { | ||||
| 225 | my ( $self, @content ) = @_; | ||||
| 226 | foreach my $elem ( @content ) { | ||||
| 227 | $self->{failures} += $elem->__PPIX_LEXER__finalize(); | ||||
| 228 | } | ||||
| 229 | defined wantarray and return @content; | ||||
| 230 | return; | ||||
| 231 | } | ||||
| 232 | |||||
| 233 | { | ||||
| 234 | |||||
| 235 | 2 | 2µs | my %bracket = ( | ||
| 236 | '{' => '}', | ||||
| 237 | '(' => ')', | ||||
| 238 | '[' => ']', | ||||
| 239 | '(?[' => '])', | ||||
| 240 | ## '<' => '>', | ||||
| 241 | ); | ||||
| 242 | |||||
| 243 | 1 | 1µs | my %unclosed = ( | ||
| 244 | '{' => '_recover_curly', | ||||
| 245 | ); | ||||
| 246 | |||||
| 247 | sub _get_delimited { | ||||
| 248 | my ( $self, $class, $expect_open_bracket ) = @_; | ||||
| 249 | defined $expect_open_bracket or $expect_open_bracket = 1; | ||||
| 250 | |||||
| 251 | my @rslt; | ||||
| 252 | $self->{_rslt} = \@rslt; | ||||
| 253 | |||||
| 254 | if ( $expect_open_bracket ) { | ||||
| 255 | if ( my $token = $self->_get_token() ) { | ||||
| 256 | push @rslt, []; | ||||
| 257 | if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) { | ||||
| 258 | push @{ $rslt[-1] }, '', $token; | ||||
| 259 | } else { | ||||
| 260 | push @{ $rslt[-1] }, '', undef; | ||||
| 261 | $self->_unget_token( $token ); | ||||
| 262 | } | ||||
| 263 | } else { | ||||
| 264 | return; | ||||
| 265 | } | ||||
| 266 | } else { | ||||
| 267 | push @rslt, [ '', undef ]; | ||||
| 268 | } | ||||
| 269 | |||||
| 270 | while ( my $token = $self->_get_token() ) { | ||||
| 271 | if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) { | ||||
| 272 | $self->_unget_token( $token ); | ||||
| 273 | last; | ||||
| 274 | } | ||||
| 275 | if ( $token->isa( 'PPIx::Regexp::Token::Structure' ) ) { | ||||
| 276 | my $content = $token->content(); | ||||
| 277 | |||||
| 278 | if ( my $finish = $bracket{$content} ) { | ||||
| 279 | # Open bracket | ||||
| 280 | push @rslt, [ $finish, $token ]; | ||||
| 281 | |||||
| 282 | } elsif ( $content eq $rslt[-1][0] ) { | ||||
| 283 | |||||
| 284 | # Matched close bracket | ||||
| 285 | $self->_make_node( $token ); | ||||
| 286 | |||||
| 287 | } elsif ( $content ne ')' ) { | ||||
| 288 | |||||
| 289 | # If the close bracket is not a parenthesis, it becomes | ||||
| 290 | # a literal. | ||||
| 291 | bless $token, TOKEN_LITERAL; | ||||
| 292 | push @{ $rslt[-1] }, $token; | ||||
| 293 | |||||
| 294 | } elsif ( $content eq ')' | ||||
| 295 | and @rslt > 1 # Ignore enclosing delimiter | ||||
| 296 | and my $recover = $unclosed{$rslt[-1][1]->content()} ) { | ||||
| 297 | # If the close bracket is a parenthesis and there is a | ||||
| 298 | # recovery procedure, we use it. | ||||
| 299 | $self->$recover( $token ); | ||||
| 300 | |||||
| 301 | } else { | ||||
| 302 | |||||
| 303 | # Unmatched close with no recovery. | ||||
| 304 | $self->{failures}++; | ||||
| 305 | bless $token, 'PPIx::Regexp::Token::Unmatched'; | ||||
| 306 | push @{ $rslt[-1] }, $token; | ||||
| 307 | } | ||||
| 308 | |||||
| 309 | } else { | ||||
| 310 | push @{ $rslt[-1] }, $token; | ||||
| 311 | } | ||||
| 312 | |||||
| 313 | # We have to hand-roll the Range object. | ||||
| 314 | if ( __instance( $rslt[-1][-2], 'PPIx::Regexp::Token::Operator' ) | ||||
| 315 | && $rslt[-1][-2]->content() eq '-' | ||||
| 316 | && $rslt[-1][0] eq ']' # It's a character class | ||||
| 317 | ) { | ||||
| 318 | my @tokens = splice @{ $rslt[-1] }, -3; | ||||
| 319 | push @{ $rslt[-1] }, | ||||
| 320 | PPIx::Regexp::Node::Range->_new( @tokens ); | ||||
| 321 | } | ||||
| 322 | } | ||||
| 323 | |||||
| 324 | while ( @rslt > 1 ) { | ||||
| 325 | if ( my $recover = $unclosed{$rslt[-1][1]->content()} ) { | ||||
| 326 | $self->$recover(); | ||||
| 327 | } else { | ||||
| 328 | $self->{failures}++; | ||||
| 329 | $self->_make_node( undef ); | ||||
| 330 | } | ||||
| 331 | } | ||||
| 332 | |||||
| 333 | if ( @rslt == 1 ) { | ||||
| 334 | my @last = @{ pop @rslt }; | ||||
| 335 | shift @last; | ||||
| 336 | push @last, $self->_get_token(); | ||||
| 337 | return $class->_new( @last ); | ||||
| 338 | } else { | ||||
| 339 | confess "Missing data"; | ||||
| 340 | } | ||||
| 341 | |||||
| 342 | } | ||||
| 343 | |||||
| 344 | } | ||||
| 345 | |||||
| 346 | # $token = $self->_get_token(); | ||||
| 347 | # | ||||
| 348 | # This method returns the next token from the tokenizer. | ||||
| 349 | |||||
| 350 | sub _get_token { | ||||
| 351 | my ( $self ) = @_; | ||||
| 352 | |||||
| 353 | if ( @{ $self->{deferred} } ) { | ||||
| 354 | return shift @{ $self->{deferred} }; | ||||
| 355 | } | ||||
| 356 | |||||
| 357 | my $token = $self->{tokenizer}->next_token() or return; | ||||
| 358 | |||||
| 359 | return $token; | ||||
| 360 | } | ||||
| 361 | |||||
| 362 | { | ||||
| 363 | |||||
| 364 | 2 | 1µs | my %handler = ( | ||
| 365 | '(' => '_round', | ||||
| 366 | '[' => '_square', | ||||
| 367 | '{' => '_curly', | ||||
| 368 | '(?[' => '_regex_set', | ||||
| 369 | ); | ||||
| 370 | |||||
| 371 | sub _make_node { | ||||
| 372 | my ( $self, $token ) = @_; | ||||
| 373 | my @args = @{ pop @{ $self->{_rslt} } }; | ||||
| 374 | shift @args; | ||||
| 375 | push @args, $token; | ||||
| 376 | my @node; | ||||
| 377 | if ( my $method = $handler{ $args[0]->content() } ) { | ||||
| 378 | @node = $self->$method( \@args ); | ||||
| 379 | } | ||||
| 380 | @node or @node = PPIx::Regexp::Structure->_new( @args ); | ||||
| 381 | push @{ $self->{_rslt}[-1] }, @node; | ||||
| 382 | return; | ||||
| 383 | } | ||||
| 384 | |||||
| 385 | } | ||||
| 386 | |||||
| 387 | sub _curly { | ||||
| 388 | my ( $self, $args ) = @_; | ||||
| 389 | |||||
| 390 | if ( $args->[-1] && $args->[-1]->is_quantifier() ) { | ||||
| 391 | |||||
| 392 | # If the tokenizer has marked the right curly as a quantifier, | ||||
| 393 | # make the whole thing a quantifier structure. | ||||
| 394 | return PPIx::Regexp::Structure::Quantifier->_new( @{ $args } ); | ||||
| 395 | |||||
| 396 | } elsif ( $args->[-1] ) { | ||||
| 397 | |||||
| 398 | # If there is a right curly but it is not a quantifier, | ||||
| 399 | # make both curlys into literals. | ||||
| 400 | foreach my $inx ( 0, -1 ) { | ||||
| 401 | bless $args->[$inx], TOKEN_LITERAL; | ||||
| 402 | } | ||||
| 403 | |||||
| 404 | # Try to recover possible quantifiers not recognized because we | ||||
| 405 | # thought this was a structure. | ||||
| 406 | $self->_recover_curly_quantifiers( $args ); | ||||
| 407 | |||||
| 408 | return @{ $args }; | ||||
| 409 | |||||
| 410 | } else { | ||||
| 411 | |||||
| 412 | # If there is no right curly, just make a generic structure | ||||
| 413 | # TODO maybe this should be something else? | ||||
| 414 | return PPIx::Regexp::Structure->_new( @{ $args } ); | ||||
| 415 | } | ||||
| 416 | } | ||||
| 417 | |||||
| 418 | # Recover from an unclosed left curly. | ||||
| 419 | sub _recover_curly { | ||||
| 420 | my ( $self, $token ) = @_; | ||||
| 421 | |||||
| 422 | # Get all the stuff we have accumulated for this curly. | ||||
| 423 | my @content = @{ pop @{ $self->{_rslt} } }; | ||||
| 424 | |||||
| 425 | # Lose the right bracket, which we have already failed to match. | ||||
| 426 | shift @content; | ||||
| 427 | |||||
| 428 | # Rebless the left curly to a literal. | ||||
| 429 | bless $content[0], TOKEN_LITERAL; | ||||
| 430 | |||||
| 431 | # Try to recover possible quantifiers not recognized because we | ||||
| 432 | # thought this was a structure. | ||||
| 433 | $self->_recover_curly_quantifiers( \@content ); | ||||
| 434 | |||||
| 435 | # Shove the curly and its putative contents into whatever structure | ||||
| 436 | # we have going. | ||||
| 437 | # The checks are to try to trap things like RT 56864, though on | ||||
| 438 | # further reflection it turned out that you could get here with an | ||||
| 439 | # empty $self->{_rslt} on things like 'm{)}'. This one did not get | ||||
| 440 | # made into an RT ticket, but was fixed by not calling the recovery | ||||
| 441 | # code if $self->{_rslt} contained only the enclosing delimiters. | ||||
| 442 | 'ARRAY' eq ref $self->{_rslt} | ||||
| 443 | or confess 'Programming error - $self->{_rslt} not array ref, ', | ||||
| 444 | "parsing '", $self->{tokenizer}->content(), "' at ", | ||||
| 445 | $token->content(); | ||||
| 446 | @{ $self->{_rslt} } | ||||
| 447 | or confess 'Programming error - $self->{_rslt} empty, ', | ||||
| 448 | "parsing '", $self->{tokenizer}->content(), "' at ", | ||||
| 449 | $token->content(); | ||||
| 450 | push @{ $self->{_rslt}[-1] }, @content; | ||||
| 451 | |||||
| 452 | # Shove the mismatched delimiter back into the input so we can have | ||||
| 453 | # another crack at it. | ||||
| 454 | $token and $self->_unget_token( $token ); | ||||
| 455 | |||||
| 456 | # We gone. | ||||
| 457 | return; | ||||
| 458 | } | ||||
| 459 | |||||
| 460 | sub _recover_curly_quantifiers { | ||||
| 461 | my ( $self, $args ) = @_; | ||||
| 462 | |||||
| 463 | if ( __instance( $args->[0], TOKEN_LITERAL ) | ||||
| 464 | && __instance( $args->[1], TOKEN_UNKNOWN ) | ||||
| 465 | && PPIx::Regexp::Token::Quantifier->could_be_quantifier( | ||||
| 466 | $args->[1]->content() ) | ||||
| 467 | ) { | ||||
| 468 | bless $args->[1], 'PPIx::Regexp::Token::Quantifier'; | ||||
| 469 | |||||
| 470 | if ( __instance( $args->[2], TOKEN_UNKNOWN ) | ||||
| 471 | && PPIx::Regexp::Token::Greediness->could_be_greediness( | ||||
| 472 | $args->[2]->content() ) | ||||
| 473 | ) { | ||||
| 474 | bless $args->[2], 'PPIx::Regexp::Token::Greediness'; | ||||
| 475 | } | ||||
| 476 | |||||
| 477 | } | ||||
| 478 | |||||
| 479 | return; | ||||
| 480 | } | ||||
| 481 | |||||
| 482 | sub _in_regex_set { | ||||
| 483 | my ( $self ) = @_; | ||||
| 484 | foreach my $stack_entry ( reverse @{ $self->{_rslt} } ) { | ||||
| 485 | $stack_entry->[0] eq '])' | ||||
| 486 | and return 1; | ||||
| 487 | } | ||||
| 488 | return 0; | ||||
| 489 | } | ||||
| 490 | |||||
| 491 | sub _round { | ||||
| 492 | my ( $self, $args ) = @_; | ||||
| 493 | |||||
| 494 | # If we're inside a regex set, parens do not capture. | ||||
| 495 | $self->_in_regex_set() | ||||
| 496 | and return PPIx::Regexp::Structure->_new( @{ $args } ); | ||||
| 497 | |||||
| 498 | # The instantiator will rebless based on the first token if need be. | ||||
| 499 | return PPIx::Regexp::Structure::Capture->_new( @{ $args } ); | ||||
| 500 | } | ||||
| 501 | |||||
| 502 | sub _square { | ||||
| 503 | my ( $self, $args ) = @_; | ||||
| 504 | return PPIx::Regexp::Structure::CharClass->_new( @{ $args } ); | ||||
| 505 | } | ||||
| 506 | |||||
| 507 | sub _regex_set { | ||||
| 508 | my ( $self, $args ) = @_; | ||||
| 509 | return PPIx::Regexp::Structure::RegexSet->_new( @{ $args } ); | ||||
| 510 | } | ||||
| 511 | |||||
| 512 | # $self->_unget_token( $token ); | ||||
| 513 | # | ||||
| 514 | # This method caches its argument so that it will be returned by | ||||
| 515 | # the next call to C<_get_token()>. If more than one argument is | ||||
| 516 | # passed, they will be returned in the order given; that is, | ||||
| 517 | # _unget_token/_get_token work like unshift/shift. | ||||
| 518 | |||||
| 519 | sub _unget_token { | ||||
| 520 | my ( $self, @args ) = @_; | ||||
| 521 | unshift @{ $self->{deferred} }, @args; | ||||
| 522 | return $self; | ||||
| 523 | } | ||||
| 524 | |||||
| 525 | 1 | 6µs | 1; | ||
| 526 | |||||
| 527 | __END__ |