Filename | /Users/timbo/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/PPIx/Regexp/Lexer.pm |
Statements | Executed 59 statements in 3.10ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 3.13ms | 24.1ms | BEGIN@61 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 1.02ms | 1.21ms | BEGIN@44 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 281µs | 384µs | BEGIN@58 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 257µs | 350µs | BEGIN@55 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 250µs | 372µs | BEGIN@49 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 246µs | 372µs | BEGIN@46 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 227µs | 318µs | BEGIN@51 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 193µs | 286µs | BEGIN@48 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 192µs | 299µs | BEGIN@47 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 184µs | 317µs | BEGIN@53 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 178µs | 270µs | BEGIN@52 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 162µs | 259µs | BEGIN@54 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 152µs | 632µs | BEGIN@60 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 138µs | 236µs | BEGIN@59 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 133µs | 230µs | BEGIN@50 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 131µs | 230µs | BEGIN@45 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 130µs | 228µs | BEGIN@57 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 129µs | 221µs | BEGIN@43 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 114µs | 204µs | BEGIN@56 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 12µs | 24µs | BEGIN@36 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 8µs | 30µs | BEGIN@62 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 8µs | 12µs | BEGIN@37 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 7µs | 29µs | BEGIN@41 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 7µs | 27µs | BEGIN@42 | PPIx::Regexp::Lexer::
1 | 1 | 1 | 7µs | 468µs | BEGIN@39 | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _curly | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _finalize | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _get_delimited | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _get_token | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _in_regex_set | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _make_node | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _recover_curly | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _recover_curly_quantifiers | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _regex_set | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _round | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _square | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | _unget_token | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | errstr | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | failures | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | lex | PPIx::Regexp::Lexer::
0 | 0 | 0 | 0s | 0s | new | PPIx::Regexp::Lexer::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | =head1 NAME | ||||
2 | |||||
3 | PPIx::Regexp::Lexer - Assemble tokenizer output. | ||||
4 | |||||
5 | =head1 SYNOPSIS | ||||
6 | |||||
7 | use PPIx::Regexp::Lexer; | ||||
8 | use PPIx::Regexp::Dumper; | ||||
9 | my $lex = PPIx::Regexp::Lexer->new('qr{foo}smx'); | ||||
10 | my $dmp = PPIx::Regexp::Dumper->new( $lex ); | ||||
11 | $dmp->print(); | ||||
12 | |||||
13 | =head1 INHERITANCE | ||||
14 | |||||
15 | C<PPIx::Regexp::Lexer> is a | ||||
16 | L<PPIx::Regexp::Support|PPIx::Regexp::Support>. | ||||
17 | |||||
18 | C<PPIx::Regexp::Lexer> has no descendants. | ||||
19 | |||||
20 | =head1 DESCRIPTION | ||||
21 | |||||
22 | This class takes the token stream generated by | ||||
23 | L<PPIx::Regexp::Tokenizer|PPIx::Regexp::Tokenizer> and generates the | ||||
24 | parse tree. | ||||
25 | |||||
26 | =head1 METHODS | ||||
27 | |||||
28 | This class provides the following public methods. Methods not documented | ||||
29 | here are private, and unsupported in the sense that the author reserves | ||||
30 | the right to change or remove them without notice. | ||||
31 | |||||
32 | =cut | ||||
33 | |||||
34 | package PPIx::Regexp::Lexer; | ||||
35 | |||||
36 | 2 | 21µs | 2 | 36µs | # spent 24µs (12+12) within PPIx::Regexp::Lexer::BEGIN@36 which was called:
# once (12µs+12µs) by PPIx::Regexp::BEGIN@90 at line 36 # spent 24µs making 1 call to PPIx::Regexp::Lexer::BEGIN@36
# spent 12µs making 1 call to strict::import |
37 | 2 | 24µs | 2 | 17µs | # spent 12µs (8+5) within PPIx::Regexp::Lexer::BEGIN@37 which was called:
# once (8µs+5µs) by PPIx::Regexp::BEGIN@90 at line 37 # spent 12µs making 1 call to PPIx::Regexp::Lexer::BEGIN@37
# spent 5µs making 1 call to warnings::import |
38 | |||||
39 | 2 | 23µs | 2 | 929µs | # spent 468µs (7+461) within PPIx::Regexp::Lexer::BEGIN@39 which was called:
# once (7µs+461µs) by PPIx::Regexp::BEGIN@90 at line 39 # spent 468µs making 1 call to PPIx::Regexp::Lexer::BEGIN@39
# spent 461µs making 1 call to base::import |
40 | |||||
41 | 2 | 21µs | 2 | 50µs | # spent 29µs (7+21) within PPIx::Regexp::Lexer::BEGIN@41 which was called:
# once (7µs+21µs) by PPIx::Regexp::BEGIN@90 at line 41 # spent 29µs making 1 call to PPIx::Regexp::Lexer::BEGIN@41
# spent 21µs making 1 call to Exporter::import |
42 | 2 | 18µs | 2 | 48µs | # spent 27µs (7+20) within PPIx::Regexp::Lexer::BEGIN@42 which was called:
# once (7µs+20µs) by PPIx::Regexp::BEGIN@90 at line 42 # spent 27µs making 1 call to PPIx::Regexp::Lexer::BEGIN@42
# spent 20µs making 1 call to Exporter::import |
43 | 2 | 90µs | 1 | 221µs | # spent 221µs (129+92) within PPIx::Regexp::Lexer::BEGIN@43 which was called:
# once (129µs+92µs) by PPIx::Regexp::BEGIN@90 at line 43 # spent 221µs making 1 call to PPIx::Regexp::Lexer::BEGIN@43 |
44 | 2 | 97µs | 1 | 1.21ms | # spent 1.21ms (1.02+195µs) within PPIx::Regexp::Lexer::BEGIN@44 which was called:
# once (1.02ms+195µs) by PPIx::Regexp::BEGIN@90 at line 44 # spent 1.21ms making 1 call to PPIx::Regexp::Lexer::BEGIN@44 |
45 | 2 | 91µs | 1 | 230µs | # spent 230µs (131+99) within PPIx::Regexp::Lexer::BEGIN@45 which was called:
# once (131µs+99µs) by PPIx::Regexp::BEGIN@90 at line 45 # spent 230µs making 1 call to PPIx::Regexp::Lexer::BEGIN@45 |
46 | 2 | 83µs | 1 | 372µs | # spent 372µs (246+125) within PPIx::Regexp::Lexer::BEGIN@46 which was called:
# once (246µs+125µs) by PPIx::Regexp::BEGIN@90 at line 46 # spent 372µs making 1 call to PPIx::Regexp::Lexer::BEGIN@46 |
47 | 2 | 92µs | 1 | 299µs | # spent 299µs (192+107) within PPIx::Regexp::Lexer::BEGIN@47 which was called:
# once (192µs+107µs) by PPIx::Regexp::BEGIN@90 at line 47 # spent 299µs making 1 call to PPIx::Regexp::Lexer::BEGIN@47 |
48 | 2 | 89µs | 1 | 286µs | # spent 286µs (193+93) within PPIx::Regexp::Lexer::BEGIN@48 which was called:
# once (193µs+93µs) by PPIx::Regexp::BEGIN@90 at line 48 # spent 286µs making 1 call to PPIx::Regexp::Lexer::BEGIN@48 |
49 | 2 | 91µs | 1 | 372µs | # spent 372µs (250+123) within PPIx::Regexp::Lexer::BEGIN@49 which was called:
# once (250µs+123µs) by PPIx::Regexp::BEGIN@90 at line 49 # spent 372µs making 1 call to PPIx::Regexp::Lexer::BEGIN@49 |
50 | 2 | 92µs | 1 | 230µs | # spent 230µs (133+97) within PPIx::Regexp::Lexer::BEGIN@50 which was called:
# once (133µs+97µs) by PPIx::Regexp::BEGIN@90 at line 50 # spent 230µs making 1 call to PPIx::Regexp::Lexer::BEGIN@50 |
51 | 2 | 84µs | 1 | 318µs | # spent 318µs (227+91) within PPIx::Regexp::Lexer::BEGIN@51 which was called:
# once (227µs+91µs) by PPIx::Regexp::BEGIN@90 at line 51 # spent 318µs making 1 call to PPIx::Regexp::Lexer::BEGIN@51 |
52 | 2 | 85µs | 1 | 270µs | # spent 270µs (178+91) within PPIx::Regexp::Lexer::BEGIN@52 which was called:
# once (178µs+91µs) by PPIx::Regexp::BEGIN@90 at line 52 # spent 270µs making 1 call to PPIx::Regexp::Lexer::BEGIN@52 |
53 | 2 | 92µs | 1 | 317µs | # spent 317µs (184+133) within PPIx::Regexp::Lexer::BEGIN@53 which was called:
# once (184µs+133µs) by PPIx::Regexp::BEGIN@90 at line 53 # spent 317µs making 1 call to PPIx::Regexp::Lexer::BEGIN@53 |
54 | 2 | 82µs | 1 | 259µs | # spent 259µs (162+97) within PPIx::Regexp::Lexer::BEGIN@54 which was called:
# once (162µs+97µs) by PPIx::Regexp::BEGIN@90 at line 54 # spent 259µs making 1 call to PPIx::Regexp::Lexer::BEGIN@54 |
55 | 2 | 83µs | 1 | 350µs | # spent 350µs (257+93) within PPIx::Regexp::Lexer::BEGIN@55 which was called:
# once (257µs+93µs) by PPIx::Regexp::BEGIN@90 at line 55 # spent 350µs making 1 call to PPIx::Regexp::Lexer::BEGIN@55 |
56 | 2 | 74µs | 1 | 204µs | # spent 204µs (114+91) within PPIx::Regexp::Lexer::BEGIN@56 which was called:
# once (114µs+91µs) by PPIx::Regexp::BEGIN@90 at line 56 # spent 204µs making 1 call to PPIx::Regexp::Lexer::BEGIN@56 |
57 | 2 | 85µs | 1 | 228µs | # spent 228µs (130+97) within PPIx::Regexp::Lexer::BEGIN@57 which was called:
# once (130µs+97µs) by PPIx::Regexp::BEGIN@90 at line 57 # spent 228µs making 1 call to PPIx::Regexp::Lexer::BEGIN@57 |
58 | 2 | 86µs | 1 | 384µs | # spent 384µs (281+103) within PPIx::Regexp::Lexer::BEGIN@58 which was called:
# once (281µs+103µs) by PPIx::Regexp::BEGIN@90 at line 58 # spent 384µs making 1 call to PPIx::Regexp::Lexer::BEGIN@58 |
59 | 2 | 97µs | 1 | 236µs | # spent 236µs (138+98) within PPIx::Regexp::Lexer::BEGIN@59 which was called:
# once (138µs+98µs) by PPIx::Regexp::BEGIN@90 at line 59 # spent 236µs making 1 call to PPIx::Regexp::Lexer::BEGIN@59 |
60 | 2 | 105µs | 1 | 632µs | # spent 632µs (152+480) within PPIx::Regexp::Lexer::BEGIN@60 which was called:
# once (152µs+480µs) by PPIx::Regexp::BEGIN@90 at line 60 # spent 632µs making 1 call to PPIx::Regexp::Lexer::BEGIN@60 |
61 | 2 | 82µs | 1 | 24.1ms | # spent 24.1ms (3.13+20.9) within PPIx::Regexp::Lexer::BEGIN@61 which was called:
# once (3.13ms+20.9ms) by PPIx::Regexp::BEGIN@90 at line 61 # spent 24.1ms making 1 call to PPIx::Regexp::Lexer::BEGIN@61 |
62 | 2 | 1.30ms | 2 | 52µs | # spent 30µs (8+22) within PPIx::Regexp::Lexer::BEGIN@62 which was called:
# once (8µs+22µs) by PPIx::Regexp::BEGIN@90 at line 62 # spent 30µs making 1 call to PPIx::Regexp::Lexer::BEGIN@62
# spent 22µs making 1 call to Exporter::import |
63 | |||||
64 | 1 | 700ns | our $VERSION = '0.036'; | ||
65 | |||||
66 | =head2 new | ||||
67 | |||||
68 | This method instantiates the lexer. It takes as its argument either a | ||||
69 | L<PPIx::Regexp::Tokenizer|PPIx::Regexp::Tokenizer> or the text to be | ||||
70 | parsed. In the latter case the tokenizer is instantiated from the text. | ||||
71 | |||||
72 | Any optional name/value pairs after the first argument are passed to the | ||||
73 | tokenizer, which interprets them or not as the case may be. | ||||
74 | |||||
75 | =cut | ||||
76 | |||||
77 | { | ||||
78 | |||||
79 | 2 | 700ns | my $errstr; | ||
80 | |||||
81 | sub new { | ||||
82 | my ( $class, $tokenizer, %args ) = @_; | ||||
83 | ref $class and $class = ref $class; | ||||
84 | |||||
85 | __instance( $tokenizer, 'PPIx::Regexp::Tokenizer' ) | ||||
86 | or $tokenizer = PPIx::Regexp::Tokenizer->new( $tokenizer, %args ) | ||||
87 | or do { | ||||
88 | $errstr = PPIx::Regexp::Tokenizer->errstr(); | ||||
89 | return; | ||||
90 | }; | ||||
91 | |||||
92 | my $self = { | ||||
93 | deferred => [], # Deferred tokens | ||||
94 | failures => 0, | ||||
95 | tokenizer => $tokenizer, | ||||
96 | }; | ||||
97 | |||||
98 | bless $self, $class; | ||||
99 | return $self; | ||||
100 | } | ||||
101 | |||||
102 | sub errstr { | ||||
103 | return $errstr; | ||||
104 | } | ||||
105 | |||||
106 | } | ||||
107 | |||||
108 | =head2 errstr | ||||
109 | |||||
110 | This method returns the error string from the last attempt to | ||||
111 | instantiate a C<PPIx::Regexp::Lexer>. If the last attempt succeeded, the | ||||
112 | error will be C<undef>. | ||||
113 | |||||
114 | =cut | ||||
115 | |||||
116 | # Defined above | ||||
117 | |||||
118 | =head2 failures | ||||
119 | |||||
120 | print $lexer->failures(), " parse failures\n"; | ||||
121 | |||||
122 | This method returns the number of parse failures encountered. A | ||||
123 | parse failure is either a tokenization failure (see | ||||
124 | L<< PPIx::Regexp::Tokenizer->failures()|PPIx::Regexp::Tokenizer/failures >>) | ||||
125 | or a structural error. | ||||
126 | |||||
127 | =cut | ||||
128 | |||||
129 | sub failures { | ||||
130 | my ( $self ) = @_; | ||||
131 | return $self->{failures}; | ||||
132 | } | ||||
133 | |||||
134 | =head2 lex | ||||
135 | |||||
136 | This method lexes the tokens in the text, and returns the lexed list of | ||||
137 | elements. | ||||
138 | |||||
139 | =cut | ||||
140 | |||||
141 | sub lex { | ||||
142 | my ( $self ) = @_; | ||||
143 | |||||
144 | my @content; | ||||
145 | $self->{failures} = 0; | ||||
146 | |||||
147 | # Accept everything up to the first delimiter. | ||||
148 | { | ||||
149 | my $token = $self->_get_token() | ||||
150 | or return $self->_finalize( @content ); | ||||
151 | $token->isa( 'PPIx::Regexp::Token::Delimiter' ) or do { | ||||
152 | push @content, $token; | ||||
153 | redo; | ||||
154 | }; | ||||
155 | $self->_unget_token( $token ); | ||||
156 | } | ||||
157 | |||||
158 | # Accept the first delimited structure. | ||||
159 | push @content, ( my $regexp = $self->_get_delimited( | ||||
160 | 'PPIx::Regexp::Structure::Regexp' ) ); | ||||
161 | |||||
162 | # If we are a substitution ... | ||||
163 | if ( $content[0]->content() eq 's' ) { | ||||
164 | |||||
165 | # Accept any insignificant stuff. | ||||
166 | while ( my $token = $self->_get_token() ) { | ||||
167 | if ( $token->significant() ) { | ||||
168 | $self->_unget_token( $token ); | ||||
169 | last; | ||||
170 | } else { | ||||
171 | push @content, $token; | ||||
172 | } | ||||
173 | } | ||||
174 | |||||
175 | # Figure out if we should expect an opening bracket. | ||||
176 | my $expect_open_bracket = $self->close_bracket( | ||||
177 | $regexp->start( 0 ) ) || 0; | ||||
178 | |||||
179 | # Accept the next delimited structure. | ||||
180 | push @content, $self->_get_delimited( | ||||
181 | 'PPIx::Regexp::Structure::Replacement', | ||||
182 | $expect_open_bracket, | ||||
183 | ); | ||||
184 | } | ||||
185 | |||||
186 | # Accept the modifiers (we hope!) plus any trailing white space. | ||||
187 | while ( my $token = $self->_get_token() ) { | ||||
188 | push @content, $token; | ||||
189 | } | ||||
190 | |||||
191 | # Let all the elements finalize themselves, recording any additional | ||||
192 | # errors as they do so. | ||||
193 | $self->_finalize( @content ); | ||||
194 | |||||
195 | # If we found a regular expression (and we should have done so) ... | ||||
196 | if ( $regexp ) { | ||||
197 | |||||
198 | # Retrieve the maximum capture group. | ||||
199 | my $max_capture = $regexp->max_capture_number(); | ||||
200 | |||||
201 | # Hashify the known capture names | ||||
202 | my $capture_name = { | ||||
203 | map { $_ => 1 } $regexp->capture_names(), | ||||
204 | }; | ||||
205 | |||||
206 | # For all the backreferences found | ||||
207 | foreach my $elem ( @{ $regexp->find( | ||||
208 | 'PPIx::Regexp::Token::Backreference' ) || [] } ) { | ||||
209 | # Rebless them as needed, recording any errors found. | ||||
210 | $self->{failures} += | ||||
211 | $elem->__PPIX_LEXER__rebless( | ||||
212 | capture_name => $capture_name, | ||||
213 | max_capture => $max_capture, | ||||
214 | ); | ||||
215 | } | ||||
216 | } | ||||
217 | |||||
218 | return @content; | ||||
219 | |||||
220 | } | ||||
221 | |||||
222 | # Finalize the content array, updating the parse failures count as we | ||||
223 | # go. | ||||
224 | sub _finalize { | ||||
225 | my ( $self, @content ) = @_; | ||||
226 | foreach my $elem ( @content ) { | ||||
227 | $self->{failures} += $elem->__PPIX_LEXER__finalize(); | ||||
228 | } | ||||
229 | defined wantarray and return @content; | ||||
230 | return; | ||||
231 | } | ||||
232 | |||||
233 | { | ||||
234 | |||||
235 | 2 | 2µs | my %bracket = ( | ||
236 | '{' => '}', | ||||
237 | '(' => ')', | ||||
238 | '[' => ']', | ||||
239 | '(?[' => '])', | ||||
240 | ## '<' => '>', | ||||
241 | ); | ||||
242 | |||||
243 | 1 | 1µs | my %unclosed = ( | ||
244 | '{' => '_recover_curly', | ||||
245 | ); | ||||
246 | |||||
247 | sub _get_delimited { | ||||
248 | my ( $self, $class, $expect_open_bracket ) = @_; | ||||
249 | defined $expect_open_bracket or $expect_open_bracket = 1; | ||||
250 | |||||
251 | my @rslt; | ||||
252 | $self->{_rslt} = \@rslt; | ||||
253 | |||||
254 | if ( $expect_open_bracket ) { | ||||
255 | if ( my $token = $self->_get_token() ) { | ||||
256 | push @rslt, []; | ||||
257 | if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) { | ||||
258 | push @{ $rslt[-1] }, '', $token; | ||||
259 | } else { | ||||
260 | push @{ $rslt[-1] }, '', undef; | ||||
261 | $self->_unget_token( $token ); | ||||
262 | } | ||||
263 | } else { | ||||
264 | return; | ||||
265 | } | ||||
266 | } else { | ||||
267 | push @rslt, [ '', undef ]; | ||||
268 | } | ||||
269 | |||||
270 | while ( my $token = $self->_get_token() ) { | ||||
271 | if ( $token->isa( 'PPIx::Regexp::Token::Delimiter' ) ) { | ||||
272 | $self->_unget_token( $token ); | ||||
273 | last; | ||||
274 | } | ||||
275 | if ( $token->isa( 'PPIx::Regexp::Token::Structure' ) ) { | ||||
276 | my $content = $token->content(); | ||||
277 | |||||
278 | if ( my $finish = $bracket{$content} ) { | ||||
279 | # Open bracket | ||||
280 | push @rslt, [ $finish, $token ]; | ||||
281 | |||||
282 | } elsif ( $content eq $rslt[-1][0] ) { | ||||
283 | |||||
284 | # Matched close bracket | ||||
285 | $self->_make_node( $token ); | ||||
286 | |||||
287 | } elsif ( $content ne ')' ) { | ||||
288 | |||||
289 | # If the close bracket is not a parenthesis, it becomes | ||||
290 | # a literal. | ||||
291 | bless $token, TOKEN_LITERAL; | ||||
292 | push @{ $rslt[-1] }, $token; | ||||
293 | |||||
294 | } elsif ( $content eq ')' | ||||
295 | and @rslt > 1 # Ignore enclosing delimiter | ||||
296 | and my $recover = $unclosed{$rslt[-1][1]->content()} ) { | ||||
297 | # If the close bracket is a parenthesis and there is a | ||||
298 | # recovery procedure, we use it. | ||||
299 | $self->$recover( $token ); | ||||
300 | |||||
301 | } else { | ||||
302 | |||||
303 | # Unmatched close with no recovery. | ||||
304 | $self->{failures}++; | ||||
305 | bless $token, 'PPIx::Regexp::Token::Unmatched'; | ||||
306 | push @{ $rslt[-1] }, $token; | ||||
307 | } | ||||
308 | |||||
309 | } else { | ||||
310 | push @{ $rslt[-1] }, $token; | ||||
311 | } | ||||
312 | |||||
313 | # We have to hand-roll the Range object. | ||||
314 | if ( __instance( $rslt[-1][-2], 'PPIx::Regexp::Token::Operator' ) | ||||
315 | && $rslt[-1][-2]->content() eq '-' | ||||
316 | && $rslt[-1][0] eq ']' # It's a character class | ||||
317 | ) { | ||||
318 | my @tokens = splice @{ $rslt[-1] }, -3; | ||||
319 | push @{ $rslt[-1] }, | ||||
320 | PPIx::Regexp::Node::Range->_new( @tokens ); | ||||
321 | } | ||||
322 | } | ||||
323 | |||||
324 | while ( @rslt > 1 ) { | ||||
325 | if ( my $recover = $unclosed{$rslt[-1][1]->content()} ) { | ||||
326 | $self->$recover(); | ||||
327 | } else { | ||||
328 | $self->{failures}++; | ||||
329 | $self->_make_node( undef ); | ||||
330 | } | ||||
331 | } | ||||
332 | |||||
333 | if ( @rslt == 1 ) { | ||||
334 | my @last = @{ pop @rslt }; | ||||
335 | shift @last; | ||||
336 | push @last, $self->_get_token(); | ||||
337 | return $class->_new( @last ); | ||||
338 | } else { | ||||
339 | confess "Missing data"; | ||||
340 | } | ||||
341 | |||||
342 | } | ||||
343 | |||||
344 | } | ||||
345 | |||||
346 | # $token = $self->_get_token(); | ||||
347 | # | ||||
348 | # This method returns the next token from the tokenizer. | ||||
349 | |||||
350 | sub _get_token { | ||||
351 | my ( $self ) = @_; | ||||
352 | |||||
353 | if ( @{ $self->{deferred} } ) { | ||||
354 | return shift @{ $self->{deferred} }; | ||||
355 | } | ||||
356 | |||||
357 | my $token = $self->{tokenizer}->next_token() or return; | ||||
358 | |||||
359 | return $token; | ||||
360 | } | ||||
361 | |||||
362 | { | ||||
363 | |||||
364 | 2 | 1µs | my %handler = ( | ||
365 | '(' => '_round', | ||||
366 | '[' => '_square', | ||||
367 | '{' => '_curly', | ||||
368 | '(?[' => '_regex_set', | ||||
369 | ); | ||||
370 | |||||
371 | sub _make_node { | ||||
372 | my ( $self, $token ) = @_; | ||||
373 | my @args = @{ pop @{ $self->{_rslt} } }; | ||||
374 | shift @args; | ||||
375 | push @args, $token; | ||||
376 | my @node; | ||||
377 | if ( my $method = $handler{ $args[0]->content() } ) { | ||||
378 | @node = $self->$method( \@args ); | ||||
379 | } | ||||
380 | @node or @node = PPIx::Regexp::Structure->_new( @args ); | ||||
381 | push @{ $self->{_rslt}[-1] }, @node; | ||||
382 | return; | ||||
383 | } | ||||
384 | |||||
385 | } | ||||
386 | |||||
387 | sub _curly { | ||||
388 | my ( $self, $args ) = @_; | ||||
389 | |||||
390 | if ( $args->[-1] && $args->[-1]->is_quantifier() ) { | ||||
391 | |||||
392 | # If the tokenizer has marked the right curly as a quantifier, | ||||
393 | # make the whole thing a quantifier structure. | ||||
394 | return PPIx::Regexp::Structure::Quantifier->_new( @{ $args } ); | ||||
395 | |||||
396 | } elsif ( $args->[-1] ) { | ||||
397 | |||||
398 | # If there is a right curly but it is not a quantifier, | ||||
399 | # make both curlys into literals. | ||||
400 | foreach my $inx ( 0, -1 ) { | ||||
401 | bless $args->[$inx], TOKEN_LITERAL; | ||||
402 | } | ||||
403 | |||||
404 | # Try to recover possible quantifiers not recognized because we | ||||
405 | # thought this was a structure. | ||||
406 | $self->_recover_curly_quantifiers( $args ); | ||||
407 | |||||
408 | return @{ $args }; | ||||
409 | |||||
410 | } else { | ||||
411 | |||||
412 | # If there is no right curly, just make a generic structure | ||||
413 | # TODO maybe this should be something else? | ||||
414 | return PPIx::Regexp::Structure->_new( @{ $args } ); | ||||
415 | } | ||||
416 | } | ||||
417 | |||||
418 | # Recover from an unclosed left curly. | ||||
419 | sub _recover_curly { | ||||
420 | my ( $self, $token ) = @_; | ||||
421 | |||||
422 | # Get all the stuff we have accumulated for this curly. | ||||
423 | my @content = @{ pop @{ $self->{_rslt} } }; | ||||
424 | |||||
425 | # Lose the right bracket, which we have already failed to match. | ||||
426 | shift @content; | ||||
427 | |||||
428 | # Rebless the left curly to a literal. | ||||
429 | bless $content[0], TOKEN_LITERAL; | ||||
430 | |||||
431 | # Try to recover possible quantifiers not recognized because we | ||||
432 | # thought this was a structure. | ||||
433 | $self->_recover_curly_quantifiers( \@content ); | ||||
434 | |||||
435 | # Shove the curly and its putative contents into whatever structure | ||||
436 | # we have going. | ||||
437 | # The checks are to try to trap things like RT 56864, though on | ||||
438 | # further reflection it turned out that you could get here with an | ||||
439 | # empty $self->{_rslt} on things like 'm{)}'. This one did not get | ||||
440 | # made into an RT ticket, but was fixed by not calling the recovery | ||||
441 | # code if $self->{_rslt} contained only the enclosing delimiters. | ||||
442 | 'ARRAY' eq ref $self->{_rslt} | ||||
443 | or confess 'Programming error - $self->{_rslt} not array ref, ', | ||||
444 | "parsing '", $self->{tokenizer}->content(), "' at ", | ||||
445 | $token->content(); | ||||
446 | @{ $self->{_rslt} } | ||||
447 | or confess 'Programming error - $self->{_rslt} empty, ', | ||||
448 | "parsing '", $self->{tokenizer}->content(), "' at ", | ||||
449 | $token->content(); | ||||
450 | push @{ $self->{_rslt}[-1] }, @content; | ||||
451 | |||||
452 | # Shove the mismatched delimiter back into the input so we can have | ||||
453 | # another crack at it. | ||||
454 | $token and $self->_unget_token( $token ); | ||||
455 | |||||
456 | # We gone. | ||||
457 | return; | ||||
458 | } | ||||
459 | |||||
460 | sub _recover_curly_quantifiers { | ||||
461 | my ( $self, $args ) = @_; | ||||
462 | |||||
463 | if ( __instance( $args->[0], TOKEN_LITERAL ) | ||||
464 | && __instance( $args->[1], TOKEN_UNKNOWN ) | ||||
465 | && PPIx::Regexp::Token::Quantifier->could_be_quantifier( | ||||
466 | $args->[1]->content() ) | ||||
467 | ) { | ||||
468 | bless $args->[1], 'PPIx::Regexp::Token::Quantifier'; | ||||
469 | |||||
470 | if ( __instance( $args->[2], TOKEN_UNKNOWN ) | ||||
471 | && PPIx::Regexp::Token::Greediness->could_be_greediness( | ||||
472 | $args->[2]->content() ) | ||||
473 | ) { | ||||
474 | bless $args->[2], 'PPIx::Regexp::Token::Greediness'; | ||||
475 | } | ||||
476 | |||||
477 | } | ||||
478 | |||||
479 | return; | ||||
480 | } | ||||
481 | |||||
482 | sub _in_regex_set { | ||||
483 | my ( $self ) = @_; | ||||
484 | foreach my $stack_entry ( reverse @{ $self->{_rslt} } ) { | ||||
485 | $stack_entry->[0] eq '])' | ||||
486 | and return 1; | ||||
487 | } | ||||
488 | return 0; | ||||
489 | } | ||||
490 | |||||
491 | sub _round { | ||||
492 | my ( $self, $args ) = @_; | ||||
493 | |||||
494 | # If we're inside a regex set, parens do not capture. | ||||
495 | $self->_in_regex_set() | ||||
496 | and return PPIx::Regexp::Structure->_new( @{ $args } ); | ||||
497 | |||||
498 | # The instantiator will rebless based on the first token if need be. | ||||
499 | return PPIx::Regexp::Structure::Capture->_new( @{ $args } ); | ||||
500 | } | ||||
501 | |||||
502 | sub _square { | ||||
503 | my ( $self, $args ) = @_; | ||||
504 | return PPIx::Regexp::Structure::CharClass->_new( @{ $args } ); | ||||
505 | } | ||||
506 | |||||
507 | sub _regex_set { | ||||
508 | my ( $self, $args ) = @_; | ||||
509 | return PPIx::Regexp::Structure::RegexSet->_new( @{ $args } ); | ||||
510 | } | ||||
511 | |||||
512 | # $self->_unget_token( $token ); | ||||
513 | # | ||||
514 | # This method caches its argument so that it will be returned by | ||||
515 | # the next call to C<_get_token()>. If more than one argument is | ||||
516 | # passed, they will be returned in the order given; that is, | ||||
517 | # _unget_token/_get_token work like unshift/shift. | ||||
518 | |||||
519 | sub _unget_token { | ||||
520 | my ( $self, @args ) = @_; | ||||
521 | unshift @{ $self->{deferred} }, @args; | ||||
522 | return $self; | ||||
523 | } | ||||
524 | |||||
525 | 1 | 6µs | 1; | ||
526 | |||||
527 | __END__ |