Filename | /Users/timbo/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/PPIx/Regexp/Token/Structure.pm |
Statements | Executed 31 statements in 1.57ms |
Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 1.60ms | 1.80ms | BEGIN@51 | PPIx::Regexp::Token::Structure::
1 | 1 | 1 | 403µs | 655µs | BEGIN@54 | PPIx::Regexp::Token::Structure::
1 | 1 | 1 | 19µs | 39µs | BEGIN@35 | PPIx::Regexp::Token::Structure::
1 | 1 | 1 | 13µs | 20µs | BEGIN@36 | PPIx::Regexp::Token::Structure::
1 | 1 | 1 | 11µs | 96µs | BEGIN@38 | PPIx::Regexp::Token::Structure::
1 | 1 | 1 | 11µs | 66µs | BEGIN@40 | PPIx::Regexp::Token::Structure::
1 | 1 | 1 | 7µs | 7µs | BEGIN@52 | PPIx::Regexp::Token::Structure::
1 | 1 | 1 | 5µs | 5µs | BEGIN@50 | PPIx::Regexp::Token::Structure::
1 | 1 | 1 | 5µs | 5µs | BEGIN@53 | PPIx::Regexp::Token::Structure::
0 | 0 | 0 | 0s | 0s | __ANON__[:152] | PPIx::Regexp::Token::Structure::
0 | 0 | 0 | 0s | 0s | __ANON__[:220] | PPIx::Regexp::Token::Structure::
0 | 0 | 0 | 0s | 0s | __ANON__[:245] | PPIx::Regexp::Token::Structure::
0 | 0 | 0 | 0s | 0s | __PPIX_LEXER__finalize | PPIx::Regexp::Token::Structure::
0 | 0 | 0 | 0s | 0s | __PPIX_TOKENIZER__regexp | PPIx::Regexp::Token::Structure::
0 | 0 | 0 | 0s | 0s | can_be_quantified | PPIx::Regexp::Token::Structure::
0 | 0 | 0 | 0s | 0s | is_quantifier | PPIx::Regexp::Token::Structure::
0 | 0 | 0 | 0s | 0s | perl_version_introduced | PPIx::Regexp::Token::Structure::
Line | Statements | Time on line | Calls | Time in subs | Code |
---|---|---|---|---|---|
1 | =head1 NAME | ||||
2 | |||||
3 | PPIx::Regexp::Token::Structure - Represent structural elements. | ||||
4 | |||||
5 | =head1 SYNOPSIS | ||||
6 | |||||
7 | use PPIx::Regexp::Dumper; | ||||
8 | PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' ) | ||||
9 | ->print(); | ||||
10 | |||||
11 | =head1 INHERITANCE | ||||
12 | |||||
13 | C<PPIx::Regexp::Token::Structure> is a | ||||
14 | L<PPIx::Regexp::Token|PPIx::Regexp::Token>. | ||||
15 | |||||
16 | C<PPIx::Regexp::Token::Structure> is the parent of | ||||
17 | L<PPIx::Regexp::Token::Delimiter|PPIx::Regexp::Token::Delimiter>. | ||||
18 | |||||
19 | =head1 DESCRIPTION | ||||
20 | |||||
21 | This class represents things that define the structure of the regular | ||||
22 | expression. This typically means brackets of various sorts, but to | ||||
23 | prevent proliferation of token classes the type of the regular | ||||
24 | expression is stored here. | ||||
25 | |||||
26 | =head1 METHODS | ||||
27 | |||||
28 | This class provides no public methods beyond those provided by its | ||||
29 | superclass. | ||||
30 | |||||
31 | =cut | ||||
32 | |||||
33 | package PPIx::Regexp::Token::Structure; | ||||
34 | |||||
35 | 2 | 33µs | 2 | 58µs | # spent 39µs (19+19) within PPIx::Regexp::Token::Structure::BEGIN@35 which was called:
# once (19µs+19µs) by base::import at line 35 # spent 39µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@35
# spent 19µs making 1 call to strict::import |
36 | 2 | 33µs | 2 | 27µs | # spent 20µs (13+7) within PPIx::Regexp::Token::Structure::BEGIN@36 which was called:
# once (13µs+7µs) by base::import at line 36 # spent 20µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@36
# spent 7µs making 1 call to warnings::import |
37 | |||||
38 | 2 | 43µs | 2 | 96µs | # spent 96µs (11+85) within PPIx::Regexp::Token::Structure::BEGIN@38 which was called:
# once (11µs+85µs) by base::import at line 38 # spent 96µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@38
# spent 85µs making 1 call to base::import, recursion: max depth 1, sum of overlapping time 85µs |
39 | |||||
40 | 1 | 400ns | # spent 66µs (11+55) within PPIx::Regexp::Token::Structure::BEGIN@40 which was called:
# once (11µs+55µs) by base::import at line 46 | ||
41 | COOKIE_CLASS | ||||
42 | COOKIE_QUANT | ||||
43 | COOKIE_REGEX_SET | ||||
44 | MINIMUM_PERL | ||||
45 | TOKEN_LITERAL | ||||
46 | 1 | 34µs | 2 | 121µs | }; # spent 66µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@40
# spent 55µs making 1 call to Exporter::import |
47 | |||||
48 | # Tokens we are responsible for making, under at least some | ||||
49 | # circumstances. | ||||
50 | 2 | 26µs | 1 | 5µs | # spent 5µs within PPIx::Regexp::Token::Structure::BEGIN@50 which was called:
# once (5µs+0s) by base::import at line 50 # spent 5µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@50 |
51 | 2 | 161µs | 1 | 1.80ms | # spent 1.80ms (1.60+202µs) within PPIx::Regexp::Token::Structure::BEGIN@51 which was called:
# once (1.60ms+202µs) by base::import at line 51 # spent 1.80ms making 1 call to PPIx::Regexp::Token::Structure::BEGIN@51 |
52 | 2 | 28µs | 1 | 7µs | # spent 7µs within PPIx::Regexp::Token::Structure::BEGIN@52 which was called:
# once (7µs+0s) by base::import at line 52 # spent 7µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@52 |
53 | 2 | 26µs | 1 | 5µs | # spent 5µs within PPIx::Regexp::Token::Structure::BEGIN@53 which was called:
# once (5µs+0s) by base::import at line 53 # spent 5µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@53 |
54 | 2 | 1.14ms | 1 | 655µs | # spent 655µs (403+252) within PPIx::Regexp::Token::Structure::BEGIN@54 which was called:
# once (403µs+252µs) by base::import at line 54 # spent 655µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@54 |
55 | |||||
56 | 1 | 800ns | our $VERSION = '0.036'; | ||
57 | |||||
58 | # Return true if the token can be quantified, and false otherwise | ||||
59 | |||||
60 | 1 | 4µs | my %quant = map { $_ => 1 } ')', ']'; | ||
# Look the token's content up in the file-level %quant table, whose
# keys are the close parenthesis and the close square bracket. Yields a
# true value for those two and undef for any other content. When
# invoked on something that is not a reference (e.g. as a class method)
# it returns nothing.
sub can_be_quantified {
    my $self = shift;
    return unless ref $self;
    my $content = $self->content();
    return $quant{$content};
}
66 | |||||
# Report the token's {is_quantifier} flag. The flag is set by
# __PPIX_TOKENIZER__regexp() on a close curly bracket it has decided
# ends a quantifier, and removed again by __PPIX_LEXER__finalize().
# Returns nothing when invoked on something that is not a reference.
sub is_quantifier {
    my $self = shift;
    return unless ref $self;
    return $self->{is_quantifier};
}
72 | |||||
73 | { | ||||
74 | |||||
75 | # Note that the implementation equivocates on the ::Token::Structure | ||||
76 | # class, using it both for the initial token that determines the | ||||
77 | # type of the regex and things like parentheses internal to the | ||||
78 | # regex. Rather than sort out this equivocation, I have relied on | ||||
79 | # the currently-true assumption that 'qr' will not satisfy the | ||||
80 | # ::Token::Structure recognition logic, and the only way this class | ||||
81 | # can acquire this content is by the brute-force approach used to | ||||
82 | # generate the initial token object. | ||||
83 | |||||
84 | 2 | 3µs | my %perl_version_introduced = ( | ||
85 | qr => '5.005', | ||||
86 | '(?[' => '5.017008', | ||||
87 | ); | ||||
88 | |||||
# Return the version recorded for this token's content in the enclosing
# %perl_version_introduced table. Content with no (true) entry in that
# table falls back to MINIMUM_PERL.
sub perl_version_introduced {
    my $self = shift;
    my $version = $perl_version_introduced{ $self->content() };
    return $version ? $version : MINIMUM_PERL;
}
93 | } | ||||
94 | |||||
95 | { | ||||
96 | |||||
97 | 2 | 5µs | my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /; | ||
98 | |||||
99 | # Regular expressions to match various parenthesized tokens, and the | ||||
100 | # classes to make them into. | ||||
101 | |||||
102 | 5 | 15µs | 5 | 106µs | my @paren_token = map { # spent 53µs making 1 call to PPIx::Regexp::Token::Recursion::__PPIX_TOKEN__recognize
# spent 14µs making 1 call to PPIx::Regexp::Token::Backtrack::__PPIX_TOKEN__recognize
# spent 13µs making 1 call to PPIx::Regexp::Token::Comment::__PPIX_TOKEN__recognize
# spent 13µs making 1 call to PPIx::Regexp::Token::Backreference::__PPIX_TOKEN__recognize
# spent 13µs making 1 call to PPIx::Regexp::Token::Modifier::__PPIX_TOKEN__recognize |
103 | 1 | 5µs | [ $_ => $_->__PPIX_TOKEN__recognize() ] | ||
104 | } | ||||
105 | 'PPIx::Regexp::Token::Comment', | ||||
106 | 'PPIx::Regexp::Token::Modifier', | ||||
107 | 'PPIx::Regexp::Token::Backreference', | ||||
108 | 'PPIx::Regexp::Token::Backtrack', | ||||
109 | 'PPIx::Regexp::Token::Recursion', | ||||
110 | ; | ||||
111 | |||||
# Tokenizer hook for the bracketing characters listed in %delim:
# ( ) { } [ ]
#
# Arguments are the class the hook is invoked on, the tokenizer object,
# and the character under consideration. Nothing is returned when the
# character is not one of ours. Otherwise the return is either a true
# value saying how much input to accept (1, or whatever find_regexp()
# reported) or the token(s) already built with make_token().
sub __PPIX_TOKENIZER__regexp {
    my ( $class, $tokenizer, $character ) = @_;

    # Anything that is not a delimiter is somebody else's problem.
    return unless $delim{$character};

    # While a character class is open, every delimiter except the close
    # square bracket is just a literal character.
    if ( $tokenizer->cookie( COOKIE_CLASS ) && $character ne ']' ) {
        return $tokenizer->make_token( 1, TOKEN_LITERAL );
    }

    # Open parentheses have various interesting possibilities ...
    if ( $character eq '(' ) {

        # Some parenthesized constructs are most naturally a single
        # token. Try each candidate class's recognizers in order; the
        # first regular expression that matches decides the token.
        foreach my $candidate ( @paren_token ) {
            my ( $token_class, @recognizers ) = @{ $candidate };
            foreach my $recognizer ( @recognizers ) {
                my ( $regexp, $arg ) = @{ $recognizer };
                my $accept = $tokenizer->find_regexp( $regexp );
                $accept or next;
                return $tokenizer->make_token(
                    $accept, $token_class, $arg );
            }
        }

        # Modifier changes are local to this parenthesis group.
        $tokenizer->modifier_duplicate();

        # The regex-set functionality introduced with 5.17.8 is most
        # conveniently handled by treating the initial '(?[' and final
        # '])' as ::Structure tokens. perl5178delta documents that
        # these may not have interior spaces.
        my $regex_set = $tokenizer->find_regexp(
            qr{ \A [(] [?] [[] }smx    # ] ) - help for vim
        );
        if ( $regex_set ) {
            $tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } );
            $tokenizer->modifier_modify( x => 1 );    # Implicitly /x
            return $regex_set;
        }

        # Certain tokens are expected only right after a left paren.
        $tokenizer->expect(
            'PPIx::Regexp::Token::GroupType::Modifier',
            'PPIx::Regexp::Token::GroupType::NamedCapture',
            'PPIx::Regexp::Token::GroupType::Assertion',
            'PPIx::Regexp::Token::GroupType::Code',
            'PPIx::Regexp::Token::GroupType::BranchReset',
            'PPIx::Regexp::Token::GroupType::Subexpression',
            'PPIx::Regexp::Token::GroupType::Switch',
        );

        # Accept the parenthesis itself.
        return 1;
    }

    # A close parenthesis ends the modifier localization begun by its
    # open parenthesis.
    if ( $character eq ')' ) {
        $tokenizer->modifier_pop();
        return 1;
    }

    # An open curly may or may not begin a quantifier, depending on
    # what turns up before the close curly. A cookie is posted to watch
    # the token stream for interlopers; if it survives until the close
    # curly, that curly knows it ends a quantifier.
    if ( $character eq '{' ) {

        # None of this is needed if the prior token can not be
        # quantified.
        return 1 unless $tokenizer->prior( 'can_be_quantified' );

        # Make the token before posting the cookie, so that the cookie
        # does not have to deal with this token.
        my $open_curly = $tokenizer->make_token( 1 );

        # The cookie for the next '}'. It answers true for as long as
        # the tokens seen could still be the inside of a quantifier.
        my $commas_seen = 0;
        my $watch_quantifier = sub {
            my ( $cookie_tokenizer, $token ) = @_;
            return 1 unless $token;

            # Of literals, exactly one comma is accepted provided it is
            # not immediately after the '{', as is anything matching
            # '\d'.
            if ( $token->isa( TOKEN_LITERAL ) ) {
                my $text = $token->content();
                if ( $text eq ',' ) {
                    return if $commas_seen++;
                    return $cookie_tokenizer->prior( 'content' ) ne '{';
                }
                return $text =~ m/ \A \d \z /smx;
            }

            # The content of an interpolation is unknown, so it is
            # trustingly accepted.
            return 1
                if $token->isa( 'PPIx::Regexp::Token::Interpolation' );

            # Anything else rules out a quantifier.
            return;
        };
        $tokenizer->cookie( COOKIE_QUANT, $watch_quantifier );

        return $open_curly;
    }

    # A close curly is a quantifier only if the cookie posted by the
    # open curly is still around.
    if ( $character eq '}' ) {

        # Remove the cookie; a false return means this is an ordinary
        # curly.
        return 1 unless $tokenizer->cookie( COOKIE_QUANT, undef );

        # It is also an ordinary curly when the prior token's class is
        # this class or a subclass of it.
        return 1 if $tokenizer->prior( 'class' )->isa( __PACKAGE__ );

        my $close_curly = $tokenizer->make_token( 1 );
        $close_curly->{is_quantifier} = 1;
        return $close_curly;
    }

    # The parse rules are different inside a character class, so yet
    # another cookie is posted.
    if ( $character eq '[' ) {

        # Since the cookie always returns 1, it does not matter where
        # in what follows it gets set.
        $tokenizer->cookie( COOKIE_CLASS, sub { return 1 } );

        # Make the bracket token now; the beginning-of-character-class
        # strangeness is easiest to deal with right here.
        my @made = $tokenizer->make_token( 1 );

        # Look at the next character, returning what we have if there
        # is none.
        my $next_char = $tokenizer->peek();
        defined $next_char or return @made;

        # A caret here is the negation operator. Make its token and
        # look at the character after it, again returning if none.
        if ( $next_char eq '^' ) {
            push @made, $tokenizer->make_token(
                1, 'PPIx::Regexp::Token::Operator' );
            $next_char = $tokenizer->peek();
            defined $next_char or return @made;
        }

        # A close square bracket at this point does not end the class;
        # it is just a literal.
        if ( $next_char eq ']' ) {
            push @made, $tokenizer->make_token( 1, TOKEN_LITERAL );
        }

        # Return all tokens made.
        return @made;
    }
    # per perlop, the metas inside a [] are -]\^$.
    # per perlop, the metas outside a [] are {}[]()^$.|*+?\
    # The difference is that {}[().|*+? are not metas in [], but - is.

    # The close bracket is complicated by regex sets, and further by
    # the fact that an old-style character class may appear inside a
    # regex set. Nested regex sets are not (yet!) permitted.
    if ( $character eq ']' ) {

        # '])' with COOKIE_REGEX_SET present ends a regex set: delete
        # the cookie and accept both characters.
        # help vim - ( [
        my $accept = $tokenizer->find_regexp( qr{ \A []] [)] }smx );
        if ( $accept && $tokenizer->cookie( COOKIE_REGEX_SET ) ) {
            $tokenizer->cookie( COOKIE_REGEX_SET, undef );
            return $accept;
        }

        # Otherwise assume a bracketed character class is ending:
        # delete its cookie and accept the close bracket.
        $tokenizer->cookie( COOKIE_CLASS, undef );
        return 1;
    }

    return 1;
}
305 | |||||
306 | } | ||||
307 | |||||
# Lexer hook, run once the lexer has finished with all the tokens.
# Invoked as a method with no arguments. It discards the {is_quantifier}
# flag from the token and returns the number of parse failures found
# while finalizing, which for this class is always 0.
sub __PPIX_LEXER__finalize {
    my $self = shift;
    delete $self->{is_quantifier};
    return 0;
}
316 | |||||
317 | 1 | 10µs | 1; | ||
318 | |||||
319 | __END__ |