Filename | /Users/timbo/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/PPIx/Regexp/Token/Literal.pm |
Statements | Executed 22 statements in 1.62ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 14µs | 29µs | BEGIN@32 | PPIx::Regexp::Token::Literal::
1 | 1 | 1 | 9µs | 43µs | BEGIN@37 | PPIx::Regexp::Token::Literal::
1 | 1 | 1 | 8µs | 14µs | BEGIN@33 | PPIx::Regexp::Token::Literal::
1 | 1 | 1 | 8µs | 72µs | BEGIN@35 | PPIx::Regexp::Token::Literal::
2 | 2 | 2 | 4µs | 4µs | CORE:qr (opcode) | PPIx::Regexp::Token::Literal::
1 | 1 | 1 | 900ns | 900ns | CORE:regcomp (opcode) | PPIx::Regexp::Token::Literal::
0 | 0 | 0 | 0s | 0s | __PPIX_TOKENIZER__regexp | PPIx::Regexp::Token::Literal::
0 | 0 | 0 | 0s | 0s | _escaped | PPIx::Regexp::Token::Literal::
0 | 0 | 0 | 0s | 0s | _have_charnames_vianame | PPIx::Regexp::Token::Literal::
0 | 0 | 0 | 0s | 0s | _ordinal | PPIx::Regexp::Token::Literal::
0 | 0 | 0 | 0s | 0s | ordinal | PPIx::Regexp::Token::Literal::
0 | 0 | 0 | 0s | 0s | perl_version_introduced | PPIx::Regexp::Token::Literal::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | =head1 NAME | ||||
2 | |||||
3 | PPIx::Regexp::Token::Literal - Represent a literal character | ||||
4 | |||||
5 | =head1 SYNOPSIS | ||||
6 | |||||
7 | use PPIx::Regexp::Dumper; | ||||
8 | PPIx::Regexp::Dumper->new( 'qr{foo}smx' ) | ||||
9 | ->print(); | ||||
10 | |||||
11 | =head1 INHERITANCE | ||||
12 | |||||
13 | C<PPIx::Regexp::Token::Literal> is a | ||||
14 | L<PPIx::Regexp::Token|PPIx::Regexp::Token>. | ||||
15 | |||||
16 | C<PPIx::Regexp::Token::Literal> has no descendants. | ||||
17 | |||||
18 | =head1 DESCRIPTION | ||||
19 | |||||
20 | This class represents a literal character, no matter how specified. | ||||
21 | |||||
22 | =head1 METHODS | ||||
23 | |||||
24 | This class provides the following public methods. Methods not documented | ||||
25 | here are private, and unsupported in the sense that the author reserves | ||||
26 | the right to change or remove them without notice. | ||||
27 | |||||
28 | =cut | ||||
29 | |||||
30 | package PPIx::Regexp::Token::Literal; | ||||
31 | |||||
32 | 2 | 25µs | 2 | 43µs | # spent 29µs (14+14) within PPIx::Regexp::Token::Literal::BEGIN@32 which was called:
# once (14µs+14µs) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 32 # spent 29µs making 1 call to PPIx::Regexp::Token::Literal::BEGIN@32
# spent 14µs making 1 call to strict::import |
33 | 2 | 23µs | 2 | 20µs | # spent 14µs (8+6) within PPIx::Regexp::Token::Literal::BEGIN@33 which was called:
# once (8µs+6µs) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 33 # spent 14µs making 1 call to PPIx::Regexp::Token::Literal::BEGIN@33
# spent 6µs making 1 call to warnings::import |
34 | |||||
35 | 2 | 49µs | 2 | 136µs | # spent 72µs (8+64) within PPIx::Regexp::Token::Literal::BEGIN@35 which was called:
# once (8µs+64µs) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 35 # spent 72µs making 1 call to PPIx::Regexp::Token::Literal::BEGIN@35
# spent 64µs making 1 call to base::import |
36 | |||||
37 | 1 | 300ns | # spent 43µs (9+34) within PPIx::Regexp::Token::Literal::BEGIN@37 which was called:
# once (9µs+34µs) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 39 | ||
38 | COOKIE_CLASS COOKIE_REGEX_SET MINIMUM_PERL TOKEN_UNKNOWN | ||||
39 | 1 | 1.39ms | 2 | 78µs | }; # spent 43µs making 1 call to PPIx::Regexp::Token::Literal::BEGIN@37
# spent 34µs making 1 call to Exporter::import |
40 | |||||
41 | 1 | 700ns | our $VERSION = '0.036'; | ||
42 | |||||
43 | # Return true if the token can be quantified, and false otherwise | ||||
44 | # sub can_be_quantified { return }; | ||||
45 | |||||
46 | sub perl_version_introduced { | ||||
47 | my ( $self ) = @_; | ||||
48 | exists $self->{perl_version_introduced} | ||||
49 | and return $self->{perl_version_introduced}; | ||||
50 | ( my $content = $self->content() ) =~ m/ \A \\ o /smx | ||||
51 | and return ( $self->{perl_version_introduced} = '5.013003' ); | ||||
52 | $content =~ m/ \A \\ N [{] U [+] /smx | ||||
53 | and return ( $self->{perl_version_introduced} = '5.008' ); | ||||
54 | $content =~ m/ \A \\ x [{] /smx # } | ||||
55 | and return ( $self->{perl_version_introduced} = '5.006' ); | ||||
56 | $content =~ m/ \A \\ N /smx | ||||
57 | and return ( $self->{perl_version_introduced} = '5.006001' ); | ||||
58 | return ( $self->{perl_version_introduced} = MINIMUM_PERL ); | ||||
59 | } | ||||
60 | |||||
61 | # Some characters may or may not be literals depending on whether we are | ||||
62 | # inside a character class. The following hash identifies those | ||||
63 | # characters and says what we should return when outside (index 0) or | ||||
64 | # inside (index 1) a character class, as judged by the presence of the | ||||
65 | # relevant cookie. | ||||
66 | 1 | 5µs | my %double_agent = ( | ||
67 | '.' => [ undef, 1 ], | ||||
68 | '*' => [ undef, 1 ], | ||||
69 | '?' => [ undef, 1 ], | ||||
70 | '+' => [ undef, 1 ], | ||||
71 | '-' => [ 1, undef ], | ||||
72 | '|' => [ undef, 1 ], | ||||
73 | ); | ||||
74 | |||||
75 | # These are the characters that other external tokenizers need to see, | ||||
76 | # or at least that we need to take a closer look at. All others can be | ||||
77 | # unconditionally made into single-character literals. | ||||
78 | 1 | 31µs | 2 | 3µs | my %extra_ordinary = map { $_ => 1 } # spent 2µs making 1 call to PPIx::Regexp::Token::Literal::CORE:qr
# spent 900ns making 1 call to PPIx::Regexp::Token::Literal::CORE:regcomp |
79 | split qr{}smx, '$@*+?.\\(){}[]^|-#'; | ||||
80 | # $ -> Token::Interpolation, Token::Assertion | ||||
81 | # @ -> Token::Interpolation | ||||
82 | # * -> Token::Quantifier | ||||
83 | # + ? -> Token::Quantifier, Token::Greediness | ||||
84 | # . -> Token::CharClass::Simple | ||||
85 | # \ -> Token::Control, Token::CharClass::Simple, Token::Assertion, | ||||
86 | # Token::Backreference | ||||
87 | # ( ) { } [ ] -> Token::Structure | ||||
88 | # ^ -> Token::Assertion | ||||
89 | # | - -> Token::Operator | ||||
90 | |||||
91 | 1 | 4µs | my %regex_set_operator = map { $_ => 1 } qw{ & + | - ^ ! }; | ||
92 | |||||
93 | # The regex for the extended white space available under regex sets in | ||||
94 | # Perl 5.17.8 and in general in perl 5.17.9. I have been unable to get | ||||
95 | # this to work under Perl 5.6.2, so for that we fall back to ASCII white | ||||
96 | # space. The stringy eval is because I have been unable to get | ||||
97 | # satisfaction out of either interpolated characters (in general) or | ||||
98 | # eval-ed "\N{U+...}" (under 5.6.2) or \x{...} (ditto). | ||||
99 | # | ||||
100 | # See PPIx::Regexp::Structure::RegexSet for the documentation of this | ||||
101 | # mess. | ||||
102 | # my $white_space_re = $] >= 5.008 ? | ||||
103 | # 'qr< \\A [\\s\\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' : | ||||
104 | # 'qr< \\A \\s+ >smx'; | ||||
105 | # | ||||
106 | # RT #91798 | ||||
107 | # The above turns out to be wrong, because \s matches too many | ||||
108 | # characters. We need the following to get the right match. Note that | ||||
109 | # \cK was added experimentally in 5.17.0 and made it into 5.18. The \N{} | ||||
110 | # characters were NOT added (as I originally thought) but were simply | ||||
111 | # made characters that generated warnings when escaped, in preparation | ||||
112 | # for adding them. When they actually get added, I will have to add back | ||||
113 | # the trinary operator. Sigh. | ||||
114 | 1 | 400ns | my $white_space_re = 'qr< \A [\t\n\cK\f\r ] >smx'; | ||
115 | 1 | 24µs | $white_space_re = eval $white_space_re; ## no critic (ProhibitStringyEval) # spent 8µs executing statements in string eval | ||
116 | |||||
117 | 1 | 4µs | my %regex_pass_on = map { $_ => 1 } qw{ [ ] ( ) $ \ }; | ||
118 | |||||
119 | sub __PPIX_TOKENIZER__regexp { | ||||
120 | my ( $class, $tokenizer, $character, $char_type ) = @_; | ||||
121 | |||||
122 | if ( $tokenizer->cookie( COOKIE_REGEX_SET ) ) { | ||||
123 | # If we're inside a regex set no literals are allowed, but not | ||||
124 | # all characters that get here are seen as literals. | ||||
125 | |||||
126 | $regex_set_operator{$character} | ||||
127 | and return $tokenizer->make_token( | ||||
128 | length $character, 'PPIx::Regexp::Token::Operator' ); | ||||
129 | |||||
130 | my $accept; | ||||
131 | |||||
132 | $accept = $tokenizer->find_regexp( $white_space_re ) | ||||
133 | and return $tokenizer->make_token( | ||||
134 | $accept, 'PPIx::Regexp::Token::Whitespace' ); | ||||
135 | |||||
136 | $accept = _escaped( $tokenizer, $character ) | ||||
137 | and return $accept; | ||||
138 | |||||
139 | $regex_pass_on{$character} | ||||
140 | and return; | ||||
141 | |||||
142 | # At this point we have a single character which is poised to be | ||||
143 | # interpreted as a literal. These are not legal in a regex set | ||||
144 | # except when also in a bracketed class. | ||||
145 | return $tokenizer->cookie( COOKIE_CLASS ) ? | ||||
146 | length $character : | ||||
147 | $tokenizer->make_token( | ||||
148 | length $character, TOKEN_UNKNOWN, { | ||||
149 | error => 'Literal not valid in Regex set', | ||||
150 | }, | ||||
151 | ); | ||||
152 | |||||
153 | } else { | ||||
154 | |||||
155 | # Otherwise handle the characters that may or may not be | ||||
156 | # literals depending on whether or not we are in a character | ||||
157 | # class. | ||||
158 | if ( my $class = $double_agent{$character} ) { | ||||
159 | my $inx = $tokenizer->cookie( COOKIE_CLASS ) ? 1 : 0; | ||||
160 | return $class->[$inx]; | ||||
161 | } | ||||
162 | } | ||||
163 | |||||
164 | # If /x is in effect _and_ we are not inside a character class, \s | ||||
165 | # is whitespace, and '#' introduces a comment. Otherwise they are | ||||
166 | # both literals. | ||||
167 | if ( $tokenizer->modifier( 'x' ) && | ||||
168 | ! $tokenizer->cookie( COOKIE_CLASS ) ) { | ||||
169 | my $accept; | ||||
170 | $accept = $tokenizer->find_regexp( $white_space_re ) | ||||
171 | and return $tokenizer->make_token( | ||||
172 | $accept, 'PPIx::Regexp::Token::Whitespace' ); | ||||
173 | $accept = $tokenizer->find_regexp( | ||||
174 | qr{ \A \# [^\n]* (?: \n | \z) }smx ) | ||||
175 | and return $tokenizer->make_token( | ||||
176 | $accept, 'PPIx::Regexp::Token::Comment' ); | ||||
177 | } else { | ||||
178 | ( $character eq '#' || $character =~ m/ \A \s \z /smx ) | ||||
179 | and return 1; | ||||
180 | } | ||||
181 | |||||
182 | my $accept; | ||||
183 | $accept = _escaped( $tokenizer, $character ) | ||||
184 | and return $accept; | ||||
185 | |||||
186 | # All other characters which are not extra ordinary get accepted. | ||||
187 | $extra_ordinary{$character} or return 1; | ||||
188 | |||||
189 | return; | ||||
190 | } | ||||
191 | |||||
192 | |||||
193 | =begin comment | ||||
194 | |||||
195 | The following is from perlop: | ||||
196 | |||||
197 | The character following "\c" is mapped to some other character by | ||||
198 | converting letters to upper case and then (on ASCII systems) by | ||||
199 | inverting the 7th bit (0x40). The most interesting range is from '@' to | ||||
200 | '_' (0x40 through 0x5F), resulting in a control character from 0x00 | ||||
201 | through 0x1F. A '?' maps to the DEL character. On EBCDIC systems only | ||||
202 | '@', the letters, '[', '\', ']', '^', '_' and '?' will work, resulting | ||||
203 | in 0x00 through 0x1F and 0x7F. | ||||
204 | |||||
205 | =end comment | ||||
206 | |||||
207 | =cut | ||||
208 | |||||
209 | # Recognize all the escaped constructions that generate literal | ||||
210 | # characters in one gigantic regexp. Technically \1.. through \7.. are | ||||
211 | # octal literals too, but we can not disambiguate these from back | ||||
212 | # references until we know how many there are. So the lexer gets another | ||||
213 | # dirty job. | ||||
214 | sub _escaped { | ||||
215 | my ( $tokenizer, $character ) = @_; | ||||
216 | |||||
217 | $character eq '\\' | ||||
218 | or return; | ||||
219 | |||||
220 | if ( my $accept = $tokenizer->find_regexp( | ||||
221 | qr< \A \\ (?: | ||||
222 | [^\w\s] | # delimiters/metas | ||||
223 | [tnrfae] | # C-style escapes | ||||
224 | 0 [01234567]{0,2} | # octal | ||||
225 | # [01234567]{1,3} | # made from backref by lexer | ||||
226 | c [][[:alpha:]\@\\^_?] | # control characters | ||||
227 | x (?: \{ [[:xdigit:]]* \} | [[:xdigit:]]{0,2} ) | # hex | ||||
228 | o [{] [01234567]+ [}] | # octal as of 5.13.3 | ||||
229 | ## N (?: \{ (?: [[:alpha:]] [\w\s:()-]* | # must begin w/ alpha | ||||
230 | ## U [+] [[:xdigit:]]+ ) \} ) | # unicode | ||||
231 | N (?: [{] (?= \D ) [^\}]+ [}] ) # unicode | ||||
232 | ) >smx ) ) { | ||||
233 | return $accept; | ||||
234 | } | ||||
235 | return; | ||||
236 | } | ||||
237 | |||||
238 | =head2 ordinal | ||||
239 | |||||
240 | print 'The ordinal of ', $token->content(), | ||||
241 | ' is ', $token->ordinal(), "\n"; | ||||
242 | |||||
243 | This method returns the ordinal of the literal if it can figure it out. | ||||
244 | It is analogous to the C<ord> built-in. | ||||
245 | |||||
246 | It will not attempt to determine the ordinal of a unicode name | ||||
247 | (C<\N{...}>) unless L<charnames|charnames> has been loaded, and supports | ||||
248 | the L<vianame()|charnames/vianame> function. Instead, it will return | ||||
249 | C<undef>. Users of Perl 5.6.2 and older may be out of luck here. | ||||
250 | |||||
251 | Unicode code points (e.g. C<\N{U+abcd}>) should work independently of | ||||
252 | L<charnames|charnames>, and just return the value of C<abcd>. | ||||
253 | |||||
254 | It will never attempt to return the ordinal of an octet (C<\C{...}>) | ||||
255 | because I don't understand the syntax. | ||||
256 | |||||
257 | =cut | ||||
258 | |||||
259 | { | ||||
260 | |||||
261 | 2 | 32µs | my %escapes = ( | ||
262 | '\\t' => ord "\t", | ||||
263 | '\\n' => ord "\n", | ||||
264 | '\\r' => ord "\r", | ||||
265 | '\\f' => ord "\f", | ||||
266 | '\\a' => ord "\a", | ||||
267 | '\\b' => ord "\b", | ||||
268 | '\\e' => ord "\e", | ||||
269 | '\\c?' => ord "\c?", | ||||
270 | '\\c@' => ord "\c@", | ||||
271 | '\\cA' => ord "\cA", | ||||
272 | '\\ca' => ord "\cA", | ||||
273 | '\\cB' => ord "\cB", | ||||
274 | '\\cb' => ord "\cB", | ||||
275 | '\\cC' => ord "\cC", | ||||
276 | '\\cc' => ord "\cC", | ||||
277 | '\\cD' => ord "\cD", | ||||
278 | '\\cd' => ord "\cD", | ||||
279 | '\\cE' => ord "\cE", | ||||
280 | '\\ce' => ord "\cE", | ||||
281 | '\\cF' => ord "\cF", | ||||
282 | '\\cf' => ord "\cF", | ||||
283 | '\\cG' => ord "\cG", | ||||
284 | '\\cg' => ord "\cG", | ||||
285 | '\\cH' => ord "\cH", | ||||
286 | '\\ch' => ord "\cH", | ||||
287 | '\\cI' => ord "\cI", | ||||
288 | '\\ci' => ord "\cI", | ||||
289 | '\\cJ' => ord "\cJ", | ||||
290 | '\\cj' => ord "\cJ", | ||||
291 | '\\cK' => ord "\cK", | ||||
292 | '\\ck' => ord "\cK", | ||||
293 | '\\cL' => ord "\cL", | ||||
294 | '\\cl' => ord "\cL", | ||||
295 | '\\cM' => ord "\cM", | ||||
296 | '\\cm' => ord "\cM", | ||||
297 | '\\cN' => ord "\cN", | ||||
298 | '\\cn' => ord "\cN", | ||||
299 | '\\cO' => ord "\cO", | ||||
300 | '\\co' => ord "\cO", | ||||
301 | '\\cP' => ord "\cP", | ||||
302 | '\\cp' => ord "\cP", | ||||
303 | '\\cQ' => ord "\cQ", | ||||
304 | '\\cq' => ord "\cQ", | ||||
305 | '\\cR' => ord "\cR", | ||||
306 | '\\cr' => ord "\cR", | ||||
307 | '\\cS' => ord "\cS", | ||||
308 | '\\cs' => ord "\cS", | ||||
309 | '\\cT' => ord "\cT", | ||||
310 | '\\ct' => ord "\cT", | ||||
311 | '\\cU' => ord "\cU", | ||||
312 | '\\cu' => ord "\cU", | ||||
313 | '\\cV' => ord "\cV", | ||||
314 | '\\cv' => ord "\cV", | ||||
315 | '\\cW' => ord "\cW", | ||||
316 | '\\cw' => ord "\cW", | ||||
317 | '\\cX' => ord "\cX", | ||||
318 | '\\cx' => ord "\cX", | ||||
319 | '\\cY' => ord "\cY", | ||||
320 | '\\cy' => ord "\cY", | ||||
321 | '\\cZ' => ord "\cZ", | ||||
322 | '\\cz' => ord "\cZ", | ||||
323 | '\\c[' => ord "\c[", | ||||
324 | '\\c\\\\' => ord "\c\\", # " # Get Vim's head straight. | ||||
325 | '\\c]' => ord "\c]", | ||||
326 | '\\c^' => ord "\c^", | ||||
327 | '\\c_' => ord "\c_", | ||||
328 | ); | ||||
329 | |||||
330 | sub ordinal { | ||||
331 | my ( $self ) = @_; | ||||
332 | exists $self->{ordinal} and return $self->{ordinal}; | ||||
333 | return ( $self->{ordinal} = $self->_ordinal() ); | ||||
334 | } | ||||
335 | |||||
336 | 1 | 8µs | my %octal = map {; "$_" => 1 } ( 0 .. 7 ); | ||
337 | |||||
338 | sub _ordinal { | ||||
339 | my ( $self ) = @_; | ||||
340 | my $content = $self->content(); | ||||
341 | |||||
342 | $content =~ m/ \A \\ /smx or return ord $content; | ||||
343 | |||||
344 | exists $escapes{$content} and return $escapes{$content}; | ||||
345 | |||||
346 | my $indicator = substr $content, 1, 1; | ||||
347 | |||||
348 | $octal{$indicator} and return oct substr $content, 1; | ||||
349 | |||||
350 | if ( $indicator eq 'x' ) { | ||||
351 | $content =~ m/ \A \\ x \{ ( [[:xdigit:]]+ ) \} \z /smx | ||||
352 | and return hex $1; | ||||
353 | $content =~ m/ \A \\ x ( [[:xdigit:]]{0,2} ) \z /smx | ||||
354 | and return hex $1; | ||||
355 | return; | ||||
356 | } | ||||
357 | |||||
358 | if ( $indicator eq 'o' ) { | ||||
359 | $content =~ m/ \A \\ o [{] ( [01234567]+ ) [}] \z /smx | ||||
360 | and return oct $1; | ||||
361 | return; # Shouldn't happen, but ... | ||||
362 | } | ||||
363 | |||||
364 | if ( $indicator eq 'N' ) { | ||||
365 | $content =~ m/ \A \\ N \{ U [+] ( [[:xdigit:]]+ ) \} \z /smx | ||||
366 | and return hex $1; | ||||
367 | $content =~ m/ \A \\ N [{] ( .+ ) [}] \z /smx | ||||
368 | and return ( | ||||
369 | _have_charnames_vianame() ? | ||||
370 | charnames::vianame( $1 ) : | ||||
371 | undef | ||||
372 | ); | ||||
373 | return; # Shouldn't happen, but ... | ||||
374 | } | ||||
375 | |||||
376 | return ord $indicator; | ||||
377 | } | ||||
378 | |||||
379 | } | ||||
380 | |||||
381 | { | ||||
382 | 2 | 400ns | my $have_charnames_vianame; | ||
383 | |||||
384 | sub _have_charnames_vianame { | ||||
385 | defined $have_charnames_vianame | ||||
386 | and return $have_charnames_vianame; | ||||
387 | return ( | ||||
388 | $have_charnames_vianame = | ||||
389 | charnames->can( 'vianame' ) ? 1 : 0 | ||||
390 | ); | ||||
391 | |||||
392 | } | ||||
393 | } | ||||
394 | |||||
395 | |||||
396 | 1 | 1µs | *__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp; | ||
397 | |||||
398 | 1 | 22µs | 1; | ||
399 | |||||
400 | __END__ | ||||
# spent 4µs within PPIx::Regexp::Token::Literal::CORE:qr which was called 2 times, avg 2µs/call:
# once (2µs+0s) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 78
# once (2µs+0s) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 1 of (eval 442)[PPIx/Regexp/Token/Literal.pm:115] | |||||
# spent 900ns within PPIx::Regexp::Token::Literal::CORE:regcomp which was called:
# once (900ns+0s) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 78 |