Profile of PPIx/Regexp/Token/Literal.pm

Filename	/Users/timbo/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/PPIx/Regexp/Token/Literal.pm
Statements	Executed 22 statements in 1.62ms

Subroutines
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
1	1	1	14µs	29µs	PPIx::Regexp::Token::Literal::BEGIN@32
1	1	1	9µs	43µs	PPIx::Regexp::Token::Literal::BEGIN@37
1	1	1	8µs	14µs	PPIx::Regexp::Token::Literal::BEGIN@33
1	1	1	8µs	72µs	PPIx::Regexp::Token::Literal::BEGIN@35
2	2	2	4µs	4µs	PPIx::Regexp::Token::Literal::::CORE:qrPPIx::Regexp::Token::Literal::CORE:qr (opcode)
1	1	1	900ns	900ns	PPIx::Regexp::Token::Literal::CORE:regcomp (opcode)
0	0	0	0s	0s	PPIx::Regexp::Token::Literal::__PPIX_TOKENIZER__regexp
0	0	0	0s	0s	PPIx::Regexp::Token::Literal::_escaped
0	0	0	0s	0s	PPIx::Regexp::Token::Literal::_have_charnames_vianame
0	0	0	0s	0s	PPIx::Regexp::Token::Literal::_ordinal
0	0	0	0s	0s	PPIx::Regexp::Token::Literal::::ordinalPPIx::Regexp::Token::Literal::ordinal
0	0	0	0s	0s	PPIx::Regexp::Token::Literal::perl_version_introduced

Call graph for these subroutines as a Graphviz dot language file.

Line	State ments	Time on line	Calls	Time in subs	Code
1					=head1 NAME
2
3					PPIx::Regexp::Token::Literal - Represent a literal character
4
5					=head1 SYNOPSIS
6
7					use PPIx::Regexp::Dumper;
8					PPIx::Regexp::Dumper->new( 'qr{foo}smx' )
9					->print();
10
11					=head1 INHERITANCE
12
13					C<PPIx::Regexp::Token::Literal> is a
14					L<PPIx::Regexp::Token\|PPIx::Regexp::Token>.
15
16					C<PPIx::Regexp::Token::Literal> has no descendants.
17
18					=head1 DESCRIPTION
19
20					This class represents a literal character, no matter how specified.
21
22					=head1 METHODS
23
24					This class provides the following public methods. Methods not documented
25					here are private, and unsupported in the sense that the author reserves
26					the right to change or remove them without notice.
27
28					=cut
29
30					package PPIx::Regexp::Token::Literal;
31
32	2	25µs	2	43µs	# spent 29µs (14+14) within PPIx::Regexp::Token::Literal::BEGIN@32 which was called: # once (14µs+14µs) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 32 use strict; # spent 29µs making 1 call to PPIx::Regexp::Token::Literal::BEGIN@32 # spent 14µs making 1 call to strict::import
33	2	23µs	2	20µs	# spent 14µs (8+6) within PPIx::Regexp::Token::Literal::BEGIN@33 which was called: # once (8µs+6µs) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 33 use warnings; # spent 14µs making 1 call to PPIx::Regexp::Token::Literal::BEGIN@33 # spent 6µs making 1 call to warnings::import
34
35	2	49µs	2	136µs	# spent 72µs (8+64) within PPIx::Regexp::Token::Literal::BEGIN@35 which was called: # once (8µs+64µs) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 35 use base qw{ PPIx::Regexp::Token }; # spent 72µs making 1 call to PPIx::Regexp::Token::Literal::BEGIN@35 # spent 64µs making 1 call to base::import
36
37	1	300ns			# spent 43µs (9+34) within PPIx::Regexp::Token::Literal::BEGIN@37 which was called: # once (9µs+34µs) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 39 use PPIx::Regexp::Constant qw{
38					COOKIE_CLASS COOKIE_REGEX_SET MINIMUM_PERL TOKEN_UNKNOWN
39	1	1.39ms	2	78µs	}; # spent 43µs making 1 call to PPIx::Regexp::Token::Literal::BEGIN@37 # spent 34µs making 1 call to Exporter::import
40
41	1	700ns			our $VERSION = '0.036';
42
43					# Return true if the token can be quantified, and false otherwise
44					# sub can_be_quantified { return };
45
46					sub perl_version_introduced {
47					my ( $self ) = @_;
48					exists $self->{perl_version_introduced}
49					and return $self->{perl_version_introduced};
50					( my $content = $self->content() ) =~ m/ \A \\ o /smx
51					and return ( $self->{perl_version_introduced} = '5.013003' );
52					$content =~ m/ \A \\ N [{] U [+] /smx
53					and return ( $self->{perl_version_introduced} = '5.008' );
54					$content =~ m/ \A \\ x [{] /smx # }
55					and return ( $self->{perl_version_introduced} = '5.006' );
56					$content =~ m/ \A \\ N /smx
57					and return ( $self->{perl_version_introduced} = '5.006001' );
58					return ( $self->{perl_version_introduced} = MINIMUM_PERL );
59					}
60
61					# Some characters may or may not be literals depending on whether we are
62					# inside a character class. The following hash identifies those
63					# characters and says what we should return when outside (index 0) or
64					# inside (index 1) a character class, as judged by the presence of the
65					# relevant cookie.
66	1	5µs			my %double_agent = (
67					'.' => [ undef, 1 ],
68					'*' => [ undef, 1 ],
69					'?' => [ undef, 1 ],
70					'+' => [ undef, 1 ],
71					'-' => [ 1, undef ],
72					'\|' => [ undef, 1 ],
73					);
74
75					# These are the characters that other external tokenizers need to see,
76					# or at least that we need to take a closer look at. All others can be
77					# unconditionally made into single-character literals.
78	1	31µs	2	3µs	my %extra_ordinary = map { $_ => 1 } # spent 2µs making 1 call to PPIx::Regexp::Token::Literal::CORE:qr # spent 900ns making 1 call to PPIx::Regexp::Token::Literal::CORE:regcomp
79					split qr{}smx, '$@*+?.\\(){}[]^\|-#';
80					# $ -> Token::Interpolation, Token::Assertion
81					# @ -> Token::Interpolation
82					# * -> Token::Quantifier
83					# + ? -> Token::Quantifier, Token::Greediness
84					# . -> Token::CharClass::Simple
85					# \ -> Token::Control, Token::CharClass::Simple, Token::Assertion,
86					# Token::Backreference
87					# ( ) { } [ ] -> Token::Structure
88					# ^ -> Token::Assertion
89					# \| - -> Token::Operator
90
91	1	4µs			my %regex_set_operator = map { $_ => 1 } qw{ & + \| - ^ ! };
92
93					# The regex for the extended white space available under regex sets in
94					# Perl 5.17.8 and in general in perl 5.17.9. I have been unable to get
95					# this to work under Perl 5.6.2, so for that we fall back to ASCII white
96					# space. The stringy eval is because I have been unable to get
97					# satisfaction out of either interpolated characters (in general) or
98					# eval-ed "\N{U+...}" (under 5.6.2) or \x{...} (ditto).
99					#
100					# See PPIx::Regexp::Structure::RegexSet for the documentation of this
101					# mess.
102					# my $white_space_re = $] >= 5.008 ?
103					# 'qr< \\A [\\s\\N{U+0085}\\N{U+200E}\\N{U+200F}\\N{U+2028}\\N{U+2029}]+ >smx' :
104					# 'qr< \\A \\s+ >smx';
105					#
106					# RT #91798
107					# The above turns out to be wrong, because \s matches too many
108					# characters. We need the following to get the right match. Note that
109					# \cK was added experimentally in 5.17.0 and made it into 5.18. The \N{}
110					# characters were NOT added (as I originally thought) but were simply
111					# made characters that generated warnings when escaped, in preparation
112					# for adding them. When they actually get added, I will have to add back
113					# the trinary operator. Sigh.
114	1	400ns			my $white_space_re = 'qr< \A [\t\n\cK\f\r ] >smx';
115	1	24µs			$white_space_re = eval $white_space_re; ## no critic (ProhibitStringyEval) # spent 8µs executing statements in string eval
116
117	1	4µs			my %regex_pass_on = map { $_ => 1 } qw{ [ ] ( ) $ \ };
118
119					sub __PPIX_TOKENIZER__regexp {
120					my ( $class, $tokenizer, $character, $char_type ) = @_;
121
122					if ( $tokenizer->cookie( COOKIE_REGEX_SET ) ) {
123					# If we're inside a regex set no literals are allowed, but not
124					# all characters that get here are seen as literals.
125
126					$regex_set_operator{$character}
127					and return $tokenizer->make_token(
128					length $character, 'PPIx::Regexp::Token::Operator' );
129
130					my $accept;
131
132					$accept = $tokenizer->find_regexp( $white_space_re )
133					and return $tokenizer->make_token(
134					$accept, 'PPIx::Regexp::Token::Whitespace' );
135
136					$accept = _escaped( $tokenizer, $character )
137					and return $accept;
138
139					$regex_pass_on{$character}
140					and return;
141
142					# At this point we have a single character which is poised to be
143					# interpreted as a literal. These are not legal in a regex set
144					# except when also in a bracketed class.
145					return $tokenizer->cookie( COOKIE_CLASS ) ?
146					length $character :
147					$tokenizer->make_token(
148					length $character, TOKEN_UNKNOWN, {
149					error => 'Literal not valid in Regex set',
150					},
151					);
152
153					} else {
154
155					# Otherwise handle the characters that may or may not be
156					# literals depending on whether or not we are in a character
157					# class.
158					if ( my $class = $double_agent{$character} ) {
159					my $inx = $tokenizer->cookie( COOKIE_CLASS ) ? 1 : 0;
160					return $class->[$inx];
161					}
162					}
163
164					# If /x is in effect _and_ we are not inside a character class, \s
165					# is whitespace, and '#' introduces a comment. Otherwise they are
166					# both literals.
167					if ( $tokenizer->modifier( 'x' ) &&
168					! $tokenizer->cookie( COOKIE_CLASS ) ) {
169					my $accept;
170					$accept = $tokenizer->find_regexp( $white_space_re )
171					and return $tokenizer->make_token(
172					$accept, 'PPIx::Regexp::Token::Whitespace' );
173					$accept = $tokenizer->find_regexp(
174					qr{ \A \# [^\n]* (?: \n \| \z) }smx )
175					and return $tokenizer->make_token(
176					$accept, 'PPIx::Regexp::Token::Comment' );
177					} else {
178					( $character eq '#' \|\| $character =~ m/ \A \s \z /smx )
179					and return 1;
180					}
181
182					my $accept;
183					$accept = _escaped( $tokenizer, $character )
184					and return $accept;
185
186					# All other characters which are not extra ordinary get accepted.
187					$extra_ordinary{$character} or return 1;
188
189					return;
190					}
191
192
193					=begin comment
194
195					The following is from perlop:
196
197					The character following "\c" is mapped to some other character by
198					converting letters to upper case and then (on ASCII systems) by
199					inverting the 7th bit (0x40). The most interesting range is from '@' to
200					'_' (0x40 through 0x5F), resulting in a control character from 0x00
201					through 0x1F. A '?' maps to the DEL character. On EBCDIC systems only
202					'@', the letters, '[', '\', ']', '^', '_' and '?' will work, resulting
203					in 0x00 through 0x1F and 0x7F.
204
205					=end comment
206
207					=cut
208
209					# Recognize all the escaped constructions that generate literal
210					# characters in one gigantic regexp. Technically \1.. through \7.. are
211					# octal literals too, but we can not disambiguate these from back
212					# references until we know how many there are. So the lexer gets another
213					# dirty job.
214					sub _escaped {
215					my ( $tokenizer, $character ) = @_;
216
217					$character eq '\\'
218					or return;
219
220					if ( my $accept = $tokenizer->find_regexp(
221					qr< \A \\ (?:
222					[^\w\s] \| # delimiters/metas
223					[tnrfae] \| # C-style escapes
224					0 [01234567]{0,2} \| # octal
225					# [01234567]{1,3} \| # made from backref by lexer
226					c [][[:alpha:]\@\\^_?] \| # control characters
227					x (?: \{ [[:xdigit:]]* \} \| [[:xdigit:]]{0,2} ) \| # hex
228					o [{] [01234567]+ [}] \| # octal as of 5.13.3
229					## N (?: \{ (?: [[:alpha:]] [\w\s:()-]* \| # must begin w/ alpha
230					## U [+] [[:xdigit:]]+ ) \} ) \| # unicode
231					N (?: [{] (?= \D ) [^\}]+ [}] ) # unicode
232					) >smx ) ) {
233					return $accept;
234					}
235					return;
236					}
237
238					=head2 ordinal
239
240					print 'The ordinal of ', $token->content(),
241					' is ', $token->ordinal(), "\n";
242
243					This method returns the ordinal of the literal if it can figure it out.
244					It is analogous to the C<ord> built-in.
245
246					It will not attempt to determine the ordinal of a unicode name
247					(C<\N{...}>) unless L<charnames\|charnames> has been loaded, and supports
248					the L<vianame()\|charnames/vianame> function. Instead, it will return
249					C<undef>. Users of Perl 5.6.2 and older may be out of luck here.
250
251					Unicode code points (e.g. C<\N{U+abcd}>) should work independently of
252					L<charnames\|charnames>, and just return the value of C<abcd>.
253
254					It will never attempt to return the ordinal of an octet (C<\C{...}>)
255					because I don't understand the syntax.
256
257					=cut
258
259					{
260
261	2	32µs			my %escapes = (
262					'\\t' => ord "\t",
263					'\\n' => ord "\n",
264					'\\r' => ord "\r",
265					'\\f' => ord "\f",
266					'\\a' => ord "\a",
267					'\\b' => ord "\b",
268					'\\e' => ord "\e",
269					'\\c?' => ord "\c?",
270					'\\c@' => ord "\c@",
271					'\\cA' => ord "\cA",
272					'\\ca' => ord "\cA",
273					'\\cB' => ord "\cB",
274					'\\cb' => ord "\cB",
275					'\\cC' => ord "\cC",
276					'\\cc' => ord "\cC",
277					'\\cD' => ord "\cD",
278					'\\cd' => ord "\cD",
279					'\\cE' => ord "\cE",
280					'\\ce' => ord "\cE",
281					'\\cF' => ord "\cF",
282					'\\cf' => ord "\cF",
283					'\\cG' => ord "\cG",
284					'\\cg' => ord "\cG",
285					'\\cH' => ord "\cH",
286					'\\ch' => ord "\cH",
287					'\\cI' => ord "\cI",
288					'\\ci' => ord "\cI",
289					'\\cJ' => ord "\cJ",
290					'\\cj' => ord "\cJ",
291					'\\cK' => ord "\cK",
292					'\\ck' => ord "\cK",
293					'\\cL' => ord "\cL",
294					'\\cl' => ord "\cL",
295					'\\cM' => ord "\cM",
296					'\\cm' => ord "\cM",
297					'\\cN' => ord "\cN",
298					'\\cn' => ord "\cN",
299					'\\cO' => ord "\cO",
300					'\\co' => ord "\cO",
301					'\\cP' => ord "\cP",
302					'\\cp' => ord "\cP",
303					'\\cQ' => ord "\cQ",
304					'\\cq' => ord "\cQ",
305					'\\cR' => ord "\cR",
306					'\\cr' => ord "\cR",
307					'\\cS' => ord "\cS",
308					'\\cs' => ord "\cS",
309					'\\cT' => ord "\cT",
310					'\\ct' => ord "\cT",
311					'\\cU' => ord "\cU",
312					'\\cu' => ord "\cU",
313					'\\cV' => ord "\cV",
314					'\\cv' => ord "\cV",
315					'\\cW' => ord "\cW",
316					'\\cw' => ord "\cW",
317					'\\cX' => ord "\cX",
318					'\\cx' => ord "\cX",
319					'\\cY' => ord "\cY",
320					'\\cy' => ord "\cY",
321					'\\cZ' => ord "\cZ",
322					'\\cz' => ord "\cZ",
323					'\\c[' => ord "\c[",
324					'\\c\\\\' => ord "\c\\", # " # Get Vim's head straight.
325					'\\c]' => ord "\c]",
326					'\\c^' => ord "\c^",
327					'\\c_' => ord "\c_",
328					);
329
330					sub ordinal {
331					my ( $self ) = @_;
332					exists $self->{ordinal} and return $self->{ordinal};
333					return ( $self->{ordinal} = $self->_ordinal() );
334					}
335
336	1	8µs			my %octal = map {; "$_" => 1 } ( 0 .. 7 );
337
338					sub _ordinal {
339					my ( $self ) = @_;
340					my $content = $self->content();
341
342					$content =~ m/ \A \\ /smx or return ord $content;
343
344					exists $escapes{$content} and return $escapes{$content};
345
346					my $indicator = substr $content, 1, 1;
347
348					$octal{$indicator} and return oct substr $content, 1;
349
350					if ( $indicator eq 'x' ) {
351					$content =~ m/ \A \\ x \{ ( [[:xdigit:]]+ ) \} \z /smx
352					and return hex $1;
353					$content =~ m/ \A \\ x ( [[:xdigit:]]{0,2} ) \z /smx
354					and return hex $1;
355					return;
356					}
357
358					if ( $indicator eq 'o' ) {
359					$content =~ m/ \A \\ o [{] ( [01234567]+ ) [}] \z /smx
360					and return oct $1;
361					return; # Shouldn't happen, but ...
362					}
363
364					if ( $indicator eq 'N' ) {
365					$content =~ m/ \A \\ N \{ U [+] ( [[:xdigit:]]+ ) \} \z /smx
366					and return hex $1;
367					$content =~ m/ \A \\ N [{] ( .+ ) [}] \z /smx
368					and return (
369					_have_charnames_vianame() ?
370					charnames::vianame( $1 ) :
371					undef
372					);
373					return; # Shouldn't happen, but ...
374					}
375
376					return ord $indicator;
377					}
378
379					}
380
381					{
382	2	400ns			my $have_charnames_vianame;
383
384					sub _have_charnames_vianame {
385					defined $have_charnames_vianame
386					and return $have_charnames_vianame;
387					return (
388					$have_charnames_vianame =
389					charnames->can( 'vianame' ) ? 1 : 0
390					);
391
392					}
393					}
394
395
396	1	1µs			*__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp;
397
398	1	22µs			1;
399
400					__END__

					# spent 4µs within PPIx::Regexp::Token::Literal::CORE:qr which was called 2 times, avg 2µs/call: # once (2µs+0s) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 78 # once (2µs+0s) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 1 of (eval 442)[PPIx/Regexp/Token/Literal.pm:115] sub PPIx::Regexp::Token::Literal::CORE:qr; # opcode
					# spent 900ns within PPIx::Regexp::Token::Literal::CORE:regcomp which was called: # once (900ns+0s) by PPIx::Regexp::Tokenizer::BEGIN@34 at line 78 sub PPIx::Regexp::Token::Literal::CORE:regcomp; # opcode