| Filename | /Users/timbo/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/PPIx/Regexp/Token/Structure.pm |
| Statements | Executed 31 statements in 1.57ms |
| Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 1.60ms | 1.80ms | PPIx::Regexp::Token::Structure::BEGIN@51 |
| 1 | 1 | 1 | 403µs | 655µs | PPIx::Regexp::Token::Structure::BEGIN@54 |
| 1 | 1 | 1 | 19µs | 39µs | PPIx::Regexp::Token::Structure::BEGIN@35 |
| 1 | 1 | 1 | 13µs | 20µs | PPIx::Regexp::Token::Structure::BEGIN@36 |
| 1 | 1 | 1 | 11µs | 96µs | PPIx::Regexp::Token::Structure::BEGIN@38 |
| 1 | 1 | 1 | 11µs | 66µs | PPIx::Regexp::Token::Structure::BEGIN@40 |
| 1 | 1 | 1 | 7µs | 7µs | PPIx::Regexp::Token::Structure::BEGIN@52 |
| 1 | 1 | 1 | 5µs | 5µs | PPIx::Regexp::Token::Structure::BEGIN@50 |
| 1 | 1 | 1 | 5µs | 5µs | PPIx::Regexp::Token::Structure::BEGIN@53 |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Token::Structure::__ANON__[:152] |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Token::Structure::__ANON__[:220] |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Token::Structure::__ANON__[:245] |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Token::Structure::__PPIX_LEXER__finalize |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Token::Structure::__PPIX_TOKENIZER__regexp |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Token::Structure::can_be_quantified |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Token::Structure::is_quantifier |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Token::Structure::perl_version_introduced |
| Line | Statements | Time on line | Calls | Time in subs | Code |
|---|---|---|---|---|---|
| 1 | =head1 NAME | ||||
| 2 | |||||
| 3 | PPIx::Regexp::Token::Structure - Represent structural elements. | ||||
| 4 | |||||
| 5 | =head1 SYNOPSIS | ||||
| 6 | |||||
| 7 | use PPIx::Regexp::Dumper; | ||||
| 8 | PPIx::Regexp::Dumper->new( 'qr{(foo)}smx' ) | ||||
| 9 | ->print(); | ||||
| 10 | |||||
| 11 | =head1 INHERITANCE | ||||
| 12 | |||||
| 13 | C<PPIx::Regexp::Token::Structure> is a | ||||
| 14 | L<PPIx::Regexp::Token|PPIx::Regexp::Token>. | ||||
| 15 | |||||
| 16 | C<PPIx::Regexp::Token::Structure> is the parent of | ||||
| 17 | L<PPIx::Regexp::Token::Delimiter|PPIx::Regexp::Token::Delimiter>. | ||||
| 18 | |||||
| 19 | =head1 DESCRIPTION | ||||
| 20 | |||||
| 21 | This class represents things that define the structure of the regular | ||||
| 22 | expression. This typically means brackets of various sorts, but to | ||||
| 23 | prevent proliferation of token classes the type of the regular | ||||
| 24 | expression is stored here. | ||||
| 25 | |||||
| 26 | =head1 METHODS | ||||
| 27 | |||||
| 28 | This class provides no public methods beyond those provided by its | ||||
| 29 | superclass. | ||||
| 30 | |||||
| 31 | =cut | ||||
| 32 | |||||
| 33 | package PPIx::Regexp::Token::Structure; | ||||
| 34 | |||||
| 35 | 2 | 33µs | 2 | 58µs | # spent 39µs (19+19) within PPIx::Regexp::Token::Structure::BEGIN@35 which was called:<br># once (19µs+19µs) by base::import at line 35<br># spent 39µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@35<br># spent 19µs making 1 call to strict::import |
| 36 | 2 | 33µs | 2 | 27µs | # spent 20µs (13+7) within PPIx::Regexp::Token::Structure::BEGIN@36 which was called:<br># once (13µs+7µs) by base::import at line 36<br># spent 20µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@36<br># spent 7µs making 1 call to warnings::import |
| 37 | |||||
| 38 | 2 | 43µs | 2 | 96µs | # spent 96µs (11+85) within PPIx::Regexp::Token::Structure::BEGIN@38 which was called:<br># once (11µs+85µs) by base::import at line 38<br># spent 96µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@38<br># spent 85µs making 1 call to base::import, recursion: max depth 1, sum of overlapping time 85µs |
| 39 | |||||
| 40 | 1 | 400ns | # spent 66µs (11+55) within PPIx::Regexp::Token::Structure::BEGIN@40 which was called:<br># once (11µs+55µs) by base::import at line 46 | ||
| 41 | COOKIE_CLASS | ||||
| 42 | COOKIE_QUANT | ||||
| 43 | COOKIE_REGEX_SET | ||||
| 44 | MINIMUM_PERL | ||||
| 45 | TOKEN_LITERAL | ||||
| 46 | 1 | 34µs | 2 | 121µs | }; # spent 66µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@40<br># spent 55µs making 1 call to Exporter::import |
| 47 | |||||
| 48 | # Tokens we are responsible for making, under at least some | ||||
| 49 | # circumstances. | ||||
| 50 | 2 | 26µs | 1 | 5µs | # spent 5µs within PPIx::Regexp::Token::Structure::BEGIN@50 which was called:<br># once (5µs+0s) by base::import at line 50<br># spent 5µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@50 |
| 51 | 2 | 161µs | 1 | 1.80ms | # spent 1.80ms (1.60+202µs) within PPIx::Regexp::Token::Structure::BEGIN@51 which was called:<br># once (1.60ms+202µs) by base::import at line 51<br># spent 1.80ms making 1 call to PPIx::Regexp::Token::Structure::BEGIN@51 |
| 52 | 2 | 28µs | 1 | 7µs | # spent 7µs within PPIx::Regexp::Token::Structure::BEGIN@52 which was called:<br># once (7µs+0s) by base::import at line 52<br># spent 7µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@52 |
| 53 | 2 | 26µs | 1 | 5µs | # spent 5µs within PPIx::Regexp::Token::Structure::BEGIN@53 which was called:<br># once (5µs+0s) by base::import at line 53<br># spent 5µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@53 |
| 54 | 2 | 1.14ms | 1 | 655µs | # spent 655µs (403+252) within PPIx::Regexp::Token::Structure::BEGIN@54 which was called:<br># once (403µs+252µs) by base::import at line 54<br># spent 655µs making 1 call to PPIx::Regexp::Token::Structure::BEGIN@54 |
| 55 | |||||
| 56 | 1 | 800ns | our $VERSION = '0.036'; | ||
| 57 | |||||
| 58 | # Return true if the token can be quantified, and false otherwise | ||||
| 59 | |||||
| 60 | 1 | 4µs | my %quant = map { $_ => 1 } ')', ']'; | ||
| 61 | sub can_be_quantified { | ||||
| 62 | my ( $self ) = @_; | ||||
| 63 | ref $self or return; | ||||
| 64 | return $quant{ $self->content() }; | ||||
| 65 | }; | ||||
| 66 | |||||
| 67 | sub is_quantifier { | ||||
| 68 | my ( $self ) = @_; | ||||
| 69 | ref $self or return; | ||||
| 70 | return $self->{is_quantifier}; | ||||
| 71 | } | ||||
| 72 | |||||
| 73 | { | ||||
| 74 | |||||
| 75 | # Note that the implementation equivocates on the ::Token::Structure | ||||
| 76 | # class, using it both for the initial token that determines the | ||||
| 77 | # type of the regex and things like parentheses internal to the | ||||
| 78 | # regex. Rather than sort out this equivocation, I have relied on | ||||
| 79 | # the currently-true assumption that 'qr' will not satisfy the | ||||
| 80 | # ::Token::Structure recognition logic, and the only way this class | ||||
| 81 | # can acquire this content is by the brute-force approach used to | ||||
| 82 | # generate the initial token object. | ||||
| 83 | |||||
| 84 | 2 | 3µs | my %perl_version_introduced = ( | ||
| 85 | qr => '5.005', | ||||
| 86 | '(?[' => '5.017008', | ||||
| 87 | ); | ||||
| 88 | |||||
| 89 | sub perl_version_introduced { | ||||
| 90 | my ( $self ) = @_; | ||||
| 91 | return $perl_version_introduced{ $self->content() } || MINIMUM_PERL; | ||||
| 92 | } | ||||
| 93 | } | ||||
| 94 | |||||
| 95 | { | ||||
| 96 | |||||
| 97 | 2 | 5µs | my %delim = map { $_ => 1 } qw/ ( ) { } [ ] /; | ||
| 98 | |||||
| 99 | # Regular expressions to match various parenthesized tokens, and the | ||||
| 100 | # classes to make them into. | ||||
| 101 | |||||
| 102 | 5 | 15µs | 5 | 106µs | my @paren_token = map { # spent 53µs making 1 call to PPIx::Regexp::Token::Recursion::__PPIX_TOKEN__recognize<br># spent 14µs making 1 call to PPIx::Regexp::Token::Backtrack::__PPIX_TOKEN__recognize<br># spent 13µs making 1 call to PPIx::Regexp::Token::Comment::__PPIX_TOKEN__recognize<br># spent 13µs making 1 call to PPIx::Regexp::Token::Backreference::__PPIX_TOKEN__recognize<br># spent 13µs making 1 call to PPIx::Regexp::Token::Modifier::__PPIX_TOKEN__recognize |
| 103 | 1 | 5µs | [ $_ => $_->__PPIX_TOKEN__recognize() ] | ||
| 104 | } | ||||
| 105 | 'PPIx::Regexp::Token::Comment', | ||||
| 106 | 'PPIx::Regexp::Token::Modifier', | ||||
| 107 | 'PPIx::Regexp::Token::Backreference', | ||||
| 108 | 'PPIx::Regexp::Token::Backtrack', | ||||
| 109 | 'PPIx::Regexp::Token::Recursion', | ||||
| 110 | ; | ||||
| 111 | |||||
| 112 | sub __PPIX_TOKENIZER__regexp { | ||||
| 113 | my ( $class, $tokenizer, $character ) = @_; | ||||
| 114 | |||||
| 115 | # We are not interested in anything but delimiters. | ||||
| 116 | $delim{$character} or return; | ||||
| 117 | |||||
| 118 | # Inside a character class, all the delimiters are normal characters | ||||
| 119 | # except for the close square bracket. | ||||
| 120 | if ( $tokenizer->cookie( COOKIE_CLASS ) ) { | ||||
| 121 | $character eq ']' | ||||
| 122 | or return $tokenizer->make_token( 1, TOKEN_LITERAL ); | ||||
| 123 | } | ||||
| 124 | |||||
| 125 | # Open parentheses have various interesting possibilities ... | ||||
| 126 | if ( $character eq '(' ) { | ||||
| 127 | |||||
| 128 | # Sometimes the whole bunch of parenthesized characters seems | ||||
| 129 | # naturally to be a token. | ||||
| 130 | foreach ( @paren_token ) { | ||||
| 131 | my ( $class, @recognize ) = @{ $_ }; | ||||
| 132 | foreach ( @recognize ) { | ||||
| 133 | my ( $regexp, $arg ) = @{ $_ }; | ||||
| 134 | my $accept = $tokenizer->find_regexp( $regexp ) or next; | ||||
| 135 | return $tokenizer->make_token( $accept, $class, $arg ); | ||||
| 136 | } | ||||
| 137 | } | ||||
| 138 | |||||
| 139 | # Modifier changes are local to this parenthesis group | ||||
| 140 | $tokenizer->modifier_duplicate(); | ||||
| 141 | |||||
| 142 | # The regex-set functionality introduced with 5.17.8 is most | ||||
| 143 | # conveniently handled by treating the initial '(?[' and | ||||
| 144 | # final '])' as ::Structure tokens. Fortunately for us, | ||||
| 145 | # perl5178delta documents that these may not have interior | ||||
| 146 | # spaces. | ||||
| 147 | |||||
| 148 | if ( my $accept = $tokenizer->find_regexp( | ||||
| 149 | qr{ \A [(] [?] [[] }smx # ] ) - help for vim | ||||
| 150 | ) | ||||
| 151 | ) { | ||||
| 152 | $tokenizer->cookie( COOKIE_REGEX_SET, sub { return 1 } ); | ||||
| 153 | $tokenizer->modifier_modify( x => 1 ); # Implicitly /x | ||||
| 154 | return $accept; | ||||
| 155 | } | ||||
| 156 | |||||
| 157 | # We expect certain tokens only after a left paren. | ||||
| 158 | $tokenizer->expect( | ||||
| 159 | 'PPIx::Regexp::Token::GroupType::Modifier', | ||||
| 160 | 'PPIx::Regexp::Token::GroupType::NamedCapture', | ||||
| 161 | 'PPIx::Regexp::Token::GroupType::Assertion', | ||||
| 162 | 'PPIx::Regexp::Token::GroupType::Code', | ||||
| 163 | 'PPIx::Regexp::Token::GroupType::BranchReset', | ||||
| 164 | 'PPIx::Regexp::Token::GroupType::Subexpression', | ||||
| 165 | 'PPIx::Regexp::Token::GroupType::Switch', | ||||
| 166 | ); | ||||
| 167 | |||||
| 168 | # Accept the parenthesis. | ||||
| 169 | return 1; | ||||
| 170 | } | ||||
| 171 | |||||
| 172 | # Close parentheses end modifier localization | ||||
| 173 | if ( $character eq ')' ) { | ||||
| 174 | $tokenizer->modifier_pop(); | ||||
| 175 | return 1; | ||||
| 176 | } | ||||
| 177 | |||||
| 178 | # Open curlys are complicated because they may or may not represent | ||||
| 179 | # the beginning of a quantifier, depending on what comes before the | ||||
| 180 | # close curly. So we set a cookie to monitor the token stream for | ||||
| 181 | # interlopers. If all goes well, the right curly will find the | ||||
| 182 | # cookie and know it is supposed to be a quantifier. | ||||
| 183 | if ( $character eq '{' ) { | ||||
| 184 | |||||
| 185 | # If the prior token can not be quantified, all this is | ||||
| 186 | # unnecessary. | ||||
| 187 | $tokenizer->prior( 'can_be_quantified' ) | ||||
| 188 | or return 1; | ||||
| 189 | |||||
| 190 | # We make our token now, before setting the cookie. Otherwise | ||||
| 191 | # the cookie has to deal with this token. | ||||
| 192 | my $token = $tokenizer->make_token( 1 ); | ||||
| 193 | |||||
| 194 | # A cookie for the next '}'. | ||||
| 195 | my $commas = 0; | ||||
| 196 | $tokenizer->cookie( COOKIE_QUANT, sub { | ||||
| 197 | my ( $tokenizer, $token ) = @_; | ||||
| 198 | $token or return 1; | ||||
| 199 | |||||
| 200 | # Of literals, we accept exactly one comma provided it | ||||
| 201 | # is not immediately after a '{'. We also accept | ||||
| 202 | # anything that matches '\d'; | ||||
| 203 | if ( $token->isa( TOKEN_LITERAL ) ) { | ||||
| 204 | my $character = $token->content(); | ||||
| 205 | if ( $character eq ',' ) { | ||||
| 206 | $commas++ and return; | ||||
| 207 | return $tokenizer->prior( 'content' ) ne '{'; | ||||
| 208 | } | ||||
| 209 | return $character =~ m/ \A \d \z /smx; | ||||
| 210 | } | ||||
| 211 | |||||
| 212 | # Since we do not know what is in an interpolation, we | ||||
| 213 | # trustingly accept it. | ||||
| 214 | if ( $token->isa( 'PPIx::Regexp::Token::Interpolation' ) | ||||
| 215 | ) { | ||||
| 216 | return 1; | ||||
| 217 | } | ||||
| 218 | |||||
| 219 | return; | ||||
| 220 | }, | ||||
| 221 | ); | ||||
| 222 | |||||
| 223 | return $token; | ||||
| 224 | } | ||||
| 225 | |||||
| 226 | # The close curly bracket is a little complicated because if the | ||||
| 227 | # cookie posted by the left curly bracket is still around, we are a | ||||
| 228 | # quantifier, otherwise not. | ||||
| 229 | if ( $character eq '}' ) { | ||||
| 230 | $tokenizer->cookie( COOKIE_QUANT, undef ) | ||||
| 231 | or return 1; | ||||
| 232 | $tokenizer->prior( 'class' )->isa( __PACKAGE__ ) | ||||
| 233 | and return 1; | ||||
| 234 | my $token = $tokenizer->make_token( 1 ); | ||||
| 235 | $token->{is_quantifier} = 1; | ||||
| 236 | return $token; | ||||
| 237 | } | ||||
| 238 | |||||
| 239 | # The parse rules are different inside a character class, so we set | ||||
| 240 | # another cookie. Sigh. If your tool is a hammer ... | ||||
| 241 | if ( $character eq '[' ) { | ||||
| 242 | |||||
| 243 | # Set our cookie. Since it always returns 1, it does not matter | ||||
| 244 | # where in the following mess we set it. | ||||
| 245 | $tokenizer->cookie( COOKIE_CLASS, sub { return 1 } ); | ||||
| 246 | |||||
| 247 | # Make our token now, since the easiest place to deal with the | ||||
| 248 | # beginning-of-character-class strangeness seems to be right | ||||
| 249 | # here. | ||||
| 250 | my @tokens = $tokenizer->make_token( 1 ); | ||||
| 251 | |||||
| 252 | # Get the next character, returning tokens if there is none. | ||||
| 253 | defined ( $character = $tokenizer->peek() ) | ||||
| 254 | or return @tokens; | ||||
| 255 | |||||
| 256 | # If we have a caret, it is a negation operator. Make its token | ||||
| 257 | # and fetch the next character, returning if none. | ||||
| 258 | if ( $character eq '^' ) { | ||||
| 259 | push @tokens, $tokenizer->make_token( | ||||
| 260 | 1, 'PPIx::Regexp::Token::Operator' ); | ||||
| 261 | defined ( $character = $tokenizer->peek() ) | ||||
| 262 | or return @tokens; | ||||
| 263 | } | ||||
| 264 | |||||
| 265 | # If we have a close square at this point, it is not the end of | ||||
| 266 | # the class, but just a literal. Make its token. | ||||
| 267 | $character eq ']' | ||||
| 268 | and push @tokens, $tokenizer->make_token( 1, TOKEN_LITERAL ); | ||||
| 269 | |||||
| 270 | # Return all tokens made. | ||||
| 271 | return @tokens; | ||||
| 272 | } | ||||
| 273 | # per perlop, the metas inside a [] are -]\^$. | ||||
| 274 | # per perlop, the metas outside a [] are {}[]()^$.|*+?\ | ||||
| 275 | # The difference is that {}[().|*+? are not metas in [], but - is. | ||||
| 276 | |||||
| 277 | # Close bracket is complicated by the addition of regex sets. | ||||
| 278 | # And more complicated by the fact that you can have an | ||||
| 279 | # old-style character class inside a regex set. Fortunately they | ||||
| 280 | # have not (yet!) permitted nested regex sets. | ||||
| 281 | if ( $character eq ']' ) { | ||||
| 282 | |||||
| 283 | # If we find '])' and COOKIE_REGEX_SET is present, we have a | ||||
| 284 | # regex set. We need to delete the cookie and accept both | ||||
| 285 | # characters. | ||||
| 286 | if ( ( my $accept = $tokenizer->find_regexp( | ||||
| 287 | # help vim - ( [ | ||||
| 288 | qr{ \A []] [)] }smx | ||||
| 289 | ) ) | ||||
| 290 | && $tokenizer->cookie( COOKIE_REGEX_SET ) | ||||
| 291 | |||||
| 292 | ) { | ||||
| 293 | $tokenizer->cookie( COOKIE_REGEX_SET, undef ); | ||||
| 294 | return $accept; | ||||
| 295 | } | ||||
| 296 | |||||
| 297 | # Otherwise we assume we're in a bracketed character class, | ||||
| 298 | # delete the cookie, and accept the close bracket. | ||||
| 299 | $tokenizer->cookie( COOKIE_CLASS, undef ); | ||||
| 300 | return 1; | ||||
| 301 | } | ||||
| 302 | |||||
| 303 | return 1; | ||||
| 304 | } | ||||
| 305 | |||||
| 306 | } | ||||
| 307 | |||||
| 308 | # Called by the lexer once it has done its worst to all the tokens. | ||||
| 309 | # Called as a method with no arguments. The return is the number of | ||||
| 310 | # parse failures discovered when finalizing. | ||||
| 311 | sub __PPIX_LEXER__finalize { | ||||
| 312 | my ( $self ) = @_; | ||||
| 313 | delete $self->{is_quantifier}; | ||||
| 314 | return 0; | ||||
| 315 | } | ||||
| 316 | |||||
| 317 | 1 | 10µs | 1; | ||
| 318 | |||||
| 319 | __END__ |