| Filename | /Users/timbo/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/PPIx/Regexp/Tokenizer.pm |
| Statements | Executed 77 statements in 5.34ms |
| Calls | P | F | Exclusive Time | Inclusive Time | Subroutine |
|---|---|---|---|---|---|
| 1 | 1 | 1 | 1.69ms | 1.85ms | PPIx::Regexp::Tokenizer::BEGIN@34 |
| 1 | 1 | 1 | 1.20ms | 1.39ms | PPIx::Regexp::Tokenizer::BEGIN@33 |
| 1 | 1 | 1 | 618µs | 747µs | PPIx::Regexp::Tokenizer::BEGIN@23 |
| 1 | 1 | 1 | 600µs | 925µs | PPIx::Regexp::Tokenizer::BEGIN@30 |
| 1 | 1 | 1 | 598µs | 728µs | PPIx::Regexp::Tokenizer::BEGIN@19 |
| 1 | 1 | 1 | 595µs | 1.55ms | PPIx::Regexp::Tokenizer::BEGIN@15 |
| 1 | 1 | 1 | 471µs | 690µs | PPIx::Regexp::Tokenizer::BEGIN@29 |
| 1 | 1 | 1 | 440µs | 589µs | PPIx::Regexp::Tokenizer::BEGIN@20 |
| 1 | 1 | 1 | 377µs | 543µs | PPIx::Regexp::Tokenizer::BEGIN@28 |
| 1 | 1 | 1 | 362µs | 519µs | PPIx::Regexp::Tokenizer::BEGIN@22 |
| 1 | 1 | 1 | 358µs | 524µs | PPIx::Regexp::Tokenizer::BEGIN@25 |
| 1 | 1 | 1 | 327µs | 504µs | PPIx::Regexp::Tokenizer::BEGIN@36 |
| 1 | 1 | 1 | 317µs | 451µs | PPIx::Regexp::Tokenizer::BEGIN@14 |
| 1 | 1 | 1 | 301µs | 722µs | PPIx::Regexp::Tokenizer::BEGIN@17 |
| 1 | 1 | 1 | 285µs | 1.16ms | PPIx::Regexp::Tokenizer::BEGIN@26 |
| 1 | 1 | 1 | 282µs | 414µs | PPIx::Regexp::Tokenizer::BEGIN@32 |
| 1 | 1 | 1 | 266µs | 375µs | PPIx::Regexp::Tokenizer::BEGIN@37 |
| 1 | 1 | 1 | 248µs | 386µs | PPIx::Regexp::Tokenizer::BEGIN@40 |
| 1 | 1 | 1 | 246µs | 376µs | PPIx::Regexp::Tokenizer::BEGIN@41 |
| 1 | 1 | 1 | 237µs | 398µs | PPIx::Regexp::Tokenizer::BEGIN@18 |
| 1 | 1 | 1 | 234µs | 334µs | PPIx::Regexp::Tokenizer::BEGIN@16 |
| 1 | 1 | 1 | 217µs | 354µs | PPIx::Regexp::Tokenizer::BEGIN@31 |
| 1 | 1 | 1 | 206µs | 4.54ms | PPIx::Regexp::Tokenizer::BEGIN@24 |
| 1 | 1 | 1 | 206µs | 338µs | PPIx::Regexp::Tokenizer::BEGIN@27 |
| 1 | 1 | 1 | 201µs | 297µs | PPIx::Regexp::Tokenizer::BEGIN@21 |
| 1 | 1 | 1 | 12µs | 23µs | PPIx::Regexp::Tokenizer::BEGIN@3 |
| 1 | 1 | 1 | 8µs | 28µs | PPIx::Regexp::Tokenizer::BEGIN@42 |
| 1 | 1 | 1 | 8µs | 32µs | PPIx::Regexp::Tokenizer::BEGIN@9 |
| 1 | 1 | 1 | 8µs | 30µs | PPIx::Regexp::Tokenizer::BEGIN@8 |
| 1 | 1 | 1 | 7µs | 27µs | PPIx::Regexp::Tokenizer::BEGIN@43 |
| 1 | 1 | 1 | 7µs | 12µs | PPIx::Regexp::Tokenizer::BEGIN@4 |
| 1 | 1 | 1 | 7µs | 53µs | PPIx::Regexp::Tokenizer::BEGIN@6 |
| 1 | 1 | 1 | 5µs | 5µs | PPIx::Regexp::Tokenizer::BEGIN@35 |
| 1 | 1 | 1 | 4µs | 4µs | PPIx::Regexp::Tokenizer::BEGIN@38 |
| 1 | 1 | 1 | 4µs | 4µs | PPIx::Regexp::Tokenizer::BEGIN@39 |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::__PPIX_TOKENIZER__finish |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::__PPIX_TOKENIZER__init |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::__PPIX_TOKENIZER__regexp |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::__PPIX_TOKEN_FALLBACK__regexp |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::__PPIX_TOKEN_FALLBACK__repl |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::__effective_modifiers |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::_known_tokenizer_check |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::_known_tokenizers |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::_remainder |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::capture |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::content |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::cookie |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::default_modifiers |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::encoding |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::errstr |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::expect |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::failures |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::find_matching_delimiter |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::find_regexp |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::get_start_delimiter |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::get_token |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::interpolates |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::make_token |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::match |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::modifier |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::modifier_duplicate |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::modifier_modify |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::modifier_pop |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::new |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::next_token |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::peek |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::ppi_document |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::prior |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::significant |
| 0 | 0 | 0 | 0s | 0s | PPIx::Regexp::Tokenizer::tokens |
| Line | Statements | Time on line | Calls | Time in subs | Code |
|---|---|---|---|---|---|
| 1 | package PPIx::Regexp::Tokenizer; | ||||
| 2 | |||||
| 3 | 2 | 19µs | 2 | 35µs | # spent 23µs (12+12) within PPIx::Regexp::Tokenizer::BEGIN@3 which was called:
# once (12µs+12µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 3 # spent 23µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@3
# spent 12µs making 1 call to strict::import |
| 4 | 2 | 20µs | 2 | 16µs | # spent 12µs (7+4) within PPIx::Regexp::Tokenizer::BEGIN@4 which was called:
# once (7µs+4µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 4 # spent 12µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@4
# spent 4µs making 1 call to warnings::import |
| 5 | |||||
| 6 | 2 | 26µs | 2 | 99µs | # spent 53µs (7+46) within PPIx::Regexp::Tokenizer::BEGIN@6 which was called:
# once (7µs+46µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 6 # spent 53µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@6
# spent 46µs making 1 call to base::import |
| 7 | |||||
| 8 | 2 | 27µs | 2 | 52µs | # spent 30µs (8+22) within PPIx::Regexp::Tokenizer::BEGIN@8 which was called:
# once (8µs+22µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 8 # spent 30µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@8
# spent 22µs making 1 call to Exporter::import |
| 9 | 1 | 300ns | # spent 32µs (8+24) within PPIx::Regexp::Tokenizer::BEGIN@9 which was called:
# once (8µs+24µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 13 | ||
| 10 | MINIMUM_PERL | ||||
| 11 | TOKEN_LITERAL | ||||
| 12 | TOKEN_UNKNOWN | ||||
| 13 | 1 | 19µs | 2 | 56µs | }; # spent 32µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@9
# spent 24µs making 1 call to Exporter::import |
| 14 | 2 | 91µs | 1 | 451µs | # spent 451µs (317+134) within PPIx::Regexp::Tokenizer::BEGIN@14 which was called:
# once (317µs+134µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 14 # spent 451µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@14 |
| 15 | 2 | 90µs | 1 | 1.55ms | # spent 1.55ms (595µs+955µs) within PPIx::Regexp::Tokenizer::BEGIN@15 which was called:
# once (595µs+955µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 15 # spent 1.55ms making 1 call to PPIx::Regexp::Tokenizer::BEGIN@15 |
| 16 | 2 | 135µs | 1 | 334µs | # spent 334µs (234+100) within PPIx::Regexp::Tokenizer::BEGIN@16 which was called:
# once (234µs+100µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 16 # spent 334µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@16 |
| 17 | 2 | 102µs | 1 | 722µs | # spent 722µs (301+421) within PPIx::Regexp::Tokenizer::BEGIN@17 which was called:
# once (301µs+421µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 17 # spent 722µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@17 |
| 18 | 2 | 95µs | 1 | 398µs | # spent 398µs (237+161) within PPIx::Regexp::Tokenizer::BEGIN@18 which was called:
# once (237µs+161µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 18 # spent 398µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@18 |
| 19 | 2 | 99µs | 1 | 728µs | # spent 728µs (598+131) within PPIx::Regexp::Tokenizer::BEGIN@19 which was called:
# once (598µs+131µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 19 # spent 728µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@19 |
| 20 | 2 | 92µs | 1 | 589µs | # spent 589µs (440+150) within PPIx::Regexp::Tokenizer::BEGIN@20 which was called:
# once (440µs+150µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 20 # spent 589µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@20 |
| 21 | 2 | 89µs | 1 | 297µs | # spent 297µs (201+97) within PPIx::Regexp::Tokenizer::BEGIN@21 which was called:
# once (201µs+97µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 21 # spent 297µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@21 |
| 22 | 2 | 85µs | 1 | 519µs | # spent 519µs (362+157) within PPIx::Regexp::Tokenizer::BEGIN@22 which was called:
# once (362µs+157µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 22 # spent 519µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@22 |
| 23 | 2 | 113µs | 1 | 747µs | # spent 747µs (618+130) within PPIx::Regexp::Tokenizer::BEGIN@23 which was called:
# once (618µs+130µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 23 # spent 747µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@23 |
| 24 | 2 | 144µs | 1 | 4.54ms | # spent 4.54ms (206µs+4.34) within PPIx::Regexp::Tokenizer::BEGIN@24 which was called:
# once (206µs+4.34ms) by PPIx::Regexp::Lexer::BEGIN@61 at line 24 # spent 4.54ms making 1 call to PPIx::Regexp::Tokenizer::BEGIN@24 |
| 25 | 2 | 123µs | 1 | 524µs | # spent 524µs (358+166) within PPIx::Regexp::Tokenizer::BEGIN@25 which was called:
# once (358µs+166µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 25 # spent 524µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@25 |
| 26 | 2 | 128µs | 1 | 1.16ms | # spent 1.16ms (285µs+873µs) within PPIx::Regexp::Tokenizer::BEGIN@26 which was called:
# once (285µs+873µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 26 # spent 1.16ms making 1 call to PPIx::Regexp::Tokenizer::BEGIN@26 |
| 27 | 2 | 120µs | 1 | 338µs | # spent 338µs (206+133) within PPIx::Regexp::Tokenizer::BEGIN@27 which was called:
# once (206µs+133µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 27 # spent 338µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@27 |
| 28 | 2 | 124µs | 1 | 543µs | # spent 543µs (377+166) within PPIx::Regexp::Tokenizer::BEGIN@28 which was called:
# once (377µs+166µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 28 # spent 543µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@28 |
| 29 | 2 | 128µs | 1 | 690µs | # spent 690µs (471+220) within PPIx::Regexp::Tokenizer::BEGIN@29 which was called:
# once (471µs+220µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 29 # spent 690µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@29 |
| 30 | 2 | 123µs | 1 | 925µs | # spent 925µs (600+325) within PPIx::Regexp::Tokenizer::BEGIN@30 which was called:
# once (600µs+325µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 30 # spent 925µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@30 |
| 31 | 2 | 123µs | 1 | 354µs | # spent 354µs (217+136) within PPIx::Regexp::Tokenizer::BEGIN@31 which was called:
# once (217µs+136µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 31 # spent 354µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@31 |
| 32 | 2 | 137µs | 1 | 414µs | # spent 414µs (282+133) within PPIx::Regexp::Tokenizer::BEGIN@32 which was called:
# once (282µs+133µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 32 # spent 414µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@32 |
| 33 | 2 | 124µs | 1 | 1.39ms | # spent 1.39ms (1.20+185µs) within PPIx::Regexp::Tokenizer::BEGIN@33 which was called:
# once (1.20ms+185µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 33 # spent 1.39ms making 1 call to PPIx::Regexp::Tokenizer::BEGIN@33 |
| 34 | 2 | 109µs | 1 | 1.85ms | # spent 1.85ms (1.69+163µs) within PPIx::Regexp::Tokenizer::BEGIN@34 which was called:
# once (1.69ms+163µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 34 # spent 1.85ms making 1 call to PPIx::Regexp::Tokenizer::BEGIN@34 |
| 35 | 2 | 21µs | 1 | 5µs | # spent 5µs within PPIx::Regexp::Tokenizer::BEGIN@35 which was called:
# once (5µs+0s) by PPIx::Regexp::Lexer::BEGIN@61 at line 35 # spent 5µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@35 |
| 36 | 2 | 103µs | 1 | 504µs | # spent 504µs (327+177) within PPIx::Regexp::Tokenizer::BEGIN@36 which was called:
# once (327µs+177µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 36 # spent 504µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@36 |
| 37 | 2 | 97µs | 1 | 375µs | # spent 375µs (266+109) within PPIx::Regexp::Tokenizer::BEGIN@37 which was called:
# once (266µs+109µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 37 # spent 375µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@37 |
| 38 | 2 | 16µs | 1 | 4µs | # spent 4µs within PPIx::Regexp::Tokenizer::BEGIN@38 which was called:
# once (4µs+0s) by PPIx::Regexp::Lexer::BEGIN@61 at line 38 # spent 4µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@38 |
| 39 | 2 | 17µs | 1 | 4µs | # spent 4µs within PPIx::Regexp::Tokenizer::BEGIN@39 which was called:
# once (4µs+0s) by PPIx::Regexp::Lexer::BEGIN@61 at line 39 # spent 4µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@39 |
| 40 | 2 | 90µs | 1 | 386µs | # spent 386µs (248+139) within PPIx::Regexp::Tokenizer::BEGIN@40 which was called:
# once (248µs+139µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 40 # spent 386µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@40 |
| 41 | 2 | 89µs | 1 | 376µs | # spent 376µs (246+130) within PPIx::Regexp::Tokenizer::BEGIN@41 which was called:
# once (246µs+130µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 41 # spent 376µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@41 |
| 42 | 2 | 21µs | 2 | 49µs | # spent 28µs (8+21) within PPIx::Regexp::Tokenizer::BEGIN@42 which was called:
# once (8µs+21µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 42 # spent 28µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@42
# spent 21µs making 1 call to Exporter::import |
| 43 | 2 | 2.40ms | 2 | 47µs | # spent 27µs (7+20) within PPIx::Regexp::Tokenizer::BEGIN@43 which was called:
# once (7µs+20µs) by PPIx::Regexp::Lexer::BEGIN@61 at line 43 # spent 27µs making 1 call to PPIx::Regexp::Tokenizer::BEGIN@43
# spent 20µs making 1 call to Exporter::import |
| 44 | |||||
| 45 | 1 | 700ns | our $VERSION = '0.036'; | ||
| 46 | |||||
| 47 | { | ||||
| 48 | # Names of classes containing tokenization machinery. There are few | ||||
| 49 | # known ordering requirements, since each class recognizes its own, | ||||
| 50 | # and I have tried to prevent overlap. Absent such constraints, the | ||||
| 51 | # order is in perceived frequency of acceptance, to keep the search | ||||
| 52 | # as short as possible. If I were conscientious I would gather | ||||
| 53 | # statistics on this. | ||||
| 54 | 2 | 3µs | my @classes = ( # TODO make readonly when acceptable way appears | ||
| 55 | 'PPIx::Regexp::Token::Literal', | ||||
| 56 | 'PPIx::Regexp::Token::Interpolation', | ||||
| 57 | 'PPIx::Regexp::Token::Control', # Note 1 | ||||
| 58 | 'PPIx::Regexp::Token::CharClass::Simple', # Note 2 | ||||
| 59 | 'PPIx::Regexp::Token::Quantifier', | ||||
| 60 | 'PPIx::Regexp::Token::Greediness', | ||||
| 61 | 'PPIx::Regexp::Token::CharClass::POSIX', # Note 3 | ||||
| 62 | 'PPIx::Regexp::Token::Structure', | ||||
| 63 | 'PPIx::Regexp::Token::Assertion', | ||||
| 64 | 'PPIx::Regexp::Token::Backreference', | ||||
| 65 | 'PPIx::Regexp::Token::Operator', # Note 4 | ||||
| 66 | ); | ||||
| 67 | |||||
| 68 | # Note 1: If we are in quote mode ( \Q ... \E ), Control makes a | ||||
| 69 | # literal out of anything it sees other than \E. So it | ||||
| 70 | # needs to come before almost all other tokenizers. Not | ||||
| 71 | # Literal, which already makes literals, and not | ||||
| 72 | # Interpolation, which is legal in quote mode, but | ||||
| 73 | # everything else. | ||||
| 74 | |||||
| 75 | # Note 2: CharClass::Simple must come after Literal, because it | ||||
| 76 | # relies on Literal to recognize a Unicode named character | ||||
| 77 | # ( \N{something} ), so any \N that comes through to it | ||||
| 78 | # must be the \N simple character class (which represents | ||||
| 79 | # anything but a newline, and was introduced in Perl | ||||
| 80 | # 5.11.0. | ||||
| 81 | |||||
| 82 | # Note 3: CharClass::POSIX has to come before Structure, since both | ||||
| 83 | # look for square brackets, and CharClass::POSIX is the | ||||
| 84 | # more particular. | ||||
| 85 | |||||
| 86 | # Note 4: Operator relies on Literal making the characters literal | ||||
| 87 | # if they appear in a context where they can not be | ||||
| 88 | # operators, and Control making them literals if quoting, | ||||
| 89 | # so it must come after both. | ||||
| 90 | |||||
| 91 | sub _known_tokenizers { | ||||
| 92 | my ( $self ) = @_; | ||||
| 93 | |||||
| 94 | my $mode = $self->{mode}; | ||||
| 95 | |||||
| 96 | my @expect; | ||||
| 97 | if ( $self->{expect_next} ) { | ||||
| 98 | $self->{expect} = $self->{expect_next}; | ||||
| 99 | $self->{expect_next} = undef; | ||||
| 100 | } | ||||
| 101 | if ( $self->{expect} ) { | ||||
| 102 | @expect = $self->_known_tokenizer_check( | ||||
| 103 | @{ $self->{expect} } ); | ||||
| 104 | } | ||||
| 105 | |||||
| 106 | exists $self->{known}{$mode} and return ( | ||||
| 107 | @expect, @{ $self->{known}{$mode} } ); | ||||
| 108 | |||||
| 109 | my @found = $self->_known_tokenizer_check( @classes ); | ||||
| 110 | |||||
| 111 | $self->{known}{$mode} = \@found; | ||||
| 112 | return (@expect, @found); | ||||
| 113 | } | ||||
| 114 | |||||
| 115 | sub _known_tokenizer_check { | ||||
| 116 | my ( $self, @args ) = @_; | ||||
| 117 | |||||
| 118 | my $mode = $self->{mode}; | ||||
| 119 | |||||
| 120 | my $handler = '__PPIX_TOKENIZER__' . $mode; | ||||
| 121 | my @found; | ||||
| 122 | |||||
| 123 | foreach my $class ( @args ) { | ||||
| 124 | |||||
| 125 | $class->can( $handler ) or next; | ||||
| 126 | push @found, $class; | ||||
| 127 | |||||
| 128 | } | ||||
| 129 | |||||
| 130 | return @found; | ||||
| 131 | } | ||||
| 132 | |||||
| 133 | } | ||||
| 134 | |||||
| 135 | { | ||||
| 136 | 2 | 100ns | my $errstr; | ||
| 137 | |||||
| 138 | sub new { | ||||
| 139 | my ( $class, $re, %args ) = @_; | ||||
| 140 | ref $class and $class = ref $class; | ||||
| 141 | |||||
| 142 | $errstr = undef; | ||||
| 143 | |||||
| 144 | exists $args{default_modifiers} | ||||
| 145 | and 'ARRAY' ne ref $args{default_modifiers} | ||||
| 146 | and do { | ||||
| 147 | $errstr = 'default_modifiers must be an array reference'; | ||||
| 148 | return; | ||||
| 149 | }; | ||||
| 150 | |||||
| 151 | my $self = { | ||||
| 152 | capture => undef, # Captures from find_regexp. | ||||
| 153 | content => undef, # The string we are tokenizing. | ||||
| 154 | cookie => {}, # Cookies | ||||
| 155 | cursor_curr => 0, # The current position in the string. | ||||
| 156 | cursor_limit => undef, # The end of the portion of the | ||||
| 157 | # string being tokenized. | ||||
| 158 | cursor_orig => undef, # Position of cursor when tokenizer | ||||
| 159 | # called. Used by get_token to prevent | ||||
| 160 | # recursion. | ||||
| 161 | cursor_modifiers => undef, # Position of modifiers. | ||||
| 162 | default_modifiers => $args{default_modifiers} || [], | ||||
| 163 | delimiter_finish => undef, # Finishing delimiter of regexp. | ||||
| 164 | delimiter_re => undef, # Recognize finishing delimiter. | ||||
| 165 | delimiter_start => undef, # Starting delimiter of regexp. | ||||
| 166 | encoding => $args{encoding}, # Character encoding. | ||||
| 167 | expect => undef, # Extra classes to expect. | ||||
| 168 | expect_next => undef, # Extra classes as of next parse cycle | ||||
| 169 | failures => 0, # Number of parse failures. | ||||
| 170 | find => undef, # String for find_regexp | ||||
| 171 | known => {}, # Known tokenizers, by mode. | ||||
| 172 | match => undef, # Match from find_regexp. | ||||
| 173 | mode => 'init', # Initialize | ||||
| 174 | modifiers => [{}], # Modifier hash. | ||||
| 175 | pending => [], # Tokens made but not returned. | ||||
| 176 | prior => TOKEN_UNKNOWN, # Prior significant token. | ||||
| 177 | source => $re, # The object we were initialized with. | ||||
| 178 | trace => __PACKAGE__->_defined_or( | ||||
| 179 | $args{trace}, $ENV{PPIX_REGEXP_TOKENIZER_TRACE}, 0 ), | ||||
| 180 | }; | ||||
| 181 | |||||
| 182 | if ( __instance( $re, 'PPI::Element' ) ) { | ||||
| 183 | $self->{content} = $re->content(); | ||||
| 184 | } elsif ( ref $re ) { | ||||
| 185 | $errstr = ref( $re ) . ' not supported'; | ||||
| 186 | return; | ||||
| 187 | } else { | ||||
| 188 | $self->{content} = $re; | ||||
| 189 | } | ||||
| 190 | |||||
| 191 | bless $self, $class; | ||||
| 192 | |||||
| 193 | $self->{content} = $self->decode( $self->{content} ); | ||||
| 194 | |||||
| 195 | if ( $self->{content} =~ m/ \s+ \z /smx ) { | ||||
| 196 | $self->{cursor_limit} = $-[0]; | ||||
| 197 | } else { | ||||
| 198 | $self->{cursor_limit} = length $self->{content}; | ||||
| 199 | } | ||||
| 200 | |||||
| 201 | $self->{trace} | ||||
| 202 | and warn "\ntokenizing '$self->{content}'\n"; | ||||
| 203 | |||||
| 204 | return $self; | ||||
| 205 | } | ||||
| 206 | |||||
| 207 | sub errstr { | ||||
| 208 | return $errstr; | ||||
| 209 | } | ||||
| 210 | |||||
| 211 | } | ||||
| 212 | |||||
| 213 | sub capture { | ||||
| 214 | my ( $self ) = @_; | ||||
| 215 | $self->{capture} or return; | ||||
| 216 | defined wantarray or return; | ||||
| 217 | return wantarray ? @{ $self->{capture} } : $self->{capture}; | ||||
| 218 | } | ||||
| 219 | |||||
| 220 | sub content { | ||||
| 221 | my ( $self ) = @_; | ||||
| 222 | return $self->{content}; | ||||
| 223 | } | ||||
| 224 | |||||
| 225 | sub cookie { | ||||
| 226 | my ( $self, $name, @args ) = @_; | ||||
| 227 | defined $name | ||||
| 228 | or confess "Programming error - undefined cookie name"; | ||||
| 229 | @args or return $self->{cookie}{$name}; | ||||
| 230 | my $cookie = shift @args; | ||||
| 231 | if ( ref $cookie eq 'CODE' ) { | ||||
| 232 | return ( $self->{cookie}{$name} = $cookie ); | ||||
| 233 | } elsif ( defined $cookie ) { | ||||
| 234 | confess "Programming error - cookie must be CODE ref or undef"; | ||||
| 235 | } else { | ||||
| 236 | return delete $self->{cookie}{$name}; | ||||
| 237 | } | ||||
| 238 | } | ||||
| 239 | |||||
| 240 | sub default_modifiers { | ||||
| 241 | my ( $self ) = @_; | ||||
| 242 | return [ @{ $self->{default_modifiers} } ]; | ||||
| 243 | } | ||||
| 244 | |||||
| 245 | sub __effective_modifiers { | ||||
| 246 | my ( $self ) = @_; | ||||
| 247 | 'HASH' eq ref $self->{effective_modifiers} | ||||
| 248 | or return {}; | ||||
| 249 | return { %{ $self->{effective_modifiers} } }; | ||||
| 250 | } | ||||
| 251 | |||||
| 252 | sub encoding { | ||||
| 253 | my ( $self ) = @_; | ||||
| 254 | return $self->{encoding}; | ||||
| 255 | } | ||||
| 256 | |||||
| 257 | sub expect { | ||||
| 258 | my ( $self, @args ) = @_; | ||||
| 259 | |||||
| 260 | @args | ||||
| 261 | or return; | ||||
| 262 | |||||
| 263 | $self->{expect_next} = [ | ||||
| 264 | map { m/ \A PPIx::Regexp:: /smx ? $_ : 'PPIx::Regexp::' . $_ } | ||||
| 265 | @args | ||||
| 266 | ]; | ||||
| 267 | $self->{expect} = undef; | ||||
| 268 | return; | ||||
| 269 | } | ||||
| 270 | |||||
| 271 | sub failures { | ||||
| 272 | my ( $self ) = @_; | ||||
| 273 | return $self->{failures}; | ||||
| 274 | } | ||||
| 275 | |||||
| 276 | sub find_matching_delimiter { | ||||
| 277 | my ( $self ) = @_; | ||||
| 278 | $self->{cursor_curr} ||= 0; | ||||
| 279 | my $start = substr | ||||
| 280 | $self->{content}, | ||||
| 281 | $self->{cursor_curr}, | ||||
| 282 | 1; | ||||
| 283 | |||||
| 284 | my $inx = $self->{cursor_curr}; | ||||
| 285 | my $finish = ( | ||||
| 286 | my $bracketed = $self->close_bracket( $start ) ) || $start; | ||||
| 287 | my $nest = 0; | ||||
| 288 | |||||
| 289 | while ( ++$inx < $self->{cursor_limit} ) { | ||||
| 290 | my $char = substr $self->{content}, $inx, 1; | ||||
| 291 | if ( $char eq '\\' && $finish ne '\\' ) { | ||||
| 292 | ++$inx; | ||||
| 293 | } elsif ( $bracketed && $char eq $start ) { | ||||
| 294 | ++$nest; | ||||
| 295 | } elsif ( $char eq $finish ) { | ||||
| 296 | --$nest < 0 | ||||
| 297 | and return $inx - $self->{cursor_curr}; | ||||
| 298 | } | ||||
| 299 | } | ||||
| 300 | |||||
| 301 | return; | ||||
| 302 | } | ||||
| 303 | |||||
| 304 | sub find_regexp { | ||||
| 305 | my ( $self, $regexp ) = @_; | ||||
| 306 | |||||
| 307 | ref $regexp eq 'Regexp' | ||||
| 308 | or confess | ||||
| 309 | 'Argument is a ', ( ref $regexp || 'scalar' ), ' not a Regexp'; | ||||
| 310 | |||||
| 311 | defined $self->{find} or $self->_remainder(); | ||||
| 312 | |||||
| 313 | $self->{find} =~ $regexp | ||||
| 314 | or return; | ||||
| 315 | |||||
| 316 | my @capture; | ||||
| 317 | foreach my $inx ( 0 .. $#+ ) { | ||||
| 318 | if ( defined $-[$inx] && defined $+[$inx] ) { | ||||
| 319 | push @capture, $self->{capture} = substr | ||||
| 320 | $self->{find}, | ||||
| 321 | $-[$inx], | ||||
| 322 | $+[$inx] - $-[$inx]; | ||||
| 323 | } else { | ||||
| 324 | push @capture, undef; | ||||
| 325 | } | ||||
| 326 | } | ||||
| 327 | $self->{match} = shift @capture; | ||||
| 328 | $self->{capture} = \@capture; | ||||
| 329 | |||||
| 330 | # The following circumlocution seems to be needed under Perl 5.13.0 | ||||
| 331 | # for reasons I do not fathom -- at least in the case where | ||||
| 332 | # wantarray is false. RT 56864 details the symptoms, which I was | ||||
| 333 | # never able to reproduce outside Perl::Critic. But returning $+[0] | ||||
| 334 | # directly, the value could transmogrify between here and the | ||||
| 335 | # calling module. | ||||
| 336 | ## my @data = ( $-[0], $+[0] ); | ||||
| 337 | ## return wantarray ? @data : $data[1]; | ||||
| 338 | return wantarray ? ( $-[0] + 0, $+[0] + 0 ) : $+[0] + 0; | ||||
| 339 | } | ||||
| 340 | |||||
| 341 | sub get_start_delimiter { | ||||
| 342 | my ( $self ) = @_; | ||||
| 343 | return $self->{delimiter_start}; | ||||
| 344 | } | ||||
| 345 | |||||
| 346 | sub get_token { | ||||
| 347 | my ( $self ) = @_; | ||||
| 348 | |||||
| 349 | caller eq __PACKAGE__ or $self->{cursor_curr} > $self->{cursor_orig} | ||||
| 350 | or confess 'Programming error - get_token() called without ', | ||||
| 351 | 'first calling make_token()'; | ||||
| 352 | |||||
| 353 | my $handler = '__PPIX_TOKENIZER__' . $self->{mode}; | ||||
| 354 | |||||
| 355 | my $character = substr( | ||||
| 356 | $self->{content}, | ||||
| 357 | $self->{cursor_curr}, | ||||
| 358 | 1 | ||||
| 359 | ); | ||||
| 360 | |||||
| 361 | return ( __PACKAGE__->$handler( $self, $character ) ); | ||||
| 362 | } | ||||
| 363 | |||||
| 364 | sub interpolates { | ||||
| 365 | my ( $self ) = @_; | ||||
| 366 | return $self->{delimiter_start} ne q{'}; | ||||
| 367 | } | ||||
| 368 | |||||
| 369 | sub make_token { | ||||
| 370 | my ( $self, $length, $class, $arg ) = @_; | ||||
| 371 | defined $class or $class = caller; | ||||
| 372 | |||||
| 373 | if ( $length + $self->{cursor_curr} > $self->{cursor_limit} ) { | ||||
| 374 | $length = $self->{cursor_limit} - $self->{cursor_curr} | ||||
| 375 | or return; | ||||
| 376 | } | ||||
| 377 | |||||
| 378 | $class =~ m/ \A PPIx::Regexp:: /smx | ||||
| 379 | or $class = 'PPIx::Regexp::' . $class; | ||||
| 380 | my $content = substr | ||||
| 381 | $self->{content}, | ||||
| 382 | $self->{cursor_curr}, | ||||
| 383 | $length; | ||||
| 384 | |||||
| 385 | $self->{trace} | ||||
| 386 | and warn "make_token( $length, '$class' ) => '$content'\n"; | ||||
| 387 | $self->{trace} > 1 | ||||
| 388 | and warn " make_token: cursor_curr = $self->{cursor_curr}; ", | ||||
| 389 | "cursor_limit = $self->{cursor_limit}\n"; | ||||
| 390 | my $token = $class->_new( $content ) or return; | ||||
| 391 | $token->significant() and $self->{expect} = undef; | ||||
| 392 | $token->__PPIX_TOKEN__post_make( $self, $arg ); | ||||
| 393 | |||||
| 394 | $token->isa( TOKEN_UNKNOWN ) and $self->{failures}++; | ||||
| 395 | |||||
| 396 | $self->{cursor_curr} += $length; | ||||
| 397 | $self->{find} = undef; | ||||
| 398 | $self->{match} = undef; | ||||
| 399 | $self->{capture} = undef; | ||||
| 400 | |||||
| 401 | foreach my $name ( keys %{ $self->{cookie} } ) { | ||||
| 402 | my $cookie = $self->{cookie}{$name}; | ||||
| 403 | $cookie->( $self, $token ) | ||||
| 404 | or delete $self->{cookie}{$name}; | ||||
| 405 | } | ||||
| 406 | |||||
| 407 | # Record this token as the prior token if it is significant. We must | ||||
| 408 | # do this after processing cookies, so that the cookies have access | ||||
| 409 | # to the old token if they want. | ||||
| 410 | $token->significant() | ||||
| 411 | and $self->{prior} = $token; | ||||
| 412 | |||||
| 413 | return $token; | ||||
| 414 | } | ||||
| 415 | |||||
| 416 | sub match { | ||||
| 417 | my ( $self ) = @_; | ||||
| 418 | return $self->{match}; | ||||
| 419 | } | ||||
| 420 | |||||
| 421 | sub modifier { | ||||
| 422 | my ( $self, $modifier ) = @_; | ||||
| 423 | return $self->{modifiers}[-1]{$modifier}; | ||||
| 424 | } | ||||
| 425 | |||||
| 426 | sub modifier_duplicate { | ||||
| 427 | my ( $self ) = @_; | ||||
| 428 | push @{ $self->{modifiers} }, | ||||
| 429 | { %{ $self->{modifiers}[-1] } }; | ||||
| 430 | return; | ||||
| 431 | } | ||||
| 432 | |||||
| 433 | sub modifier_modify { | ||||
| 434 | my ( $self, %args ) = @_; | ||||
| 435 | |||||
| 436 | # Modifier code is centralized in PPIx::Regexp::Token::Modifier | ||||
| 437 | $self->{modifiers}[-1] = | ||||
| 438 | PPIx::Regexp::Token::Modifier::__PPIX_TOKENIZER__modifier_modify( | ||||
| 439 | $self->{modifiers}[-1], \%args ); | ||||
| 440 | |||||
| 441 | return; | ||||
| 442 | |||||
| 443 | } | ||||
| 444 | |||||
| 445 | sub modifier_pop { | ||||
| 446 | my ( $self ) = @_; | ||||
| 447 | @{ $self->{modifiers} } > 1 | ||||
| 448 | and pop @{ $self->{modifiers} }; | ||||
| 449 | return; | ||||
| 450 | } | ||||
| 451 | |||||
| 452 | sub next_token { | ||||
| 453 | my ( $self ) = @_; | ||||
| 454 | |||||
| 455 | { | ||||
| 456 | |||||
| 457 | if ( @{ $self->{pending} } ) { | ||||
| 458 | return shift @{ $self->{pending} }; | ||||
| 459 | } | ||||
| 460 | |||||
| 461 | if ( $self->{cursor_curr} >= $self->{cursor_limit} ) { | ||||
| 462 | $self->{cursor_limit} >= length $self->{content} | ||||
| 463 | and return; | ||||
| 464 | $self->{mode} eq 'finish' and return; | ||||
| 465 | $self->{mode} = 'finish'; | ||||
| 466 | $self->{cursor_limit}++; | ||||
| 467 | } | ||||
| 468 | |||||
| 469 | if ( my @tokens = $self->get_token() ) { | ||||
| 470 | push @{ $self->{pending} }, @tokens; | ||||
| 471 | redo; | ||||
| 472 | |||||
| 473 | } | ||||
| 474 | |||||
| 475 | } | ||||
| 476 | |||||
| 477 | return; | ||||
| 478 | |||||
| 479 | } | ||||
| 480 | |||||
| 481 | sub peek { | ||||
| 482 | my ( $self, $offset ) = @_; | ||||
| 483 | defined $offset or $offset = 0; | ||||
| 484 | $offset < 0 and return; | ||||
| 485 | $offset += $self->{cursor_curr}; | ||||
| 486 | $offset >= $self->{cursor_limit} and return; | ||||
| 487 | return substr $self->{content}, $offset, 1; | ||||
| 488 | } | ||||
| 489 | |||||
| 490 | sub ppi_document { | ||||
| 491 | my ( $self ) = @_; | ||||
| 492 | |||||
| 493 | defined $self->{find} or $self->_remainder(); | ||||
| 494 | |||||
| 495 | return PPI::Document->new( \"$self->{find}" ); | ||||
| 496 | } | ||||
| 497 | |||||
| 498 | sub prior { | ||||
| 499 | my ( $self, $method, @args ) = @_; | ||||
| 500 | defined $method or return $self->{prior}; | ||||
| 501 | $self->{prior}->can( $method ) | ||||
| 502 | or confess 'Programming error - ', | ||||
| 503 | ( ref $self->{prior} || $self->{prior} ), | ||||
| 504 | ' does not support method ', $method; | ||||
| 505 | return $self->{prior}->$method( @args ); | ||||
| 506 | } | ||||
| 507 | |||||
| 508 | sub significant { | ||||
| 509 | return 1; | ||||
| 510 | } | ||||
| 511 | |||||
| 512 | sub tokens { | ||||
| 513 | my ( $self ) = @_; | ||||
| 514 | |||||
| 515 | my @rslt; | ||||
| 516 | while ( my $token = $self->next_token() ) { | ||||
| 517 | push @rslt, $token; | ||||
| 518 | } | ||||
| 519 | |||||
| 520 | return @rslt; | ||||
| 521 | } | ||||
| 522 | |||||
# Cache in $self->{find} the part of the content that lies between the
# current cursor position and the cursor limit.
#
# Returns nothing. It is a programming error (reported via confess) for
# the cursor to be beyond the limit when this is called.
sub _remainder {
    my ( $self ) = @_;

    my ( $start, $limit ) = @{ $self }{ qw{ cursor_curr cursor_limit } };

    if ( $start > $limit ) {
        confess "Programming error - Trying to find past end of string";
    }

    $self->{find} = substr $self->{content}, $start, $limit - $start;

    return;
}
| 536 | |||||
# Tokenizer handler for the opening of the expression.
#
# Recognizes optional leading white space, an optional operator (qr, m
# or s), optional white space, and the opening delimiter, and returns
# the corresponding tokens. Along the way it records on the tokenizer:
# the operator ({type}), the modifiers found at the end of the content
# ({effective_modifiers}, {modifiers}, {cursor_modifiers}), the
# delimiters ({delimiter_start}, {delimiter_finish}), and a new
# {cursor_limit}. On success the mode becomes 'regexp'; if the content
# does not start as expected, a single unknown token covering all of the
# content is returned and the mode is left at 'kaput'.
#
# $class and $character are accepted for handler-signature
# compatibility but are not used here.
sub __PPIX_TOKENIZER__init {
    my ( $class, $tokenizer, $character ) = @_;

    # Assume the worst; the mode is only changed once the opening has
    # been tokenized successfully.
    $tokenizer->{mode} = 'kaput';

    return $tokenizer->make_token(
        length( $tokenizer->{content} ), TOKEN_UNKNOWN, {
            error => 'Tokenizer found illegal first characters',
        },
    ) unless $tokenizer->{content} =~
        m/ \A \s* ( qr | m | s )? ( \s* ) ( [^\w\s] ) /smx;

    # Pull everything we need out of the match variables right away,
    # before another match can overwrite them.
    my ( $operator, $gap ) = ( $1, $2 );

    # The leading white space ends where the first capture group that
    # actually took part in the match begins.
    my $leading = 0;
    foreach my $group ( 1 .. 3 ) {
        defined $-[$group] or next;
        $leading = $-[$group];
        last;
    }

    $operator = '' unless defined $operator;
    $tokenizer->{type} = $operator;

    # NOTE(review): the running offsets below rely on make_token()
    # advancing {cursor_curr} by the length it is given - confirm
    # against make_token().
    my @tokens;
    if ( $leading ) {
        push @tokens, $tokenizer->make_token( $leading,
            'PPIx::Regexp::Token::Whitespace' );
    }
    push @tokens, $tokenizer->make_token( length( $operator ),
        'PPIx::Regexp::Token::Structure' );
    if ( length( $gap ) > 0 ) {
        push @tokens, $tokenizer->make_token( length( $gap ),
            'PPIx::Regexp::Token::Whitespace' );
    }

    # Work out the modifiers in effect: the defaults, plus any run of
    # lower-case letters at the very end of the content. The cursor
    # limit is pulled back so that it excludes those letters.
    my @modifier_specs = @{ $tokenizer->{default_modifiers} };
    if ( $tokenizer->{content} =~ m/ ( [[:lower:]]* ) \s* \z /smx ) {
        my $trailing = $1;
        push @modifier_specs, $trailing;
        $tokenizer->{cursor_limit} -= length $trailing;
    }
    my $effective =
        PPIx::Regexp::Token::Modifier::__aggregate_modifiers(
            @modifier_specs );
    $tokenizer->{effective_modifiers} = $effective;
    # The modifier stack starts with a copy of the effective modifiers.
    $tokenizer->{modifiers} = [ { %{ $effective } } ];
    $tokenizer->{cursor_modifiers} = $tokenizer->{cursor_limit};

    $tokenizer->{delimiter_start} = substr(
        $tokenizer->{content}, $tokenizer->{cursor_curr}, 1 );

    # For a substitution, ask the tokenizer where the regexp part ends;
    # otherwise (or if that yields nothing) the limit is the character
    # immediately before the modifiers.
    my $span = $operator eq 's'
        ? $tokenizer->find_matching_delimiter()
        : 0;
    $tokenizer->{cursor_limit} = $span
        ? $tokenizer->{cursor_curr} + $span
        : $tokenizer->{cursor_modifiers} - 1;

    $tokenizer->{delimiter_finish} = substr(
        $tokenizer->{content}, $tokenizer->{cursor_limit}, 1 );
    $tokenizer->{delimiter_re} = undef;

    push @tokens, $tokenizer->make_token( 1,
        'PPIx::Regexp::Token::Delimiter' );

    $tokenizer->{mode} = 'regexp';

    return @tokens;
}
| 606 | |||||
# Tokenizer handler for the body of the expression.
#
# Each class returned by the tokenizer's _known_tokenizers() is offered
# the current character, in order, by calling its handler for the
# current mode ('__PPIX_TOKENIZER__' followed by the mode). The first
# class to return any true values wins: values that are already
# references are returned as they are, and every other value is passed
# to make_token() together with the winning class. If no class claims
# the character, the fallback for the mode - or failing that the
# 'regexp' fallback - is called instead.
sub __PPIX_TOKENIZER__regexp {
    my ( $class, $tokenizer, $character ) = @_;

    my $mode = $tokenizer->{mode};
    my $handler = '__PPIX_TOKENIZER__' . $mode;

    # Remember where this attempt started.
    $tokenizer->{cursor_orig} = $tokenizer->{cursor_curr};

    foreach my $token_class ( $tokenizer->_known_tokenizers() ) {

        # False values returned by a handler are discarded.
        my @tokens = grep { $_ }
            $token_class->$handler( $tokenizer, $character );

        if ( $tokenizer->{trace} ) {
            warn $token_class, "->$handler( \$tokenizer, '$character' )",
                " => (@tokens)\n";
        }

        @tokens or next;

        return ( map {
                ref $_ ? $_ : $tokenizer->make_token( $_, $token_class )
            } @tokens );
    }

    # Find a fallback processor for the character.
    my $fallback = __PACKAGE__->can( '__PPIX_TOKEN_FALLBACK__' . $mode )
        || __PACKAGE__->can( '__PPIX_TOKEN_FALLBACK__regexp' );
    $fallback
        or confess "Programming error - unable to find fallback for $mode";

    return $fallback->( $class, $tokenizer, $character );
}

# The very same code also serves as the handler named for 'repl' mode.
*__PPIX_TOKENIZER__repl = \&__PPIX_TOKENIZER__regexp;
| 633 | |||||
# Fallback for a character that nothing else claimed.
#
# A back slash with at least one more character available before the
# cursor limit is returned, together with that character, as a
# two-character literal. Anything else becomes a one-character unknown
# token carrying an error message.
sub __PPIX_TOKEN_FALLBACK__regexp {
    my ( $class, $tokenizer, $character ) = @_;

    # Anything that is not a back slash with a character after it is
    # unknown.
    if ( $character ne '\\'
        || $tokenizer->{cursor_limit} - $tokenizer->{cursor_curr} <= 1
    ) {
        return $tokenizer->make_token( 1, TOKEN_UNKNOWN, {
                error => 'Tokenizer found unexpected literal',
            },
        );
    }

    # What is left is an escaped character, which is a literal.
    return $tokenizer->make_token( 2, TOKEN_LITERAL );
}
| 650 | |||||
# Fallback for a character in the replacement that nothing else
# claimed. Whatever it is, it becomes a literal; the only question is
# whether the literal is one character long or two.
#
# It is two characters when the character is a back slash that is
# followed by another character and, in addition, either the tokenizer
# reports that it interpolates, or the following character is a single
# quote or another back slash.
sub __PPIX_TOKEN_FALLBACK__repl {
    my ( $class, $tokenizer, $character ) = @_;

    my $length = 1;

    if ( $character eq '\\' ) {
        my $next = $tokenizer->peek( 1 );
        if ( defined $next
            && ( $tokenizer->interpolates()
                || $next eq q<'>
                || $next eq '\\' )
        ) {
            # The back slash and the character after it go together.
            $length = 2;
        }
    }

    return $tokenizer->make_token( $length, TOKEN_LITERAL );
}
| 667 | |||||
# Tokenizer handler for 'finish' mode, which consumes a closing
# delimiter and decides what comes next.
#
# If the cursor then sits where the modifiers start, the expression is
# complete: the modifiers (and any trailing white space) are tokenized
# and the mode becomes 'kaput'. Otherwise there is a replacement still
# to come: the cursor limit is moved to just before the modifiers, a
# bracketed expression has its second opening delimiter (and any white
# space before it) consumed, and then either the whole replacement is
# returned as code (when the 'e' modifier is in effect, after which the
# mode becomes 'kaput') or the mode becomes 'repl'.
#
# Returns the list of tokens made. It is a programming error (reported
# via confess) for the cursor limit to be beyond the end of the content.
# $class and $character are accepted for handler-signature
# compatibility but are not used here.
sub __PPIX_TOKENIZER__finish {
    my ( $class, $tokenizer, $character ) = @_;

    $tokenizer->{cursor_limit} > length $tokenizer->{content}
        and confess "Programming error - ran off string";
    my @tokens = $tokenizer->make_token( 1,
        'PPIx::Regexp::Token::Delimiter' );

    # Both values are character positions, so compare them as numbers
    # rather than as strings.
    if ( $tokenizer->{cursor_curr} == $tokenizer->{cursor_modifiers} ) {

        # We are out of string. Make the modifier token and close up
        # shop.
        my $trailer;
        if ( $tokenizer->{content} =~ m/ \s+ \z /smx ) {
            # Stop the modifier token short of the trailing white space.
            $tokenizer->{cursor_limit} = $-[0];
            # NOTE(review): this is measured from the current cursor,
            # i.e. before the modifier token below is made, so it looks
            # like it spans the modifiers as well as the white space -
            # confirm against make_token().
            $trailer = length( $tokenizer->{content} ) -
                $tokenizer->{cursor_curr};
        } else {
            $tokenizer->{cursor_limit} = length $tokenizer->{content};
        }
        push @tokens, $tokenizer->make_token(
            $tokenizer->{cursor_limit} - $tokenizer->{cursor_curr},
            'PPIx::Regexp::Token::Modifier' );
        if ( $trailer ) {
            $tokenizer->{cursor_limit} = length $tokenizer->{content};
            push @tokens, $tokenizer->make_token(
                $trailer, 'PPIx::Regexp::Token::Whitespace' );
        }
        $tokenizer->{mode} = 'kaput';

    } else {

        # Clear the cookies, because we are going around again.
        $tokenizer->{cookie} = {};

        # Move the cursor limit to just before the modifiers.
        $tokenizer->{cursor_limit} = $tokenizer->{cursor_modifiers} - 1;

        # If the preceding regular expression was bracketed, we need to
        # consume possible whitespace and find another delimiter.

        if ( $tokenizer->close_bracket( $tokenizer->{delimiter_start} ) ) {
            my $accept;
            $accept = $tokenizer->find_regexp( qr{ \A \s+ }smx )
                and push @tokens, $tokenizer->make_token(
                    $accept, 'PPIx::Regexp::Token::Whitespace' );
            my $delimiter = $tokenizer->peek();
            $tokenizer->{delimiter_start} = $delimiter;
            push @tokens, $tokenizer->make_token(
                1, 'PPIx::Regexp::Token::Delimiter' );
            # NOTE(review): __PPIX_TOKENIZER__init reads the finishing
            # delimiter at {cursor_limit}, whereas this reads it one
            # character earlier - confirm the offset is intended.
            $tokenizer->{delimiter_finish} = substr
                $tokenizer->{content},
                $tokenizer->{cursor_limit} - 1,
                1;
            $tokenizer->{delimiter_re} = undef;
        }

        if ( $tokenizer->modifier( 'e' ) ) {
            # With /e, the replacement portion is code. We make it all
            # into one big PPIx::Regexp::Token::Code, slap on the
            # trailing delimiter and modifiers, and return it all.
            push @tokens, $tokenizer->make_token(
                $tokenizer->{cursor_limit} - $tokenizer->{cursor_curr},
                'PPIx::Regexp::Token::Code',
                { perl_version_introduced => MINIMUM_PERL },
            );
            $tokenizer->{cursor_limit} = length $tokenizer->{content};
            push @tokens, $tokenizer->make_token( 1,
                'PPIx::Regexp::Token::Delimiter' );
            push @tokens, $tokenizer->make_token(
                $tokenizer->{cursor_limit} - $tokenizer->{cursor_curr},
                'PPIx::Regexp::Token::Modifier' );
            $tokenizer->{mode} = 'kaput';
        } else {
            # Put our mode to replacement.
            $tokenizer->{mode} = 'repl';
        }

    }

    return @tokens;

}
| 751 | |||||
| 752 | 1 | 5µs | 1; | ||
| 753 | |||||
| 754 | __END__ |