Profile of HTML/Entities.pm

Filename	/Users/timbo/perl5/perlbrew/perls/perl-5.18.2/lib/site_perl/5.18.2/darwin-2level/HTML/Entities.pm
Statements	Executed 941 statements in 2.40ms

Subroutines
Calls	P	F	Exclusive Time	Inclusive Time	Subroutine
253	1	1	159µs	159µs	HTML::Entities::::CORE:substHTML::Entities::CORE:subst (opcode)
1	1	1	15µs	34µs	HTML::Entities::::BEGIN@138HTML::Entities::BEGIN@138
1	1	1	8µs	66µs	HTML::Entities::::BEGIN@139HTML::Entities::BEGIN@139
1	1	1	6µs	33µs	HTML::Entities::::BEGIN@140HTML::Entities::BEGIN@140
0	0	0	0s	0s	HTML::Entities::::VersionHTML::Entities::Version
0	0	0	0s	0s	HTML::Entities::::encode_entitiesHTML::Entities::encode_entities
0	0	0	0s	0s	HTML::Entities::encode_entities_numeric
0	0	0	0s	0s	HTML::Entities::::num_entityHTML::Entities::num_entity

Call graph for these subroutines as a Graphviz dot language file.

Line	State ments	Time on line	Calls	Time in subs	Code
1					package HTML::Entities;
2
3					=encoding utf8
4
5					=head1 NAME
6
7					HTML::Entities - Encode or decode strings with HTML entities
8
9					=head1 SYNOPSIS
10
11					use HTML::Entities;
12
13					$a = "Våre norske tegn bør &#230res";
14					decode_entities($a);
15					encode_entities($a, "\200-\377");
16
17					For example, this:
18
19					$input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
20					print encode_entities($input), "\n"
21
22					Prints this out:
23
24					vis-à-vis Beyoncé's naïve
25					papier-mâché résumé
26
27					=head1 DESCRIPTION
28
29					This module deals with encoding and decoding of strings with HTML
30					character entities. The module provides the following functions:
31
32					=over 4
33
34					=item decode_entities( $string, ... )
35
36					This routine replaces HTML entities found in the $string with the
37					corresponding Unicode character. Unrecognized entities are left alone.
38
39					If multiple strings are provided as argument they are each decoded
40					separately and the same number of strings are returned.
41
42					If called in void context the arguments are decoded in-place.
43
44					This routine is exported by default.
45
46					=item _decode_entities( $string, \%entity2char )
47
48					=item _decode_entities( $string, \%entity2char, $expand_prefix )
49
50					This will in-place replace HTML entities in $string. The %entity2char
51					hash must be provided. Named entities not found in the %entity2char
52					hash are left alone. Numeric entities are expanded unless their value
53					overflow.
54
55					The keys in %entity2char are the entity names to be expanded and their
56					values are what they should expand into. The values do not have to be
57					single character strings. If a key has ";" as suffix,
58					then occurrences in $string are only expanded if properly terminated
59					with ";". Entities without ";" will be expanded regardless of how
60					they are terminated for compatibility with how common browsers treat
61					entities in the Latin-1 range.
62
63					If $expand_prefix is TRUE then entities without trailing ";" in
64					%entity2char will even be expanded as a prefix of a longer
65					unrecognized name. The longest matching name in %entity2char will be
66					used. This is mainly present for compatibility with an MSIE
67					misfeature.
68
69					$string = "foo&nbspbar";
70					_decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
71					print $string; # will print "foo bar"
72
73					This routine is exported by default.
74
75					=item encode_entities( $string )
76
77					=item encode_entities( $string, $unsafe_chars )
78
79					This routine replaces unsafe characters in $string with their entity
80					representation. A second argument can be given to specify which characters to
81					consider unsafe. The unsafe characters is specified using the regular
82					expression character class syntax (what you find within brackets in regular
83					expressions).
84
85					The default set of characters to encode are control chars, high-bit chars, and
86					the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
87					for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
88					>> characters:
89
90					$encoded = encode_entities($input, '<>&"');
91
92					and this would only encode non-plain ascii:
93
94					$encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
95
96					This routine is exported by default.
97
98					=item encode_entities_numeric( $string )
99
100					=item encode_entities_numeric( $string, $unsafe_chars )
101
102					This routine works just like encode_entities, except that the replacement
103					entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
104					example, C<encode_entities("r\xF4le")> returns "rôle", but
105					C<encode_entities_numeric("r\xF4le")> returns "rôle".
106
107					This routine is I<not> exported by default. But you can always
108					export it with C<use HTML::Entities qw(encode_entities_numeric);>
109					or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
110
111					=back
112
113					All these routines modify the string passed as the first argument, if
114					called in a void context. In scalar and array contexts, the encoded or
115					decoded string is returned (without changing the input string).
116
117					If you prefer not to import these routines into your namespace, you can
118					call them as:
119
120					use HTML::Entities ();
121					$decoded = HTML::Entities::decode($a);
122					$encoded = HTML::Entities::encode($a);
123					$encoded = HTML::Entities::encode_numeric($a);
124
125					The module can also export the %char2entity and the %entity2char
126					hashes, which contain the mapping from all characters to the
127					corresponding entities (and vice versa, respectively).
128
129					=head1 COPYRIGHT
130
131					Copyright 1995-2006 Gisle Aas. All rights reserved.
132
133					This library is free software; you can redistribute it and/or
134					modify it under the same terms as Perl itself.
135
136					=cut
137
138	2	26µs	2	52µs	# spent 34µs (15+19) within HTML::Entities::BEGIN@138 which was called: # once (15µs+19µs) by Perl::Tidy::HtmlWriter::BEGIN@2 at line 138 use strict; # spent 34µs making 1 call to HTML::Entities::BEGIN@138 # spent 19µs making 1 call to strict::import
139	2	24µs	2	123µs	# spent 66µs (8+58) within HTML::Entities::BEGIN@139 which was called: # once (8µs+58µs) by Perl::Tidy::HtmlWriter::BEGIN@2 at line 139 use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # spent 66µs making 1 call to HTML::Entities::BEGIN@139 # spent 58µs making 1 call to vars::import
140	2	1.03ms	2	60µs	# spent 33µs (6+27) within HTML::Entities::BEGIN@140 which was called: # once (6µs+27µs) by Perl::Tidy::HtmlWriter::BEGIN@2 at line 140 use vars qw(%entity2char %char2entity); # spent 33µs making 1 call to HTML::Entities::BEGIN@140 # spent 27µs making 1 call to vars::import
141
142	1	11µs			require 5.004;
143	1	700ns			require Exporter;
144	1	5µs			@ISA = qw(Exporter);
145
146	1	900ns			@EXPORT = qw(encode_entities decode_entities _decode_entities);
147	1	600ns			@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
148
149	1	200ns			$VERSION = "3.69";
150					sub Version { $VERSION; }
151
152	1	62µs			require HTML::Parser; # for fast XS implemented decode_entities
153
154
155	1	105µs			%entity2char = (
156					# Some normal chars that have special meaning in SGML context
157					amp => '&', # ampersand
158					'gt' => '>', # greater than
159					'lt' => '<', # less than
160					quot => '"', # double quote
161					apos => "'", # single quote
162
163					# PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
164					AElig => chr(198), # capital AE diphthong (ligature)
165					Aacute => chr(193), # capital A, acute accent
166					Acirc => chr(194), # capital A, circumflex accent
167					Agrave => chr(192), # capital A, grave accent
168					Aring => chr(197), # capital A, ring
169					Atilde => chr(195), # capital A, tilde
170					Auml => chr(196), # capital A, dieresis or umlaut mark
171					Ccedil => chr(199), # capital C, cedilla
172					ETH => chr(208), # capital Eth, Icelandic
173					Eacute => chr(201), # capital E, acute accent
174					Ecirc => chr(202), # capital E, circumflex accent
175					Egrave => chr(200), # capital E, grave accent
176					Euml => chr(203), # capital E, dieresis or umlaut mark
177					Iacute => chr(205), # capital I, acute accent
178					Icirc => chr(206), # capital I, circumflex accent
179					Igrave => chr(204), # capital I, grave accent
180					Iuml => chr(207), # capital I, dieresis or umlaut mark
181					Ntilde => chr(209), # capital N, tilde
182					Oacute => chr(211), # capital O, acute accent
183					Ocirc => chr(212), # capital O, circumflex accent
184					Ograve => chr(210), # capital O, grave accent
185					Oslash => chr(216), # capital O, slash
186					Otilde => chr(213), # capital O, tilde
187					Ouml => chr(214), # capital O, dieresis or umlaut mark
188					THORN => chr(222), # capital THORN, Icelandic
189					Uacute => chr(218), # capital U, acute accent
190					Ucirc => chr(219), # capital U, circumflex accent
191					Ugrave => chr(217), # capital U, grave accent
192					Uuml => chr(220), # capital U, dieresis or umlaut mark
193					Yacute => chr(221), # capital Y, acute accent
194					aacute => chr(225), # small a, acute accent
195					acirc => chr(226), # small a, circumflex accent
196					aelig => chr(230), # small ae diphthong (ligature)
197					agrave => chr(224), # small a, grave accent
198					aring => chr(229), # small a, ring
199					atilde => chr(227), # small a, tilde
200					auml => chr(228), # small a, dieresis or umlaut mark
201					ccedil => chr(231), # small c, cedilla
202					eacute => chr(233), # small e, acute accent
203					ecirc => chr(234), # small e, circumflex accent
204					egrave => chr(232), # small e, grave accent
205					eth => chr(240), # small eth, Icelandic
206					euml => chr(235), # small e, dieresis or umlaut mark
207					iacute => chr(237), # small i, acute accent
208					icirc => chr(238), # small i, circumflex accent
209					igrave => chr(236), # small i, grave accent
210					iuml => chr(239), # small i, dieresis or umlaut mark
211					ntilde => chr(241), # small n, tilde
212					oacute => chr(243), # small o, acute accent
213					ocirc => chr(244), # small o, circumflex accent
214					ograve => chr(242), # small o, grave accent
215					oslash => chr(248), # small o, slash
216					otilde => chr(245), # small o, tilde
217					ouml => chr(246), # small o, dieresis or umlaut mark
218					szlig => chr(223), # small sharp s, German (sz ligature)
219					thorn => chr(254), # small thorn, Icelandic
220					uacute => chr(250), # small u, acute accent
221					ucirc => chr(251), # small u, circumflex accent
222					ugrave => chr(249), # small u, grave accent
223					uuml => chr(252), # small u, dieresis or umlaut mark
224					yacute => chr(253), # small y, acute accent
225					yuml => chr(255), # small y, dieresis or umlaut mark
226
227					# Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
228					copy => chr(169), # copyright sign
229					reg => chr(174), # registered sign
230					nbsp => chr(160), # non breaking space
231
232					# Additional ISO-8859/1 entities listed in rfc1866 (section 14)
233					iexcl => chr(161),
234					cent => chr(162),
235					pound => chr(163),
236					curren => chr(164),
237					yen => chr(165),
238					brvbar => chr(166),
239					sect => chr(167),
240					uml => chr(168),
241					ordf => chr(170),
242					laquo => chr(171),
243					'not' => chr(172), # not is a keyword in perl
244					shy => chr(173),
245					macr => chr(175),
246					deg => chr(176),
247					plusmn => chr(177),
248					sup1 => chr(185),
249					sup2 => chr(178),
250					sup3 => chr(179),
251					acute => chr(180),
252					micro => chr(181),
253					para => chr(182),
254					middot => chr(183),
255					cedil => chr(184),
256					ordm => chr(186),
257					raquo => chr(187),
258					frac14 => chr(188),
259					frac12 => chr(189),
260					frac34 => chr(190),
261					iquest => chr(191),
262					'times' => chr(215), # times is a keyword in perl
263					divide => chr(247),
264
265					( $] > 5.007 ? (
266					'OElig;' => chr(338),
267					'oelig;' => chr(339),
268					'Scaron;' => chr(352),
269					'scaron;' => chr(353),
270					'Yuml;' => chr(376),
271					'fnof;' => chr(402),
272					'circ;' => chr(710),
273					'tilde;' => chr(732),
274					'Alpha;' => chr(913),
275					'Beta;' => chr(914),
276					'Gamma;' => chr(915),
277					'Delta;' => chr(916),
278					'Epsilon;' => chr(917),
279					'Zeta;' => chr(918),
280					'Eta;' => chr(919),
281					'Theta;' => chr(920),
282					'Iota;' => chr(921),
283					'Kappa;' => chr(922),
284					'Lambda;' => chr(923),
285					'Mu;' => chr(924),
286					'Nu;' => chr(925),
287					'Xi;' => chr(926),
288					'Omicron;' => chr(927),
289					'Pi;' => chr(928),
290					'Rho;' => chr(929),
291					'Sigma;' => chr(931),
292					'Tau;' => chr(932),
293					'Upsilon;' => chr(933),
294					'Phi;' => chr(934),
295					'Chi;' => chr(935),
296					'Psi;' => chr(936),
297					'Omega;' => chr(937),
298					'alpha;' => chr(945),
299					'beta;' => chr(946),
300					'gamma;' => chr(947),
301					'delta;' => chr(948),
302					'epsilon;' => chr(949),
303					'zeta;' => chr(950),
304					'eta;' => chr(951),
305					'theta;' => chr(952),
306					'iota;' => chr(953),
307					'kappa;' => chr(954),
308					'lambda;' => chr(955),
309					'mu;' => chr(956),
310					'nu;' => chr(957),
311					'xi;' => chr(958),
312					'omicron;' => chr(959),
313					'pi;' => chr(960),
314					'rho;' => chr(961),
315					'sigmaf;' => chr(962),
316					'sigma;' => chr(963),
317					'tau;' => chr(964),
318					'upsilon;' => chr(965),
319					'phi;' => chr(966),
320					'chi;' => chr(967),
321					'psi;' => chr(968),
322					'omega;' => chr(969),
323					'thetasym;' => chr(977),
324					'upsih;' => chr(978),
325					'piv;' => chr(982),
326					'ensp;' => chr(8194),
327					'emsp;' => chr(8195),
328					'thinsp;' => chr(8201),
329					'zwnj;' => chr(8204),
330					'zwj;' => chr(8205),
331					'lrm;' => chr(8206),
332					'rlm;' => chr(8207),
333					'ndash;' => chr(8211),
334					'mdash;' => chr(8212),
335					'lsquo;' => chr(8216),
336					'rsquo;' => chr(8217),
337					'sbquo;' => chr(8218),
338					'ldquo;' => chr(8220),
339					'rdquo;' => chr(8221),
340					'bdquo;' => chr(8222),
341					'dagger;' => chr(8224),
342					'Dagger;' => chr(8225),
343					'bull;' => chr(8226),
344					'hellip;' => chr(8230),
345					'permil;' => chr(8240),
346					'prime;' => chr(8242),
347					'Prime;' => chr(8243),
348					'lsaquo;' => chr(8249),
349					'rsaquo;' => chr(8250),
350					'oline;' => chr(8254),
351					'frasl;' => chr(8260),
352					'euro;' => chr(8364),
353					'image;' => chr(8465),
354					'weierp;' => chr(8472),
355					'real;' => chr(8476),
356					'trade;' => chr(8482),
357					'alefsym;' => chr(8501),
358					'larr;' => chr(8592),
359					'uarr;' => chr(8593),
360					'rarr;' => chr(8594),
361					'darr;' => chr(8595),
362					'harr;' => chr(8596),
363					'crarr;' => chr(8629),
364					'lArr;' => chr(8656),
365					'uArr;' => chr(8657),
366					'rArr;' => chr(8658),
367					'dArr;' => chr(8659),
368					'hArr;' => chr(8660),
369					'forall;' => chr(8704),
370					'part;' => chr(8706),
371					'exist;' => chr(8707),
372					'empty;' => chr(8709),
373					'nabla;' => chr(8711),
374					'isin;' => chr(8712),
375					'notin;' => chr(8713),
376					'ni;' => chr(8715),
377					'prod;' => chr(8719),
378					'sum;' => chr(8721),
379					'minus;' => chr(8722),
380					'lowast;' => chr(8727),
381					'radic;' => chr(8730),
382					'prop;' => chr(8733),
383					'infin;' => chr(8734),
384					'ang;' => chr(8736),
385					'and;' => chr(8743),
386					'or;' => chr(8744),
387					'cap;' => chr(8745),
388					'cup;' => chr(8746),
389					'int;' => chr(8747),
390					'there4;' => chr(8756),
391					'sim;' => chr(8764),
392					'cong;' => chr(8773),
393					'asymp;' => chr(8776),
394					'ne;' => chr(8800),
395					'equiv;' => chr(8801),
396					'le;' => chr(8804),
397					'ge;' => chr(8805),
398					'sub;' => chr(8834),
399					'sup;' => chr(8835),
400					'nsub;' => chr(8836),
401					'sube;' => chr(8838),
402					'supe;' => chr(8839),
403					'oplus;' => chr(8853),
404					'otimes;' => chr(8855),
405					'perp;' => chr(8869),
406					'sdot;' => chr(8901),
407					'lceil;' => chr(8968),
408					'rceil;' => chr(8969),
409					'lfloor;' => chr(8970),
410					'rfloor;' => chr(8971),
411					'lang;' => chr(9001),
412					'rang;' => chr(9002),
413					'loz;' => chr(9674),
414					'spades;' => chr(9824),
415					'clubs;' => chr(9827),
416					'hearts;' => chr(9829),
417					'diams;' => chr(9830),
418					) : ())
419					);
420
421
422					# Make the opposite mapping
423	1	175µs			while (my($entity, $char) = each(%entity2char)) {
424	253	517µs	253	159µs	$entity =~ s/;\z//; # spent 159µs making 253 calls to HTML::Entities::CORE:subst, avg 630ns/call
425	253	264µs			$char2entity{$char} = "&$entity;";
426					}
427	1	600ns			delete $char2entity{"'"}; # only one-way decoding
428
429					# Fill in missing entities
430	1	2µs			for (0 .. 255) {
431	256	28µs			next if exists $char2entity{chr($_)};
432	156	85µs			$char2entity{chr($_)} = "&#$_;";
433					}
434
435	1	100ns			my %subst; # compiled encoding regexps
436
437					sub encode_entities
438					{
439					return undef unless defined $_[0];
440					my $ref;
441					if (defined wantarray) {
442					my $x = $_[0];
443					$ref = \$x; # copy
444					} else {
445					$ref = \$_[0]; # modify in-place
446					}
447					if (defined $_[1] and length $_[1]) {
448					unless (exists $subst{$_[1]}) {
449					# Because we can't compile regex we fake it with a cached sub
450					my $chars = $_[1];
451					$chars =~ s,(?<!\\)([]/]),\\$1,g;
452					$chars =~ s,(?<!\\)\\\z,\\\\,;
453					my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} \|\| num_entity(\$1)/ge; }";
454					$subst{$_[1]} = eval $code;
455					die( $@ . " while trying to turn range: \"$_[1]\"\n "
456					. "into code: $code\n "
457					) if $@;
458					}
459					&{$subst{$_[1]}}($$ref);
460					} else {
461					# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
462					$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} \|\| num_entity($1)/ge;
463					}
464					$$ref;
465					}
466
467					sub encode_entities_numeric {
468					local %char2entity;
469					return &encode_entities; # a goto &encode_entities wouldn't work
470					}
471
472
473					sub num_entity {
474					sprintf "&#x%X;", ord($_[0]);
475					}
476
477					# Set up aliases
478	1	800ns			*encode = \&encode_entities;
479	1	200ns			*encode_numeric = \&encode_entities_numeric;
480	1	200ns			*encode_numerically = \&encode_entities_numeric;
481	1	200ns			*decode = \&decode_entities;
482
483	1	60µs			1;

					# spent 159µs within HTML::Entities::CORE:subst which was called 253 times, avg 630ns/call: # 253 times (159µs+0s) by Perl::Tidy::HtmlWriter::BEGIN@2 at line 424, avg 630ns/call sub HTML::Entities::CORE:subst; # opcode