.\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.16) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .ie \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .el \{\ . de IX .. .\} .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . 
ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "Regexp::Optimizer 3pm" .TH Regexp::Optimizer 3pm "2004-12-05" "perl v5.14.2" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. 
.if n .ad l .nh .SH "NAME" Regexp::Optimizer \- optimizes regular expressions .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 4 \& use Regexp::Optimizer; \& my $o = Regexp::Optimizer\->new; \& my $re = $o\->optimize(qr/foobar|fooxar|foozap/); \& # $re is now qr/foo(?:[bx]ar|zap)/ .Ve .SH "ABSTRACT" .IX Header "ABSTRACT" This module does, ahem, attempts to, optimize regular expressions. .SH "INSTALLATION" .IX Header "INSTALLATION" To install this module, type the following: .PP .Vb 4 \& perl Makefile.PL \& make \& make test \& make install .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" Here is a quote from perltodo. .Sp .RS 4 Factoring out common suffices/prefices in regexps (trie optimization) .Sp Currently, the user has to optimize \*(L"foo|far\*(R" and \*(L"foo|goo\*(R" into \&\*(L"f(?:oo|ar)\*(R" and \*(L"[fg]oo\*(R" by hand; this could be done automatically. .RE .PP This module implements just that. .SS "\s-1EXPORT\s0" .IX Subsection "EXPORT" Since this is an \s-1OO\s0 module, no symbols are exported. .SH "METHODS" .IX Header "METHODS" This module is implemented as a subclass of Regexp::List. For methods not listed here, see Regexp::List. .ie n .IP "$o = Regexp::Optimizer\->new;" 4 .el .IP "\f(CW$o\fR = Regexp::Optimizer\->new;" 4 .IX Item "$o = Regexp::Optimizer->new;" .PD 0 .ie n .IP "$o\->set(\fIkey => value, ...\fR)" 4 .el .IP "\f(CW$o\fR\->set(\fIkey => value, ...\fR)" 4 .IX Item "$o->set(key => value, ...)" .PD Just the same as Regexp::List except for the attribute below: .RS 4 .IP "unexpand" 4 .IX Item "unexpand" When set to one, \f(CW$o\fR\->\fIoptimize()\fR tries to \f(CW$o\fR\->expand before actually starting the operation. .Sp .Vb 6 \& # cases you need to set expand => 1 \& $o\->set(expand => 1)\->optimize(qr/ \& foobar| \& fooxar| \& foozar \& /x); .Ve .RE .RS 4 .RE .ie n .IP "$re = $o\->optimize(\fIregexp\fR);" 4 .el .IP "\f(CW$re\fR = \f(CW$o\fR\->optimize(\fIregexp\fR);" 4 .IX Item "$re = $o->optimize(regexp);" Does the job.
Note that unlike \f(CW\*(C`\->list2re()\*(C'\fR in Regexp::List, the argument is the regular expression itself. What it basically does is find groups with alternations and replace them with the result of \&\f(CW\*(C`$o\->list2re\*(C'\fR. .ie n .IP "$re = $o\->list2re(\fIlist of words ...\fR)" 4 .el .IP "\f(CW$re\fR = \f(CW$o\fR\->list2re(\fIlist of words ...\fR)" 4 .IX Item "$re = $o->list2re(list of words ...)" Same as \f(CW\*(C`list2re()\*(C'\fR in Regexp::List in terms of functionality, but how it tokenizes \*(L"atoms\*(R" is different, since the arguments can be regular expressions, not just strings. Here is a brief example. .Sp .Vb 3 \& my @expr = qw/foobar fooba+/; \& Regexp::List\->new\->list2re(@expr) eq qr/fooba[\e+r]/; \& Regexp::Optimizer\->new\->list2re(@expr) eq qr/foob(?:a+ar)/; .Ve .SH "CAVEATS" .IX Header "CAVEATS" This module is still experimental. Do not assume that the result is the same as the unoptimized version. .IP "\(bu" 4 When you just want a regular expression which matches normal words with no metacharacters, use Regexp::List. It's more robust and much faster. .IP "\(bu" 4 When you have a list of regular expressions which you want to aggregate, use \f(CW\*(C`list2re\*(C'\fR of \s-1THIS\s0 \s-1MODULE\s0. .IP "\(bu" 4 Use \f(CW\*(C`\->optimize()\*(C'\fR when and only when you already have a big regular expression with alternations therein. .Sp \&\f(CW\*(C`\->optimize()\*(C'\fR does support nested groups but its parser is not tested very well. .SH "BUGS" .IX Header "BUGS" .IP "\(bu" 4 The regex parser in this module (which is itself implemented with regular expressions) is not as thoroughly tested as Regexp::List. .IP "\(bu" 4 May still fall into deep recursion when you attempt to optimize a deeply nested regexp. See \*(L"\s-1PRACTICALITY\s0\*(R". .IP "\(bu" 4 Does not grok (?{expression}) and (?(cond)yes|no) constructs yet. .IP "\(bu" 4 You need to escape characters in character classes.
.Sp .Vb 3 \& $o\->optimize(qr/[a\-z()]|[A\-Z]/); # wrong \& $o\->optimize(qr/[a\-z\e(\e)]|[A\-Z]/); # right \& $o\->optimize(qr/[0\-9A\-Za\-z]|[\eQ\-_.!~*"\*(Aq()\eE]/); # right, too. .Ve .IP "\(bu" 4 When character(?: class(?:es)?)? are aggregated, duplicate ranges are left as is. Though functionally \s-1OK\s0, it is cosmetically ugly. .Sp .Vb 2 \& $o\->optimize(qr/[0\-5]|[5\-9]|0123456789/); \& # simply turns into [0\-5][5\-9]0123456789] not [0\-9] .Ve .Sp I left it that way because a marking-rearranging approach can produce a humongous result when Unicode characters are concerned (and \&\ep{Properties}). .SH "PRACTICALITY" .IX Header "PRACTICALITY" Though this module is still experimental, it is still good enough even for such deeply nested regexes as the following. .PP .Vb 3 \& # See 3.2.2 of http://www.ietf.org/rfc/rfc2616.txt \& # BNF faithfully turned into a regex \& http://(?:(?:(?:(?:(?:[a\-z]|[A\-Z])|[0\-9])|(?:(?:[a\-z]|[A\-Z])|[0\-9])(?:(?:(?:[a\-z]|[A\-Z])|[0\-9])|\-)*(?:(?:[a\-z]|[A\-Z])|[0\-9]))\e.)*(?:(?:[a\-z]|[A\-Z])|(?:[a\-z]|[A\-Z])(?:(?:(?:[a\-z]|[A\-Z])|[0\-9])|\-)*(?:(?:[a\-z]|[A\-Z])|[0\-9]))\e.?|[0\-9]+\e.[0\-9]+\e.[0\-9]+\e.[0\-9]+)(?::[0\-9]*)?(?:/(?:(?:(?:(?:[a\-z]|[A\-Z])|[0\-9])|[\e\-\e_\e.\e!\e~\e*\e\*(Aq\e(\e)])|%(?:[0\-9]|[A\-Fa\-f])(?:[0\-9]|[A\-Fa\-f])|[:@&=+$,])*(?:;(?:(?:(?:(?:[a\-z]|[A\-Z])|[0\-9])|[\e\-\e_\e.\e!\e~\e*\e\*(Aq\e(\e)])|%(?:[0\-9]|[A\-Fa\-f])(?:[0\-9]|[A\-Fa\-f])|[:@&=+$,])*)*(?:/(?:(?:(?:(?:[a\-z]|[A\-Z])|[0\-9])|[\e\-\e_\e.\e!\e~\e*\e\*(Aq\e(\e)])|%(?:[0\-9]|[A\-Fa\-f])(?:[0\-9]|[A\-Fa\-f])|[:@&=+$,])*(?:;(?:(?:(?:(?:[a\-z]|[A\-Z])|[0\-9])|[\e\-\e_\e.\e!\e~\e*\e\*(Aq\e(\e)])|%(?:[0\-9]|[A\-Fa\-f])(?:[0\-9]|[A\-Fa\-f])|[:@&=+$,])*)*)*(?:\e\e?(?:[;/?:@&=+$,]|(?:(?:(?:[a\-z]|[A\-Z])|[0\-9])|[\e\-\e_\e.\e!\e~\e*\e\*(Aq\e(\e)])|%(?:[0\-9]|[A\-Fa\-f])(?:[0\-9]|[A\-Fa\-f]))*)?)?
\& \& # and optimized \& http://(?::?[a\-zA\-Z0\-9](?:[a\-zA\-Z0\-9\e\-]*[a\-zA\-Z0\-9])?\e.[a\-zA\-Z]*(?:[a\-zA\-Z0\-9\e\-]*[a\-zA\-Z0\-9])?\e.?|[0\-9]+\e.[0\-9]+\e.[0\-9]+\e.[0\-9]+)(?::[0\-9]*)?(?:/(?:(?:(?:[a\-zA\-Z0\-9\e\-\e_\e.\e!\e~\e*\e\*(Aq\ex28\ex29]|%[0\-9A\-Fa\-f][0\-9A\-Fa\-f])|[:@&=+$,]))*(?:;(?:(?:(?:[a\-zA\-Z0\-9\e\-\e_\e.\e!\e~\e*\e\*(Aq\ex28\ex29]|%[0\-9A\-Fa\-f][0\-9A\-Fa\-f])|[:@&=+$,]))*)*(?:/(?:(?:(?:[a\-zA\-Z0\-9\e\-\e_\e.\e!\e~\e*\e\*(Aq\ex28\ex29]|%[0\-9A\-Fa\-f][0\-9A\-Fa\-f])|[:@&=+$,]))*(?:;(?:(?:(?:[a\-zA\-Z0\-9\e\-\e_\e.\e!\e~\e*\e\*(Aq\ex28\ex29]|%[0\-9A\-Fa\-f][0\-9A\-Fa\-f])|[:@&=+$,]))*)*)*(?:\e\e?(?:(?:[;/?:@&=+$,a\-zA\-Z0\-9\e\-\e_\e.\e!\e~\e*\e\*(Aq\ex28\ex29]|%[0\-9A\-Fa\-f][0\-9A\-Fa\-f]))*)?)? .Ve .PP By carefully examining both, you can see that character classes are properly aggregated. .SH "SEE ALSO" .IX Header "SEE ALSO" Regexp::List \*(-- upon which this module is based .PP The \&\f(CW\*(C`eg/\*(C'\fR directory in this package contains example scripts. .IP "Perl standard documents" 4 .IX Item "Perl standard documents" .Vb 1 \& L<perltodo>, L<perlre> .Ve .IP "\s-1CPAN\s0 Modules" 4 .IX Item "CPAN Modules" Regexp::Presuf, Text::Trie .IP "Books" 4 .IX Item "Books" Mastering Regular Expressions .SH "AUTHOR" .IX Header "AUTHOR" Dan Kogai .SH "COPYRIGHT AND LICENSE" .IX Header "COPYRIGHT AND LICENSE" Copyright 2003 by Dan Kogai .PP This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.