.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.28) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' . ds C` . ds C' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .\" .\" Avoid warning from groff about undefined register 'F'. .de IX .. .nr rF 0 .if \n(.g .if rF .nr rF 1 .if (\n(rF:(\n(.g==0)) \{ . if \nF \{ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . if !\nF==2 \{ . nr % 0 . nr F 2 . \} . \} .\} .rr rF .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . 
ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "Lucy::Docs::Cookbook::CustomQueryParser 3pm" .TH Lucy::Docs::Cookbook::CustomQueryParser 3pm "2015-03-06" "perl v5.20.2" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" Lucy::Docs::Cookbook::CustomQueryParser \- Sample subclass of QueryParser. 
.SH "ABSTRACT" .IX Header "ABSTRACT" Implement a custom search query language using a subclass of Lucy::Search::QueryParser. .SH "The language" .IX Header "The language" At first, our query language will support only simple term queries and phrases delimited by double quotes. For simplicity's sake, it will not support parenthetical groupings, boolean operators, or prepended plus/minus. The results for all subqueries will be unioned together \*(-- i.e. joined using an \s-1OR\s0 \&\*(-- which is usually the best approach for small-to-medium-sized document collections. .PP Later, we'll add support for trailing wildcards. .SH "Single-field parser" .IX Header "Single-field parser" Our initial parser implementation will generate queries against a single fixed field, \*(L"content\*(R", and it will analyze text using a fixed choice of English PolyAnalyzer. We won't subclass Lucy::Search::QueryParser just yet. .PP .Vb 5 \& package FlatQueryParser; \& use Lucy::Search::TermQuery; \& use Lucy::Search::PhraseQuery; \& use Lucy::Search::ORQuery; \& use Carp; \& \& sub new { \& my $analyzer = Lucy::Analysis::PolyAnalyzer\->new( \& language => \*(Aqen\*(Aq, \& ); \& return bless { \& field => \*(Aqcontent\*(Aq, \& analyzer => $analyzer, \& }, _\|_PACKAGE_\|_; \& } .Ve .PP Some private helper subs for creating TermQuery and PhraseQuery objects will help keep the size of our main \fIparse()\fR subroutine down: .PP .Vb 7 \& sub _make_term_query { \& my ( $self, $term ) = @_; \& return Lucy::Search::TermQuery\->new( \& field => $self\->{field}, \& term => $term, \& ); \& } \& \& sub _make_phrase_query { \& my ( $self, $terms ) = @_; \& return Lucy::Search::PhraseQuery\->new( \& field => $self\->{field}, \& terms => $terms, \& ); \& } .Ve .PP Our private \fI_tokenize()\fR method treats double-quote delimited material as a single token and splits on whitespace everywhere else. 
.PP .Vb 10 \& sub _tokenize { \& my ( $self, $query_string ) = @_; \& my @tokens; \& while ( length $query_string ) { \& if ( $query_string =~ s/^\es+// ) { \& next; # skip whitespace \& } \& elsif ( $query_string =~ s/^("[^"]*(?:"|$))// ) { \& push @tokens, $1; # double\-quoted phrase \& } \& else { \& $query_string =~ s/(\eS+)//; \& push @tokens, $1; # single word \& } \& } \& return \e@tokens; \& } .Ve .PP The main parsing routine creates an array of tokens by calling \fI_tokenize()\fR, runs the tokens through the PolyAnalyzer, creates TermQuery or PhraseQuery objects according to how many tokens emerge from the PolyAnalyzer's \fIsplit()\fR method, and adds each of the sub-queries to the primary ORQuery. .PP .Vb 5 \& sub parse { \& my ( $self, $query_string ) = @_; \& my $tokens = $self\->_tokenize($query_string); \& my $analyzer = $self\->{analyzer}; \& my $or_query = Lucy::Search::ORQuery\->new; \& \& for my $token (@$tokens) { \& if ( $token =~ s/^"// ) { \& $token =~ s/"$//; \& my $terms = $analyzer\->split($token); \& my $query = $self\->_make_phrase_query($terms); \& $or_query\->add_child($query); \& } \& else { \& my $terms = $analyzer\->split($token); \& if ( @$terms == 1 ) { \& my $query = $self\->_make_term_query( $terms\->[0] ); \& $or_query\->add_child($query); \& } \& elsif ( @$terms > 1 ) { \& my $query = $self\->_make_phrase_query($terms); \& $or_query\->add_child($query); \& } \& } \& } \& \& return $or_query; \& } .Ve .SH "Multi-field parser" .IX Header "Multi-field parser" Most often, the end user will want their search query to match not only a single 'content' field, but also 'title' and so on. To make that happen, we have to turn queries such as this... .PP .Vb 1 \& foo AND NOT bar .Ve .PP \&... 
into the logical equivalent of this: .PP .Vb 1 \& (title:foo OR content:foo) AND NOT (title:bar OR content:bar) .Ve .PP Rather than continue with our own from-scratch parser class and write the routines to accomplish that expansion, we're now going to subclass Lucy::Search::QueryParser and take advantage of some of its existing methods. .PP Our first parser implementation had the \*(L"content\*(R" field name and the choice of English PolyAnalyzer hard-coded for simplicity, but we don't need to do that once we subclass Lucy::Search::QueryParser. QueryParser's constructor \*(-- which we will inherit, allowing us to eliminate our own constructor \*(-- requires a Schema which conveys field and Analyzer information, so we can just defer to that. .PP .Vb 7 \& package FlatQueryParser; \& use base qw( Lucy::Search::QueryParser ); \& use Lucy::Search::TermQuery; \& use Lucy::Search::PhraseQuery; \& use Lucy::Search::ORQuery; \& use PrefixQuery; \& use Carp; \& \& # Inherit new() .Ve .PP We're also going to jettison our \fI_make_term_query()\fR and \fI_make_phrase_query()\fR helper subs and chop our \fIparse()\fR subroutine way down. Our revised \fIparse()\fR routine will generate Lucy::Search::LeafQuery objects instead of TermQueries and PhraseQueries: .PP .Vb 10 \& sub parse { \& my ( $self, $query_string ) = @_; \& my $tokens = $self\->_tokenize($query_string); \& my $or_query = Lucy::Search::ORQuery\->new; \& for my $token (@$tokens) { \& my $leaf_query = Lucy::Search::LeafQuery\->new( text => $token ); \& $or_query\->add_child($leaf_query); \& } \& return $self\->expand($or_query); \& } .Ve .PP The magic happens in QueryParser's \fIexpand()\fR method, which walks the ORQuery object we supply to it looking for LeafQuery objects, and calls \fIexpand_leaf()\fR for each one it finds. 
\fIexpand_leaf()\fR performs field-specific analysis, decides whether each query should be a TermQuery or a PhraseQuery, and if multiple fields are required, creates an ORQuery which mults out e.g. \f(CW\*(C`foo\*(C'\fR into \f(CW\*(C`(title:foo OR content:foo)\*(C'\fR. .SH "Extending the query language" .IX Header "Extending the query language" To add support for trailing wildcards to our query language, we need to override \fIexpand_leaf()\fR to accommodate PrefixQuery, while deferring to the parent class implementation on TermQuery and PhraseQuery. .PP .Vb 10 \& sub expand_leaf { \& my ( $self, $leaf_query ) = @_; \& my $text = $leaf_query\->get_text; \& if ( $text =~ /\e*$/ ) { \& my $or_query = Lucy::Search::ORQuery\->new; \& for my $field ( @{ $self\->get_fields } ) { \& my $prefix_query = PrefixQuery\->new( \& field => $field, \& query_string => $text, \& ); \& $or_query\->add_child($prefix_query); \& } \& return $or_query; \& } \& else { \& return $self\->SUPER::expand_leaf($leaf_query); \& } \& } .Ve .PP Ordinarily, those asterisks would have been stripped when running tokens through the PolyAnalyzer \*(-- query strings containing \*(L"foo*\*(R" would produce TermQueries for the term \*(L"foo\*(R". Our override intercepts tokens with trailing asterisks and processes them as PrefixQueries before \f(CW\*(C`SUPER::expand_leaf\*(C'\fR can discard them, so that a search for \*(L"foo*\*(R" can match \*(L"food\*(R", \*(L"foosball\*(R", and so on. .SH "Usage" .IX Header "Usage" Insert our custom parser into the search.cgi sample app to get a feel for how it behaves: .PP .Vb 8 \& my $parser = FlatQueryParser\->new( schema => $searcher\->get_schema ); \& my $query = $parser\->parse( decode( \*(AqUTF\-8\*(Aq, $cgi\->param(\*(Aqq\*(Aq) || \*(Aq\*(Aq ) ); \& my $hits = $searcher\->hits( \& query => $query, \& offset => $offset, \& num_wanted => $page_size, \& ); \& ... .Ve