.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.28) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' . ds C` . ds C' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .\" .\" Avoid warning from groff about undefined register 'F'. .de IX .. .nr rF 0 .if \n(.g .if rF .nr rF 1 .if (\n(rF:(\n(.g==0)) \{ . if \nF \{ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . if !\nF==2 \{ . nr % 0 . nr F 2 . \} . \} .\} .rr rF .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . 
ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "Lucy::Docs::Tutorial::Simple 3pm" .TH Lucy::Docs::Tutorial::Simple 3pm "2015-03-06" "perl v5.20.2" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" Lucy::Docs::Tutorial::Simple \- Bare\-bones search app. 
.SS "Setup" .IX Subsection "Setup" Copy the text presentation of the \s-1US\s0 Constitution from the \f(CW\*(C`sample\*(C'\fR directory of the Apache Lucy distribution to the base level of your web server's \&\f(CW\*(C`htdocs\*(C'\fR directory. .PP .Vb 1 \& $ cp \-R sample/us_constitution /usr/local/apache2/htdocs/ .Ve .SS "Indexing: indexer.pl" .IX Subsection "Indexing: indexer.pl" Our first task will be to create an application called \f(CW\*(C`indexer.pl\*(C'\fR which builds a searchable \*(L"inverted index\*(R" from a collection of documents. .PP After we specify some configuration variables and load all necessary modules... .PP .Vb 3 \& #!/usr/local/bin/perl \& use strict; \& use warnings; \& \& # (Change configuration variables as needed.) \& my $path_to_index = \*(Aq/path/to/index\*(Aq; \& my $uscon_source = \*(Aq/usr/local/apache2/htdocs/us_constitution\*(Aq; \& \& use Lucy::Simple; \& use File::Spec::Functions qw( catfile ); .Ve .PP \&... we'll start by creating a Lucy::Simple object, telling it where we'd like the index to be located and the language of the source material. .PP .Vb 4 \& my $lucy = Lucy::Simple\->new( \& path => $path_to_index, \& language => \*(Aqen\*(Aq, \& ); .Ve .PP Next, we'll add a subroutine which parses our sample documents. .PP .Vb 10 \& # Parse a file from our US Constitution collection and return a hashref with \& # the fields title, content, and url. \& sub parse_file { \& my $filename = shift; \& my $filepath = catfile( $uscon_source, $filename ); \& open( my $fh, \*(Aq<\*(Aq, $filepath ) or die "Can\*(Aqt open \*(Aq$filepath\*(Aq: $!"; \& my $text = do { local $/; <$fh> }; # slurp file content \& $text =~ /\eA(.+?)^\es+(.*)/ms \& or die "Can\*(Aqt extract title/bodytext from \*(Aq$filepath\*(Aq"; \& my $title = $1; \& my $bodytext = $2; \& return { \& title => $title, \& content => $bodytext, \& url => "/us_constitution/$filename", \& }; \& } .Ve .PP Add some elementary directory reading code... 
.PP .Vb 4 \& # Collect names of source files. \& opendir( my $dh, $uscon_source ) \& or die "Couldn\*(Aqt opendir \*(Aq$uscon_source\*(Aq: $!"; \& my @filenames = grep { $_ =~ /\e.txt/ } readdir $dh; .Ve .PP \&... and now we're ready for the meat of indexer.pl \*(-- which occupies exactly one line of code. .PP .Vb 4 \& foreach my $filename (@filenames) { \& my $doc = parse_file($filename); \& $lucy\->add_doc($doc); # ta\-da! \& } .Ve .SS "Search: search.cgi" .IX Subsection "Search: search.cgi" As with our indexing app, the bulk of the code in our search script won't be Lucy-specific. .PP The beginning is dedicated to \s-1CGI\s0 processing and configuration. .PP .Vb 3 \& #!/usr/local/bin/perl \-T \& use strict; \& use warnings; \& \& # (Change configuration variables as needed.) \& my $path_to_index = \*(Aq/path/to/index\*(Aq; \& \& use CGI; \& use List::Util qw( max min ); \& use POSIX qw( ceil ); \& use Encode qw( decode ); \& use Lucy::Simple; \& \& my $cgi = CGI\->new; \& my $q = decode( "UTF\-8", $cgi\->param(\*(Aqq\*(Aq) || \*(Aq\*(Aq ); \& my $offset = decode( "UTF\-8", $cgi\->param(\*(Aqoffset\*(Aq) || 0 ); \& my $page_size = 10; .Ve .PP Once that's out of the way, we create our Lucy::Simple object and feed it a query string. .PP .Vb 9 \& my $lucy = Lucy::Simple\->new( \& path => $path_to_index, \& language => \*(Aqen\*(Aq, \& ); \& my $hit_count = $lucy\->search( \& query => $q, \& offset => $offset, \& num_wanted => $page_size, \& ); .Ve .PP The value returned by \fIsearch()\fR is the total number of documents in the collection which matched the query. We'll show this hit count to the user, and also use it in conjunction with the parameters \f(CW\*(C`offset\*(C'\fR and \f(CW\*(C`num_wanted\*(C'\fR to break up results into \*(L"pages\*(R" of manageable size. .PP Calling \fIsearch()\fR on our Simple object turns it into an iterator. Invoking \&\fInext()\fR now returns hits one at a time as Lucy::Document::HitDoc objects, starting with the most relevant. 
.PP .Vb 10 \& # Create result list. \& my $report = \*(Aq\*(Aq; \& while ( my $hit = $lucy\->next ) { \& my $score = sprintf( "%0.3f", $hit\->get_score ); \& $report .= qq| \&

\& $hit\->{title} \& $score \&
\& $hit\->{url} \&

\& |; \& } .Ve .PP The rest of the script is just text wrangling. .PP .Vb 3 \& #\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-# \& # No tutorial material below this point \- just html generation. # \& #\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-# \& \& # Generate paging links and hit count, print and exit. \& my $paging_links = generate_paging_info( $q, $hit_count ); \& blast_out_content( $q, $report, $paging_links ); \& \& # Create html fragment with links for paging through results n\-at\-a\-time. \& sub generate_paging_info { \& my ( $query_string, $total_hits ) = @_; \& my $escaped_q = CGI::escapeHTML($query_string); \& my $paging_info; \& if ( !length $query_string ) { \& # No query? No display. \& $paging_info = \*(Aq\*(Aq; \& } \& elsif ( $total_hits == 0 ) { \& # Alert the user that their search failed. \& $paging_info \& = qq|

No matches for $escaped_q

|; \& } \& else { \& # Calculate the nums for the first and last hit to display. \& my $last_result = min( ( $offset + $page_size ), $total_hits ); \& my $first_result = min( ( $offset + 1 ), $last_result ); \& \& # Display the result nums, start paging info. \& $paging_info = qq| \&

\& Results $first_result\-$last_result \& of $total_hits \& for $escaped_q. \&

\&

\& Results Page: \& |; \& \& # Calculate first and last hits pages to display / link to. \& my $current_page = int( $first_result / $page_size ) + 1; \& my $last_page = ceil( $total_hits / $page_size ); \& my $first_page = max( 1, ( $current_page \- 9 ) ); \& $last_page = min( $last_page, ( $current_page + 10 ) ); \& \& # Create a url for use in paging links. \& my $href = $cgi\->url( \-relative => 1 ); \& $href .= "?q=" . CGI::escape($query_string); \& $href .= ";offset=" . CGI::escape($offset); \& \& # Generate the "Prev" link. \& if ( $current_page > 1 ) { \& my $new_offset = ( $current_page \- 2 ) * $page_size; \& $href =~ s/(?<=offset=)\ed+/$new_offset/; \& $paging_info .= qq|<= Prev\en|; \& } \& \& # Generate paging links. \& for my $page_num ( $first_page .. $last_page ) { \& if ( $page_num == $current_page ) { \& $paging_info .= qq|$page_num \en|; \& } \& else { \& my $new_offset = ( $page_num \- 1 ) * $page_size; \& $href =~ s/(?<=offset=)\ed+/$new_offset/; \& $paging_info .= qq|$page_num\en|; \& } \& } \& \& # Generate the "Next" link. \& if ( $current_page != $last_page ) { \& my $new_offset = $current_page * $page_size; \& $href =~ s/(?<=offset=)\ed+/$new_offset/; \& $paging_info .= qq|Next =>\en|; \& } \& \& # Close tag. \& $paging_info .= "

\en"; \& } \& \& return $paging_info; \& } \& \& # Print content to output. \& sub blast_out_content { \& my ( $query_string, $hit_list, $paging_info ) = @_; \& my $escaped_q = CGI::escapeHTML($query_string); \& binmode( STDOUT, ":encoding(UTF\-8)" ); \& print qq|Content\-type: text/html; charset=UTF\-8\en\en|; \& print qq| \& \& \& \& \& \& Lucy: $escaped_q \& \& \& \& \& \& \&
\& \& $hit_list \& \& $paging_info \& \&

\& \& Powered by Apache Lucy<sup>TM</sup> \& \&

\&
\& \& \& \& \& |; \& } .Ve .SS "\s-1OK...\s0 now what?" .IX Subsection "OK... now what?" Lucy::Simple is perfectly adequate for some tasks, but it's not very flexible. Many people find that it doesn't do at least one or two things they can't live without. .PP In our next tutorial chapter, BeyondSimple, we'll rewrite our indexing and search scripts using the classes that Lucy::Simple hides from view, opening up the possibilities for expansion; then, we'll spend the rest of the tutorial chapters exploring these possibilities.