Perl

.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{
.    if \nF \{
.        de IX
.        tm Index:\\$1\t\\n%\t"\\$2"
..
.        if !\nF==2 \{
.            nr % 0
.            nr F 2
.        \}
.    \}
.\}
.rr rF
.\" ========================================================================
.\"
.IX Title "XML::TreePuller::CookBook::Performance 3pm"
.TH XML::TreePuller::CookBook::Performance 3pm "2016-06-27" "perl v5.22.2" "User Contributed Perl Documentation"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
XML::TreePuller::CookBook::Performance \- Increasing the rate of data through XML::TreePuller
.SH "ABOUT"
.IX Header "ABOUT"
Wikipedia (and MediaWiki) dump files present interesting parsing challenges \-
they are not of a high complexity but they do get to be very large; the
English Wikipedia dump file is around 24 gigabytes and the dump file that
has all of the revisions ever made is estimated to be around 1.5 terabytes 
(or larger). We'll cover parsing the Wikipedia dump files in faster
and faster ways.
.SS "Wikipedia dump format"
.IX Subsection "Wikipedia dump format"
The dump file looks a little something like this:
.PP
.Vb 10
\&  <mediawiki version="0.4">
\&    <siteinfo>
\&      <sitename>Wikipedia</sitename>
\&      <namespaces>
\&        <namespace key="0"/>
\&        <namespace key="1">Talk</namespace>
\&      </namespaces>
\&    </siteinfo>
\&    <page>
\&      <title>Perl</title>
\&      <revision>
\&        <contributor>
\&          <username>A Random Monger</username>
\&        </contributor>
\&        <text>A nifty little language if I do say so myself!</text>
\&      </revision>
\&    </page>
\&    <!\-\- 24 gigabytes more of XML goes here>
\&    <page>
\&      <title>C</title>
\&      <revision>
\&        <contributor>
\&          <username>A Random Monger</username>
\&        </contributor>
\&        <text>Faster and even more dangerous.</text>
\&      </revision>
\&    </page>
\&  </mediawiki>
.Ve
.SH "PROGRAMS"
.IX Header "PROGRAMS"
.SS "Print out a report from the dump file"
.IX Subsection "Print out a report from the dump file"
Lets build a report from the dump file: it'll contain the version
of the dump file, the site name, and the list of page titles.
.PP
The most important thing to keep in mind with big dump files is that they are to
large to fit the entire document into \s-1RAM.\s0 Because of this we need to have 
XML::TreePuller break the document up into chunks that will fit. We also want to 
access the version attribute on the root node but with out having the entire
document read into memory.
.PP
.Vb 1
\&  #!/usr/bin/env perl
\&
\&  use strict;
\&  use warnings;
\&  
\&  use XML::TreePuller;
\&  
\&  my $xml = XML::TreePuller\->new(location => shift(@ARGV));
\&
\&  #read the mediawiki element but stop short
\&  #of reading in a subelement 
\&  $xml\->iterate_at(\*(Aq/mediawiki\*(Aq => \*(Aqshort\*(Aq);
\&  $xml\->iterate_at(\*(Aq/mediawiki/siteinfo/sitename\*(Aq => \*(Aqsubtree\*(Aq);
\&  $xml\->iterate_at(\*(Aq/mediawiki/page\*(Aq => \*(Aqsubtree\*(Aq);
\&
\&  print "Dump version: ", $xml\->next\->attribute(\*(Aqversion\*(Aq), "\en";
\&  print "Sitename: ", $xml\->next\->text, "\en";
\&  
\&  while($_ = $xml\->next) {
\&        #note that the root element is page, not mediawiki
\&        print \*(Aq  * \*(Aq, $_\->xpath(\*(Aq/page/title\*(Aq)\->text, "\en";
\&  }
.Ve
.SS "Print page titles and text"
.IX Subsection "Print page titles and text"
Because the English Wikipedia dump files are so large parsing them has been
turned into a shoot-out to gauge the speed of various \s-1XML\s0 processing systems.
The shoot-out is to print all the page titles and contents to \s-1STDOUT.\s0 This
example processes the \s-1XML\s0 input of the simple English Wikipedia at 3.45 MiB/sec.
.PP
.Vb 2
\&  use strict;
\&  use warnings;
\&
\&  use XML::TreePuller;
\&
\&  binmode(STDOUT, \*(Aq:utf8\*(Aq);
\&
\&  my $xml = XML::TreePuller\->new(location => shift(@ARGV));
\&
\&  $xml\->iterate_at(\*(Aq/mediawiki/page\*(Aq, \*(Aqsubtree\*(Aq);
\&
\&  while(defined(my $e = $xml\->next)) {
\&        my $t = $e\->xpath(\*(Aq/page\*(Aq);
\&        
\&        print \*(AqTitle: \*(Aq, $e\->xpath(\*(Aq/page/title\*(Aq)\->text, "\en";
\&        print $e\->xpath(\*(Aq/page/revision/text\*(Aq)\->text;
\&  }
.Ve
.SS "Print page titles and text but faster"
.IX Subsection "Print page titles and text but faster"
The previous example does not really use any of the features of
XPath that warrant the additional processing overhead of involving
it in our code. We can replace the \fIxpath()\fR calls with \fIget_elements()\fR
which has less features but is faster. This code processes the \s-1XML\s0
input at 6.68 MiB/sec.
.PP
.Vb 2
\&  use strict;
\&  use warnings;
\&
\&  use XML::TreePuller;
\&
\&  binmode(STDOUT, \*(Aq:utf8\*(Aq);
\&
\&  my $xml = XML::TreePuller\->new(location => shift(@ARGV));
\&
\&  $xml\->iterate_at(\*(Aq/mediawiki/page\*(Aq, \*(Aqsubtree\*(Aq);
\&
\&  while(defined(my $e = $xml\->next)) {
\&        my $t = $e\->xpath(\*(Aq/page\*(Aq);
\&        
\&        print \*(AqTitle: \*(Aq, $e\->get_elements(\*(Aqtitle\*(Aq)\->text, "\en";
\&        print $e\->get_elements(\*(Aqrevision/text\*(Aq)\->text;
\&  }
.Ve
.SS "Print page titles and text \- also faster"
.IX Subsection "Print page titles and text - also faster"
There is one more way to solve this particular problem: we can ask
the engine to iterate on both page title and text elements. This
example runs at 7.9 MiB/sec.
.PP
.Vb 2
\&  use strict;
\&  use warnings;
\&
\&  use XML::TreePuller;
\&
\&  binmode(STDOUT, \*(Aq:utf8\*(Aq);
\&
\&  my $xml = XML::TreePuller\->new(location => shift(@ARGV));
\&
\&  $xml\->iterate_at(\*(Aq/mediawiki/page/title\*(Aq, \*(Aqsubtree\*(Aq);
\&  $xml\->iterate_at(\*(Aq/mediawiki/page/revision/text\*(Aq, \*(Aqsubtree\*(Aq);
\&
\&  while(defined(my $e = $xml\->next)) {
\&        print \*(AqTitle: \*(Aq, $e\->text, "\en";
\&        print $xml\->next\->text, "\en";
\&  }
.Ve
.SH "COPYRIGHT"
.IX Header "COPYRIGHT"
All content is copyright Tyler Riddle; see the \s-1README\s0 for licensing terms.