.\" Automatically generated by Pod::Man 2.23 (Pod::Simple 3.14) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .ie \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .el \{\ . de IX .. .\} .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . 
ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "HTML::TreeBuilder::XPath 3pm" .TH HTML::TreeBuilder::XPath 3pm "2011-09-20" "perl v5.12.4" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. 
.if n .ad l .nh .SH "NAME" HTML::TreeBuilder::XPath \- add XPath support to HTML::TreeBuilder .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 5 \& use HTML::TreeBuilder::XPath; \& my $tree= HTML::TreeBuilder::XPath\->new; \& $tree\->parse_file( "mypage.html"); \& my $nb=$tree\->findvalue( \*(Aq/html/body//p[@class="section_title"]/span[@class="nb"]\*(Aq); \& my $id=$tree\->findvalue( \*(Aq/html/body//p[@class="section_title"]/@id\*(Aq); \& \& my $p= $tree\->findnodes( \*(Aq//p[@id="toto"]\*(Aq)\->[0]; \& my $link_texts= $p\->findvalue( \*(Aq./a\*(Aq); # the texts of all a elements in $p \& $tree\->delete; # to avoid memory leaks, if you parse many HTML documents .Ve .SH "DESCRIPTION" .IX Header "DESCRIPTION" This module adds typical XPath methods to HTML::TreeBuilder, to make it easy to query a document. .SH "METHODS" .IX Header "METHODS" Extra methods added both to the tree object and to each element: .SS "findnodes ($path)" .IX Subsection "findnodes ($path)" Returns a list of nodes found by \f(CW$path\fR. In scalar context returns an \f(CW\*(C`XML::XPathEngine::NodeSet\*(C'\fR object. .SS "findnodes_as_string ($path)" .IX Subsection "findnodes_as_string ($path)" Returns the text values of the nodes, as one string. .SS "findnodes_as_strings ($path)" .IX Subsection "findnodes_as_strings ($path)" Returns a list of the values of the result nodes. .SS "findvalue ($path)" .IX Subsection "findvalue ($path)" Returns either a \f(CW\*(C`XML::XPathEngine::Literal\*(C'\fR, a \f(CW\*(C`XML::XPathEngine::Boolean\*(C'\fR or a \f(CW\*(C`XML::XPathEngine::Number\*(C'\fR object. If the path returns a NodeSet, \&\f(CW$nodeset\fR\->xpath_to_literal is called automatically for you (and thus a \&\f(CW\*(C`XML::XPathEngine::Literal\*(C'\fR is returned). Note that for each of the objects stringification is overloaded, so you can just print the value found, or manipulate it in the ways you would a normal perl value (e.g. using regular expressions).
.SS "findvalues ($path)" .IX Subsection "findvalues ($path)" Returns the values of the matching nodes as a list. This is mostly the same as findnodes_as_strings, except that the elements of the list are objects (with overloaded stringification) instead of plain strings. .SS "exists ($path)" .IX Subsection "exists ($path)" Returns true if the given path exists. .SS "matches($path)" .IX Subsection "matches($path)" Returns true if the element matches the path. .SS "find ($path)" .IX Subsection "find ($path)" The find function takes an XPath expression (a string) and returns either a XML::XPathEngine::NodeSet object containing the nodes it found (or empty if no nodes matched the path), or one of XML::XPathEngine::Literal (a string), XML::XPathEngine::Number, or XML::XPathEngine::Boolean. It should always return something \- and you can use \->\fIisa()\fR to find out what it returned. If you need to check how many nodes it found you should check \f(CW$nodeset\fR\->size. See XML::XPathEngine::NodeSet. .SS "as_XML_compact" .IX Subsection "as_XML_compact" HTML::TreeBuilder's \f(CW\*(C`as_XML\*(C'\fR output is not really nice to look at, so I added a new method, that can be used as a simple replacement for it. It escapes only the '<', '>' and '&' (plus '"' in attribute values), and wraps \s-1CDATA\s0 elements in \s-1CDATA\s0 sections. .PP Note that the \s-1XML\s0 is actually not guaranteed to be valid at this point. Nothing is done about the encoding of the string. Patches or just ideas of how it could work are welcome. .SS "as_XML_indented" .IX Subsection "as_XML_indented" Same as as_XML, except that the output is indented.
.SH "SEE ALSO" .IX Header "SEE ALSO" HTML::TreeBuilder .PP XML::XPathEngine .SH "REPOSITORY" .IX Header "REPOSITORY" https://github.com/mirod/HTML\-\-TreeBuilder\-\-XPath .SH "AUTHOR" .IX Header "AUTHOR" Michel Rodriguez .SH "COPYRIGHT AND LICENSE" .IX Header "COPYRIGHT AND LICENSE" Copyright (C) 2006\-2011 by Michel Rodriguez .PP This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.