.\" Automatically generated by Pod::Man 2.22 (Pod::Simple 3.07) .\" .\" Standard preamble: .\" ======================================================================== .de Sp \" Vertical space (when we can't use .PP) .if t .sp .5v .if n .sp .. .de Vb \" Begin verbatim text .ft CW .nf .ne \\$1 .. .de Ve \" End verbatim text .ft R .fi .. .\" Set up some character translations and predefined strings. \*(-- will .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left .\" double quote, and \*(R" will give a right double quote. \*(C+ will .\" give a nicer C++. Capital omega is used to do unbreakable dashes and .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff, .\" nothing in troff, for use with C<>. .tr \(*W- .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p' .ie n \{\ . ds -- \(*W- . ds PI pi . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch . ds L" "" . ds R" "" . ds C` "" . ds C' "" 'br\} .el\{\ . ds -- \|\(em\| . ds PI \(*p . ds L" `` . ds R" '' 'br\} .\" .\" Escape single quotes in literal strings from groff's Unicode transform. .ie \n(.g .ds Aq \(aq .el .ds Aq ' .\" .\" If the F register is turned on, we'll generate index entries on stderr for .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index .\" entries marked with X<> in POD. Of course, you'll have to process the .\" output yourself in some meaningful fashion. .ie \nF \{\ . de IX . tm Index:\\$1\t\\n%\t"\\$2" .. . nr % 0 . rr F .\} .el \{\ . de IX .. .\} .\" .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2). .\" Fear. Run. Save yourself. No user-serviceable parts. . \" fudge factors for nroff and troff .if n \{\ . ds #H 0 . ds #V .8m . ds #F .3m . ds #[ \f1 . ds #] \fP .\} .if t \{\ . ds #H ((1u-(\\\\n(.fu%2u))*.13m) . ds #V .6m . ds #F 0 . ds #[ \& . ds #] \& .\} . \" simple accents for nroff and troff .if n \{\ . ds ' \& . 
ds ` \& . ds ^ \& . ds , \& . ds ~ ~ . ds / .\} .if t \{\ . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u" . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u' . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u' . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u' . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u' . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u' .\} . \" troff and (daisy-wheel) nroff accents .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V' .ds 8 \h'\*(#H'\(*b\h'-\*(#H' .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#] .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H' .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u' .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#] .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#] .ds ae a\h'-(\w'a'u*4/10)'e .ds Ae A\h'-(\w'A'u*4/10)'E . \" corrections for vroff .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u' .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u' . \" for low resolution devices (crt and lpr) .if \n(.H>23 .if \n(.V>19 \ \{\ . ds : e . ds 8 ss . ds o a . ds d- d\h'-1'\(ga . ds D- D\h'-1'\(hy . ds th \o'bp' . ds Th \o'LP' . ds ae ae . ds Ae AE .\} .rm #[ #] #H #V #F C .\" ======================================================================== .\" .IX Title "MKDoc::XML::Tokenizer 3pm" .TH MKDoc::XML::Tokenizer 3pm "2004-10-06" "perl v5.10.1" "User Contributed Perl Documentation" .\" For nroff, turn off justification. Always turn off hyphenation; it makes .\" way too many mistakes in technical documents. .if n .ad l .nh .SH "NAME" MKDoc::XML::Tokenizer \- Tokenize XML the REX way .SH "SYNOPSIS" .IX Header "SYNOPSIS" .Vb 10 \& my $tokens = MKDoc::XML::Tokenizer\->process_data ($some_xml); \& foreach my $token (@{$tokens}) \& { \& print "\*(Aq" . $token\->as_string() . "\*(Aq is text\en" if (defined $token\->text()); \& print "\*(Aq" . $token\->as_string() . 
"\*(Aq is a self closing tag\en" if (defined $token\->tag_self_close()); \& print "\*(Aq" . $token\->as_string() . "\*(Aq is an opening tag\en" if (defined $token\->tag_open()); \& print "\*(Aq" . $token\->as_string() . "\*(Aq is a closing tag\en" if (defined $token\->tag_close()); \& print "\*(Aq" . $token\->as_string() . "\*(Aq is a processing instruction\en" if (defined $token\->pi()); \& print "\*(Aq" . $token\->as_string() . "\*(Aq is a declaration\en" if (defined $token\->declaration()); \& print "\*(Aq" . $token\->as_string() . "\*(Aq is a comment\en" if (defined $token\->comment()); \& print "\*(Aq" . $token\->as_string() . "\*(Aq is a tag\en" if (defined $token\->tag()); \& print "\*(Aq" . $token\->as_string() . "\*(Aq is a pseudo\-tag (NOT text and NOT tag)\en" if (defined $token\->pseudotag()); \& print "\*(Aq" . $token\->as_string() . "\*(Aq is a leaf token (NOT opening tag)\en" if (defined $token\->leaf()); \& } .Ve .SH "SUMMARY" .IX Header "SUMMARY" MKDoc::XML::Tokenizer is a module which uses Robert D. Cameron's \s-1REX\s0 technique to parse \s-1XML\s0 (ignore the line breaks): .PP .Vb 10 \& [^<]+|<(?:!(?:\-\-(?:[^\-]*\-(?:[^\-][^\-]*\-)*\->?)?|\e[CDATA\e[(?:[^\e]]*](?:[^\e]]+]) \& *]+(?:[^\e]>][^\e]]*](?:[^\e]]+])*]+)*>)?|DOCTYPE(?:[ \en\et\er]+(?:[A\-Za\-z_:]|[^\e \& x00\-\ex7F])(?:[A\-Za\-z0\-9_:.\-]|[^\ex00\-\ex7F])*(?:[ \en\et\er]+(?:(?:[A\-Za\-z_:]|[^\e \& x00\-\ex7F])(?:[A\-Za\-z0\-9_:.\-]|[^\ex00\-\ex7F])*|"[^"]*"|\*(Aq[^\*(Aq]*\*(Aq))*(?:[ \en\et\er]+) \& ?(?:\e[(?:<(?:!(?:\-\-[^\-]*\-(?:[^\-][^\-]*\-)*\->|[^\-](?:[^\e]"\*(Aq><]+|"[^"]*"|\*(Aq[^\*(Aq]*\*(Aq \& )*>)|\e?(?:[A\-Za\-z_:]|[^\ex00\-\ex7F])(?:[A\-Za\-z0\-9_:.\-]|[^\ex00\-\ex7F])*(?:\e?>|[\e \& n\er\et ][^?]*\e?+(?:[^>?][^?]*\e?+)*>))|%(?:[A\-Za\-z_:]|[^\ex00\-\ex7F])(?:[A\-Za\-z0 \& \-9_:.\-]|[^\ex00\-\ex7F])*;|[ \en\et\er]+)*](?:[ \en\et\er]+)?)?>?)?)?|\e?(?:(?:[A\-Za\-z \& _:]|[^\ex00\-\ex7F])(?:[A\-Za\-z0\-9_:.\-]|[^\ex00\-\ex7F])*(?:\e?>|[\en\er\et ][^?]*\e?+(? 
\& :[^>?][^?]*\e?+)*>)?)?|/(?:(?:[A\-Za\-z_:]|[^\ex00\-\ex7F])(?:[A\-Za\-z0\-9_:.\-]|[^\ex \& 00\-\ex7F])*(?:[ \en\et\er]+)?>?)?|(?:(?:[A\-Za\-z_:]|[^\ex00\-\ex7F])(?:[A\-Za\-z0\-9_:. \& \-]|[^\ex00\-\ex7F])*(?:[ \en\et\er]+(?:[A\-Za\-z_:]|[^\ex00\-\ex7F])(?:[A\-Za\-z0\-9_:.\-]| \& [^\ex00\-\ex7F])*(?:[ \en\et\er]+)?=(?:[ \en\et\er]+)?(?:"[^<"]*"|\*(Aq[^<\*(Aq]*\*(Aq))*(?:[ \en\e \& t\er]+)?/?>?)?) .Ve .PP That's right. One big regex, and it works rather well. .SH "DISCLAIMER" .IX Header "DISCLAIMER" \&\fBThis module does low-level \s-1XML\s0 manipulation. It will somehow parse even broken \s-1XML\s0 and try to do something with it. Do not use it unless you know what you're doing.\fR .SH "API" .IX Header "API" .ie n .SS "my $tokens = MKDoc::XML::Tokenizer\->process_data ($some_xml);" .el .SS "my \f(CW$tokens\fP = MKDoc::XML::Tokenizer\->process_data ($some_xml);" .IX Subsection "my $tokens = MKDoc::XML::Tokenizer->process_data ($some_xml);" Splits \f(CW$some_xml\fR into a list of MKDoc::XML::Token objects and returns an array reference to the list of tokens. .ie n .SS "my $tokens = MKDoc::XML::Tokenizer\->process_file ('/some/file.xml');" .el .SS "my \f(CW$tokens\fP = MKDoc::XML::Tokenizer\->process_file ('/some/file.xml');" .IX Subsection "my $tokens = MKDoc::XML::Tokenizer->process_file ('/some/file.xml');" Same as MKDoc::XML::Tokenizer\->process_data ($some_xml), except that it reads \f(CW$some_xml\fR from '/some/file.xml'. .SH "NOTES" .IX Header "NOTES" MKDoc::XML::Tokenizer works with MKDoc::XML::Token, which can be used when building a full tree is not necessary. If you need to build a tree, look at MKDoc::XML::TreeBuilder. .SH "AUTHOR" .IX Header "AUTHOR" Copyright 2003 \- MKDoc Holdings Ltd. .PP Author: Jean-Michel Hiver .PP This module is free software and is distributed under the same license as Perl itself. Use it at your own risk. .SH "SEE ALSO" .IX Header "SEE ALSO" MKDoc::XML::Token, MKDoc::XML::TreeBuilder