'\"
'\" Generated from file 'htmlparse\&.man' by tcllib/doctools with format 'nroff'
'\"
.TH "htmlparse" 3tcl 1\&.2\&.2 tcllib "HTML Parser"
.\" The -*- nroff -*- definitions below are for supplemental macros used
.\" in Tcl/Tk manual entries.
.\"
.\" .AP type name in/out ?indent?
.\"	Start paragraph describing an argument to a library procedure.
.\"	type is type of argument (int, etc.), in/out is either "in", "out",
.\"	or "in/out" to describe whether procedure reads or modifies arg,
.\"	and indent is equivalent to second arg of .IP (shouldn't ever be
.\"	needed;  use .AS below instead)
.\"
.\" .AS ?type? ?name?
.\"	Give maximum sizes of arguments for setting tab stops.  Type and
.\"	name are examples of largest possible arguments that will be passed
.\"	to .AP later.  If args are omitted, default tab stops are used.
.\"
.\" .BS
.\"	Start box enclosure.  From here until next .BE, everything will be
.\"	enclosed in one large box.
.\"
.\" .BE
.\"	End of box enclosure.
.\"
.\" .CS
.\"	Begin code excerpt.
.\"
.\" .CE
.\"	End code excerpt.
.\"
.\" .VS ?version? ?br?
.\"	Begin vertical sidebar, for use in marking newly-changed parts
.\"	of man pages.  The first argument is ignored and used for recording
.\"	the version when the .VS was added, so that the sidebars can be
.\"	found and removed when they reach a certain age.  If another argument
.\"	is present, then a line break is forced before starting the sidebar.
.\"
.\" .VE
.\"	End of vertical sidebar.
.\"
.\" .DS
.\"	Begin an indented unfilled display.
.\"
.\" .DE
.\"	End of indented unfilled display.
.\"
.\" .SO ?manpage?
.\"	Start of list of standard options for a Tk widget. The manpage
.\"	argument defines where to look up the standard options; if
.\"	omitted, defaults to "options". The options follow on successive
.\"	lines, in three columns separated by tabs.
.\"
.\" .SE
.\"	End of list of standard options for a Tk widget.
.\"
.\" .OP cmdName dbName dbClass
.\"	Start of description of a specific option.  cmdName gives the
.\"	option's name as specified in the class command, dbName gives
.\"	the option's name in the option database, and dbClass gives
.\"	the option's class in the option database.
.\"
.\" .UL arg1 arg2
.\"	Print arg1 underlined, then print arg2 normally.
.\"
.\" .QW arg1 ?arg2?
.\"	Print arg1 in quotes, then arg2 normally (for trailing punctuation).
.\"
.\" .PQ arg1 ?arg2?
.\"	Print an open parenthesis, arg1 in quotes, then arg2 normally
.\"	(for trailing punctuation) and then a closing parenthesis.
.\"
.\"	# Set up traps and other miscellaneous stuff for Tcl/Tk man pages.
.\"	# troff only: plant a trap 1.3i above the bottom of the page that
.\"	# runs the ^B macro defined further down.
.if t .wh -1.3i ^B
.\"	# ^l = line length in effect while these macros are being loaded.
.\"	# BE, VE and ^B use it to place the right-hand edge of what they draw.
.nr ^l \n(.l
.\"	# Adjust text at both margins.
.ad b
.\"	# Start an argument description
.\"	# Arguments, per the ".AP type name in/out ?indent?" synopsis above:
.\"	#   $1 = type, $2 = name, $3 = in/out, $4 = optional indent.
.\"	# The .TP indent is $4 when given; otherwise register )C when a name
.\"	# is present, else 15.  Tab stops are taken from registers )A and )B.
.\"	# All three registers are set by .AS.
.\"	# With $3 present, type, italic name and "(in/out)" go on one line;
.\"	# without it, a break is forced and either "type <tab> name" or, when
.\"	# $2 is empty too, $1 alone in italics is printed.
.de AP
.ie !"\\$4"" .TP \\$4
.el \{\
.   ie !"\\$2"" .TP \\n()Cu
.   el          .TP 15
.\}
.ta \\n()Au \\n()Bu
.ie !"\\$3"" \{\
\&\\$1 \\fI\\$2\\fP (\\$3)
.\".b
.\}
.el \{\
.br
.ie !"\\$2"" \{\
\&\\$1	\\fI\\$2\\fP
.\}
.el \{\
\&\\fI\\$1\\fP
.\}
.\}
..
.\"	# define tabbing values for .AP
.\"	#   $1 = sample of the widest type, $2 = sample of the widest name.
.\"	#   )A = width of $1 + 3n           (10n when $1 is empty)
.\"	#   )B = )A + width of $2 + 3n      ()A + 15n when $2 is empty)
.\"	#   )C = )B + width of "(in/out)" + 2n
.de AS
.nr )A 10n
.if !"\\$1"" .nr )A \\w'\\$1'u+3n
.nr )B \\n()Au+15n
.\"
.if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n
.nr )C \\n()Bu+\\w'(in/out)'u+2n
..
.\"	# Establish initial values for )A, )B and )C.  AS only reads $1 and
.\"	# $2, so the third word on this line has no effect on the registers.
.AS Tcl_Interp Tcl_CreateInterp in/out
.\"	# BS - start boxed text
.\"	# ^y = starting y location
.\"	# ^b = 1
.\"	# Breaks the current line, marks the vertical position in ^y and sets
.\"	# ^b to 1 to record that a box is open.  In nroff mode a rule of
.\"	# underscores as wide as the current line length is printed in
.\"	# no-fill mode with zero temporary indent, then fill mode is restored.
.\"	# Nothing is drawn here in troff mode; BE and ^B draw the box.
.de BS
.br
.mk ^y
.nr ^b 1u
.if n .nf
.if n .ti 0
.if n \l'\\n(.lu\(ul'
.if n .fi
..
.\"	# BE - end boxed text (draw box now)
.\"	# nroff: print a closing rule of underscores, ^l units wide.
.\"	# troff: draw the box between the start position saved in ^y (by BS
.\"	# or ^B) and the current position, which is marked in ^t.
.\"	# ^b is reset to 0 afterwards to record that no box is open.
.de BE
.nf
.ti 0
.mk ^t
.ie n \l'\\n(^lu\(ul'
.el \{\
.\"	Draw four-sided box normally, but don't draw top of
.\"	box if the box started on an earlier page.
.ie !\\n(^b-1 \{\
\h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
.\}
.\"	The else branch has to open its block with \{ so that the
.\"	braces of the enclosing troff-only block stay balanced.
.el \{\
\h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul'
.\}
.\}
.fi
.br
.nr ^b 0
..
.\"	# VS - start vertical sidebar
.\"	# ^Y = starting y location
.\"	# ^v = 1 (for troff;  for nroff this doesn't matter)
.\"	# $1 is not referenced by the macro body; a non-empty $2 forces a
.\"	# line break before the sidebar starts.  The start position is
.\"	# marked in ^Y.
.\"	# nroff: switch on a margin character (\(br at point size 12).
.\"	# troff: set ^v, which ^B tests to see whether a sidebar is open.
.de VS
.if !"\\$2"" .br
.mk ^Y
.ie n 'mc \s12\(br\s0
.el .nr ^v 1u
..
.\"	# VE - end of vertical sidebar
.\"	# nroff: switch the margin character off again.
.\"	# troff: in environment 2, with no-fill and zero indent, mark the
.\"	# current position in ^t and draw the bar 3n right of the line length
.\"	# ^l, spanning from the start mark ^Y to the current position.  Then
.\"	# space back up one line (.sp -1) to cancel the output line the
.\"	# drawing produced.
.\"	# ^v is reset to 0 in both modes.
.de VE
.ie n 'mc
.el \{\
.ev 2
.nf
.ti 0
.mk ^t
\h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n'
.\"	The -1 must be the argument of .sp; on a line of its own it
.\"	would be printed as text.
.sp -1
.fi
.ev
.\}
.nr ^v 0
..
.\"	# Special macro to handle page bottom:  finish off current
.\"	# box/sidebar if in box/sidebar mode, then invoked standard
.\"	# page bottom macro.
.\"	# Works in environment 2 (.ev 2 ... .ev).  ^t marks the position at
.\"	# which the macro was entered; ^x is the height of the sidebar
.\"	# segment on this page (^t + 1v - ^Y).
.\"	# If a box is open (^b non-zero) its sides are drawn down to here; if
.\"	# a sidebar is open (^v non-zero) its bar is drawn.  Both drawing
.\"	# lines end in \c.
.\"	# After the page break (.bp) an open box is restarted: ^y is marked
.\"	# again and ^b becomes 2, the value for which BE and this macro leave
.\"	# out the top edge.  An open sidebar gets a fresh ^Y mark.
.de ^B
.ev 2
'ti 0
'nf
.mk ^t
.if \\n(^b \{\
.\"	Draw three-sided box if this is the box's first page,
.\"	draw two sides but no top otherwise.
.ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
.el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c
.\}
.if \\n(^v \{\
.nr ^x \\n(^tu+1v-\\n(^Yu
\kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c
.\}
.bp
'fi
.ev
.if \\n(^b \{\
.mk ^y
.nr ^b 2
.\}
.if \\n(^v \{\
.mk ^Y
.\}
..
.\"	# DS - begin display
.\"	# Opens a relative indent (.RS), switches to no-fill mode and
.\"	# spaces down one line.  Closed by .DE.
.de DS
.RS
.nf
.sp
..
.\"	# DE - end display
.\"	# Restores fill mode, ends the relative indent opened by .DS (.RE)
.\"	# and spaces down one line.
.de DE
.fi
.RE
.sp
..
.\"	# SO - start of list of standard options
.\"	# Stores the manual page name in string So, in bold: "options" when
.\"	# $1 is empty, $1 otherwise.  .SE prints that string later.
.\"	# Then emits the "STANDARD OPTIONS" section heading and switches to
.\"	# no-fill, bold text with tab stops at 5.5c and 11c for the option
.\"	# columns that follow.
.de SO
'ie '\\$1'' .ds So \\fBoptions\\fR
'el .ds So \\fB\\$1\\fR
.SH "STANDARD OPTIONS"
.LP
.nf
.ta 5.5c 11c
.ft B
..
.\"	# SE - end of list of standard options
.\"	# Restores fill mode and the roman font, starts a new paragraph and
.\"	# prints a pointer to the manual entry named in string So, which
.\"	# .SO sets.
.de SE
.fi
.ft R
.LP
See the \\*(So manual entry for details on the standard options.
..
.\"	# OP - start of full description for a single option
.\"	# Prints three no-fill lines, each a label followed by a tab (tab
.\"	# stop at 4c) and a bold value: $1 = command-line name, $2 = database
.\"	# name, $3 = database class.  Fill mode is then restored and an
.\"	# indented paragraph (.IP) is opened for the text that follows.
.de OP
.LP
.nf
.ta 4c
Command-Line Name:	\\fB\\$1\\fR
Database Name:	\\fB\\$2\\fR
Database Class:	\\fB\\$3\\fR
.fi
.IP
..
.\"	# CS - begin code excerpt
.\"	# Opens a relative indent (.RS), switches to no-fill mode and sets
.\"	# tab stops every quarter inch up to 1i.  Closed by .CE.
.de CS
.RS
.nf
.ta .25i .5i .75i 1i
..
.\"	# CE - end code excerpt
.\"	# Restores fill mode and ends the relative indent opened by .CS.
.de CE
.fi
.RE
..
.\"	# UL - underline word
.\"	# Prints $1, draws a rule of underscores (\(ul) from the end of $1
.\"	# back to horizontal position 0, then prints $2 with no underline.
.de UL
\\$1\l'|0\(ul'\\$2
..
.\"	# QW - apply quotation marks to word
.\"	# If string lq is a plain double quote, $1 is wrapped in `` and ''
.\"	# instead; otherwise the lq and rq strings are used as the quotes.
.\"	# $2 follows the closing quote unquoted.
.de QW
.ie '\\*(lq'"' ``\\$1''\\$2
.\"" fix emacs highlighting
.el \\*(lq\\$1\\*(rq\\$2
..
.\"	# PQ - apply parens and quotation marks to word
.\"	# Same quote selection as .QW.  Output is "(", $1 in quotes, $2 and
.\"	# ")".  The body also appends $3 after the closing parenthesis.
.de PQ
.ie '\\*(lq'"' (``\\$1''\\$2)\\$3
.\"" fix emacs highlighting
.el (\\*(lq\\$1\\*(rq\\$2)\\$3
..
.\"	# QR - quoted range
.\"	# Same quote selection as .QW.  Output is $1 in quotes, a \- dash,
.\"	# $2 in quotes, then $3 unquoted.
.de QR
.ie '\\*(lq'"' ``\\$1''\\-``\\$2''\\$3
.\"" fix emacs highlighting
.el \\*(lq\\$1\\*(rq\\-\\*(lq\\$2\\*(rq\\$3
..
.\"	# MT - "empty" string
.\"	# Calls .QW with an empty first argument, which yields a pair of
.\"	# quotes with nothing between them.
.de MT
.QW ""
..
.BS
.SH NAME
htmlparse \- Procedures to parse HTML strings
.SH SYNOPSIS
package require \fBTcl  8\&.2\fR
.sp
package require \fBstruct::stack  1\&.3\fR
.sp
package require \fBcmdline  1\&.1\fR
.sp
package require \fBhtmlparse  ?1\&.2\&.2?\fR
.sp
\fB::htmlparse::parse\fR ?-cmd \fIcmd\fR? ?-vroot \fItag\fR? ?-split \fIn\fR? ?-incvar \fIvar\fR? ?-queue \fIq\fR? \fIhtml\fR
.sp
\fB::htmlparse::debugCallback\fR ?\fIclientdata\fR? \fItag slash param textBehindTheTag\fR
.sp
\fB::htmlparse::mapEscapes\fR \fIhtml\fR
.sp
\fB::htmlparse::2tree\fR \fIhtml tree\fR
.sp
\fB::htmlparse::removeVisualFluff\fR \fItree\fR
.sp
\fB::htmlparse::removeFormDefs\fR \fItree\fR
.sp
.BE
.SH DESCRIPTION
.PP
The \fBhtmlparse\fR package provides commands that allow libraries
and applications to parse HTML in a string into a representation of
their choice\&.
.PP
The following commands are available:
.TP
\fB::htmlparse::parse\fR ?-cmd \fIcmd\fR? ?-vroot \fItag\fR? ?-split \fIn\fR? ?-incvar \fIvar\fR? ?-queue \fIq\fR? \fIhtml\fR
This command is the basic parser for HTML\&. It takes an HTML string,
parses it and invokes a command prefix for every tag encountered\&. It
is not necessary for the HTML to be valid for this parser to
function\&. It is the responsibility of the command invoked for every
tag to check this\&. Another responsibility of the invoked command is
the handling of tag attributes and character entities (escaped
characters)\&. The parser provides the un-interpreted tag attributes to
the invoked command to aid in the former, and the package at large
provides a helper command, \fB::htmlparse::mapEscapes\fR, to aid in
the handling of the latter\&. The parser \fIdoes\fR ignore leading
DOCTYPE declarations and all valid HTML comments it encounters\&.
.sp
All information beyond the HTML string itself is specified via
options, these are explained below\&.
.sp
To help in understanding the options, here is some more background
information about the parser\&.
.sp
It is capable of detecting incomplete tags in the HTML string given to
it\&. Under normal circumstances this will cause the parser to throw an
error, but if the option \fI-incvar\fR is used to specify a global (or
namespace) variable, the parser will store the incomplete part of the
input into this variable instead\&. This will aid greatly in the
handling of incrementally arriving HTML, as the parser will handle
whatever it can and defer the handling of the incomplete part until
more data has arrived\&.
.sp
Another feature of the parser is its two possible modes of
operation\&. The normal mode is activated if the option \fI-queue\fR is
not present on the command line invoking the parser\&. If it is present,
the parser will go into the incremental mode instead\&.
.sp
The main difference is that a parser in normal mode will immediately
invoke the command prefix for each tag it encounters\&. In incremental
mode however the parser will generate a number of scripts which invoke
the command prefix for groups of tags in the HTML string and then
store these scripts in the specified queue\&. It is then the
responsibility of the caller of the parser to ensure the execution of
the scripts in the queue\&.
.sp
\fINote\fR: The queue object given to the parser has to provide the
same interface as the queue defined in tcllib -> struct\&. This means,
for example, that all queues created via that tcllib module can be
immediately used here\&. Still, the queue doesn't have to come from
tcllib -> struct as long as the same interface is provided\&.
.sp
In both modes the parser will return an empty string to the caller\&.
.sp
The \fI-split\fR option may be given to a parser in incremental mode to
specify the size of the groups it creates\&. In other words, -split 5
means that each of the generated scripts will invoke the command
prefix for 5 consecutive tags in the HTML string\&. A parser in normal
mode will ignore this option and its value\&.
.sp
The option \fI-vroot\fR specifies a virtual root tag\&. A parser in
normal mode will invoke the command prefix for it immediately before
and after it processes the tags in the HTML, thus simulating that the
HTML string is enclosed in a <vroot> </vroot> combination\&. In
incremental mode however the parser is unable to provide the closing
virtual root as it never knows when the input is complete\&. In this
case the first script generated by each invocation of the parser will
contain an invocation of the command prefix for the virtual root as
its first command\&.
The following options are available:
.RS
.TP
\fB-cmd\fR \fIcmd\fR
The command prefix to invoke for every tag in the HTML
string\&. Defaults to \fI::htmlparse::debugCallback\fR\&.
.TP
\fB-vroot\fR \fItag\fR
The virtual root tag to add around the HTML in normal mode\&. In
incremental mode it is the first tag in each chunk processed by the
parser, but there will be no closing tags\&. Defaults to
\fIhmstart\fR\&.
.TP
\fB-split\fR \fIn\fR
The size of the groups produced by an incremental mode parser\&. Ignored
when in normal mode\&. Defaults to 10\&. Values <= 0 are not allowed\&.
.TP
\fB-incvar\fR \fIvar\fR
The name of the variable in which to store any incomplete HTML\&. This
makes most sense for the incremental mode\&. The parser will throw an
error if it sees incomplete HTML and has no place to store it\&. This
makes sense for the normal mode\&. Only incomplete tags are detected,
not missing tags\&.  Optional, defaults to 'no variable'\&.
.RE
.RS
.TP
\fIInterface to the command prefix\fR
In normal mode the parser will invoke the command prefix with four
arguments appended\&. See \fB::htmlparse::debugCallback\fR for a
description\&.
.sp
In incremental mode, however, the generated scripts will invoke the
command prefix with five arguments appended\&. The last four of these
are the same which were mentioned above\&. The first is a placeholder
string (\fB@win@\fR) for a clientdata value to be supplied later
during the actual execution of the generated scripts\&. This could be a
tk window path, for example\&. This allows the user of this package to
preprocess HTML strings without committing them to a specific window,
object, whatever during parsing\&. This connection can be made
later\&. This also means that it is possible to cache preprocessed
HTML\&. Of course, nothing prevents the user of the parser from
replacing the placeholder with an empty string\&.
.RE
.TP
\fB::htmlparse::debugCallback\fR ?\fIclientdata\fR? \fItag slash param textBehindTheTag\fR
This command is the standard callback used by the parser in
\fB::htmlparse::parse\fR if none was specified by the user\&. It simply
dumps its arguments to stdout\&.  This callback can be used for both
normal and incremental mode of the calling parser\&. In other words, it
accepts four or five arguments\&. The last four arguments are described
below\&. The optional first argument contains the clientdata value
passed to the callback by a parser in incremental mode\&. All callbacks
have to follow the signature of this command in the last four
arguments, and callbacks used in incremental parsing have to follow
this signature in the last five arguments\&.
.sp
The first argument, \fIclientdata\fR, is optional and present only if
this command is invoked by a parser in incremental mode\&. It contains
whatever the user of this package wishes\&.
.sp
The second argument, \fItag\fR, contains the name of the tag which is
currently processed by the parser\&.
.sp
The third argument, \fIslash\fR, is either empty or contains a slash
character\&. It allows the callback to distinguish between opening
(slash is empty) and closing tags (slash contains a slash character)\&.
.sp
The fourth argument, \fIparam\fR, contains the un-interpreted list of
parameters to the tag\&.
.sp
The fifth and last argument, \fItextBehindTheTag\fR, contains the text
found by the parser behind the tag named in \fItag\fR\&.
.TP
\fB::htmlparse::mapEscapes\fR \fIhtml\fR
This command takes an HTML string, substitutes all escape sequences
with their actual characters and then returns the resulting string\&.
HTML strings which do not contain escape sequences are returned
unchanged\&.
.TP
\fB::htmlparse::2tree\fR \fIhtml tree\fR
This command is a wrapper around \fB::htmlparse::parse\fR which takes
an HTML string (in \fIhtml\fR) and converts it into a tree containing
the logical structure of the parsed document\&. The name of the tree is
given to the command as its second argument (\fItree\fR)\&. The command
does \fBnot\fR generate the tree by itself but expects that the caller
provided it with an existing and empty tree\&. It also expects that the
specified tree object follows the same interface as the tree object in
tcllib -> struct\&. It doesn't have to be from tcllib -> struct, but it
must provide the same interface\&.
.sp
The internal callback does some basic checking of HTML validity and
tries to recover from the most basic errors\&. The command returns the
contents of its second argument\&. Side effects are the creation and
manipulation of a tree object\&.
.sp
Each node in the generated tree represents one tag in the input\&. The
name of the tag is stored in the attribute \fItype\fR of the
node\&. Any html attributes coming with the tag are stored unmodified in
the attribute \fIdata\fR of the node\&. In other words, the command does
\fInot\fR parse html attributes into their names and values\&.
.sp
If a tag contains text its node will have children of type
\fIPCDATA\fR containing this text\&. The text will be stored in the
attribute \fIdata\fR of these children\&.
.TP
\fB::htmlparse::removeVisualFluff\fR \fItree\fR
This command walks a tree as generated by \fB::htmlparse::2tree\fR and
removes all the nodes which represent visual tags and not structural
ones\&. The purpose of the command is to make the tree easier to
navigate without getting bogged down in visual information not
relevant to the search\&. Its only argument is the name of the tree to
cut down\&.
.TP
\fB::htmlparse::removeFormDefs\fR \fItree\fR
Like \fB::htmlparse::removeVisualFluff\fR this command is here to cut
down on the size of the tree as generated by
\fB::htmlparse::2tree\fR\&. It removes all nodes representing forms and
form elements\&. Its only argument is the name of the tree to cut down\&.
.PP
.SH "BUGS, IDEAS, FEEDBACK"
This document, and the package it describes, will undoubtedly contain
bugs and other problems\&.
Please report such in the category \fIhtmlparse\fR of the
\fITcllib Trackers\fR [http://core\&.tcl\&.tk/tcllib/reportlist]\&.
Please also report any ideas for enhancements you may have for either
package and/or documentation\&.
.PP
When proposing code changes, please provide \fIunified diffs\fR,
i\&.e\&. the output of \fBdiff -u\fR\&.
.PP
Note further that \fIattachments\fR are strongly preferred over
inlined patches\&. Attachments can be made by going to the \fBEdit\fR
form of the ticket immediately after its creation, and then using the
left-most button in the secondary navigation bar\&.
.SH "SEE ALSO"
struct::tree
.SH KEYWORDS
html, parsing, queue, tree
.SH CATEGORY
Text processing