.\" Automatically generated by Pod::Man 4.14 (Pod::Simple 3.40)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is >0, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{\
.    if \nF \{\
.        de IX
.        tm Index:\\$1\t\\n%\t"\\$2"
..
.        if !\nF==2 \{\
.            nr % 0
.            nr F 2
.        \}
.    \}
.\}
.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "Marpa::R2::Scanless 3pm"
.TH Marpa::R2::Scanless 3pm "2021-01-22" "perl v5.32.0" "User Contributed Perl Documentation"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "Name"
.IX Header "Name"
Marpa::R2::Scanless \- Scanless interface
.SH "Synopsis"
.IX Header "Synopsis"
.Vb 1
\&    use Marpa::R2;
\&
\&    my $grammar = Marpa::R2::Scanless::G\->new(
\&        {   bless_package => \*(AqMy_Nodes\*(Aq,
\&            source        => \e(<<\*(AqEND_OF_SOURCE\*(Aq),
\&    :default ::= action => [values] bless => ::lhs
\&    lexeme default = action => [ start, length, value ]
\&        bless => ::name latm => 1
\&
\&    :start ::= Script
\&    Script ::= Expression+ separator => comma
\&    comma ~ [,]
\&    Expression ::=
\&        Number bless => primary
\&        | \*(Aq(\*(Aq Expression \*(Aq)\*(Aq bless => paren assoc => group
\&       || Expression \*(Aq**\*(Aq Expression bless => exponentiate assoc => right
\&       || Expression \*(Aq*\*(Aq Expression bless => multiply
\&        | Expression \*(Aq/\*(Aq Expression bless => divide
\&       || Expression \*(Aq+\*(Aq Expression bless => add
\&        | Expression \*(Aq\-\*(Aq Expression bless => subtract
\&
\&    Number ~ [\ed]+
\&    :discard ~ whitespace
\&    whitespace ~ [\es]+
\&    # allow comments
\&    :discard ~ <hash comment>
\&    <hash comment> ~ <terminated hash comment> | <unterminated
\&       final hash comment>
\&    <terminated hash comment> ~ \*(Aq#\*(Aq <hash comment body> <vertical space char>
\&    <unterminated final hash comment> ~ \*(Aq#\*(Aq <hash comment body>
\&    <hash comment body> ~ <hash comment char>*
\&    <vertical space char> ~ [\ex{A}\ex{B}\ex{C}\ex{D}\ex{2028}\ex{2029}]
\&    <hash comment char> ~ [^\ex{A}\ex{B}\ex{C}\ex{D}\ex{2028}\ex{2029}]
\&    END_OF_SOURCE
\&        }
\&    );
\&
\&
\&    my $recce = Marpa::R2::Scanless::R\->new( { grammar => $grammar } );
\&
\&    my $input = \*(Aq42*2+7/3, 42*(2+7)/3, 2**7\-3, 2**(7\-3)\*(Aq;
\&    $recce\->read(\e$input);
\&    my $value_ref = $recce\->value();
\&    die "No parse was found\en" if not defined $value_ref;
\&
\&    # Result will be something like "86.33... 126 125 16"
\&    # depending on the floating point precision
\&    my $result = ${$value_ref}\->doit();
\&
\&    package My_Nodes;
\&
\&    sub My_Nodes::primary::doit { return $_[0]\->[0]\->doit() }
\&    sub My_Nodes::Number::doit  { return $_[0]\->[2] }
\&    sub My_Nodes::paren::doit   { my ($self) = @_; $self\->[1]\->doit() }
\&
\&    sub My_Nodes::add::doit {
\&        my ($self) = @_;
\&        $self\->[0]\->doit() + $self\->[2]\->doit();
\&    }
\&
\&    sub My_Nodes::subtract::doit {
\&        my ($self) = @_;
\&        $self\->[0]\->doit() \- $self\->[2]\->doit();
\&    }
\&
\&    sub My_Nodes::multiply::doit {
\&        my ($self) = @_;
\&        $self\->[0]\->doit() * $self\->[2]\->doit();
\&    }
\&
\&    sub My_Nodes::divide::doit {
\&        my ($self) = @_;
\&        $self\->[0]\->doit() / $self\->[2]\->doit();
\&    }
\&
\&    sub My_Nodes::exponentiate::doit {
\&        my ($self) = @_;
\&        $self\->[0]\->doit()**$self\->[2]\->doit();
\&    }
\&
\&    sub My_Nodes::Script::doit {
\&        my ($self) = @_;
\&        return join q{ }, map { $_\->doit() } @{$self};
\&    }
.Ve
.SH "About this document"
.IX Header "About this document"
This document
is an introduction and overview
to Marpa's Scanless interface (\s-1SLIF\s0).
Marpa::R2's top-level page
contains an \s-1SLIF\s0 tutorial.
If you are new to Marpa or its \s-1SLIF,\s0
you probably want to start with that.
.PP
This document follows up on the tutorial,
looking more deeply and carefully at the concepts
behind the \s-1SLIF.\s0
Separate documents provide
the reference documentation for
Scanless grammar objects,
Scanless recognizer objects
and
the Scanless \s-1DSL\s0.
.SH "The two levels of language description"
.IX Header "The two levels of language description"
Programmers usually
describe the syntax of a language at two levels.
The same two-level approach can be convenient for implementing
a parser of the language.
But, implementation aside,
a two-level description
seems to be a natural approach to
the design issues that arise in languages
intended for practical use.
.PP
The first level is structural.
For example, here is how the Perl docs describe one of
the forms that Perl's \f(CW\*(C`use\*(C'\fR statement takes:
.PP
.Vb 1
\&    use Module VERSION LIST
.Ve
.PP
and in Perl's source code (\f(CW\*(C`perly.y\*(C'\fR) something similar
drives the parser.
.PP
The second level is lexical.
For example,
Perl's perlpodspec page has a number of statements like this:
.PP
.Vb 2
\&    [...] you can distinguish URL\-links from anything else
\&    by the fact that they match m/\eA\ew+:[^:\es]\eS*\ez/.
.Ve
.PP
The lexical level is character by character.
The structural level is less well-defined,
but in practice it ignores most of the character-by-character issues,
and it almost always avoids dealing with whitespace.
.PP
For reasons that will become clear later,
I will sometimes call the lexical level, L0,
and will sometimes call the structural level, G1.
(For historic reasons, L0 is sometimes also called G0.)
.PP
It is important to realize
that the difference between L0 and G1 is one
of level of description and
\&\s-1NOT\s0 one of precision or exactness.
A structural description of Perl's \f(CW\*(C`use\*(C'\fR statement,
much like the one I showed above,
is in Perl's source code (\f(CW\*(C`perly.y\*(C'\fR),
along with many other, similar,
structural-level descriptions.
These are used to
generate the production parser for Perl so,
clearly, structural level descriptions are every bit
as much a precision instrument as regular expressions.
.SH "A very simple language"
.IX Header "A very simple language"
In order to focus on very basic issues,
I will use, as an example,
a very simple language with a very simple semantics.
The language consists of decimal digits and \s-1ASCII\s0 spaces.
The semantics will treat it as a series of integers to be added.
.PP
Here are three strings in that language:
.PP
.Vb 3
\&     8675311
\&     867 5311
\&     8 6 7 5 3 1 1
.Ve
.PP
According to our semantics,
the three strings contain respectively,
one, two and seven integers.
The values of the three strings are,
according to our semantics,
the sum of these integers:
respectively, 8675311, 6178, and 31.
.PP
It's sometimes said, in describing languages like the above,
that \*(L"whitespace is ignored\*(R".
From the purely structural point of view this can be, in one sense, true.
But from the lexical point of view it's clearly quite false.
.PP
Combining the two levels of description,
it is very hard to justify an assertion that \*(L"whitespace is ignored\*(R".
The three strings in the display above
differ only in whitespace.
Clearly the placement
of the whitespace makes a vast difference, and has a major
effect on the structure of the string,
which in turn has a determining effect on its semantics.
.SH "Why the structural level?"
.IX Header "Why the structural level?"
As we've seen, the structural level ignores essential aspects
of the language.
It is possible to describe a language using a single level of description.
So why have a structural (G1) level of description?
Why not a \*(L"unified\*(R" instead of a \*(L"split\*(R" description?
.PP
It turns out that, for most languages of practical size,
particularly those that deploy whitespace in a natural
and intuitive way,
a \*(L"unified\*(R" description rapidly becomes unwriteable,
and even more rapidly becomes unreadable.
The reader should be able to
convince himself by taking the \s-1BNF\s0 from his favorite
language standard and recasting it so that
every rule takes into account whitespace.
As one example, consider declarations in the C language.
.PP
.Vb 2
\&    unsigned int a;
\&    unsigned*b;
.Ve
.PP
In the first of the two lines above the whitespace is necessary.
In the second of the two lines whitespace would be allowed,
but is not necessary.
You cannot simply insist on whitespace between all symbols,
because whitespace is and should be optional between
some symbols and not between others.
Where whitespace is optional, and where it should not be,
depends on which characters are adjacent to each other.
This kind of character-level information is not convenient to represent
at the structural (G1) level.
.PP
It is certainly possible to write whitespace-aware
\&\s-1BNF\s0 for the fragment of the C language
above.
And it is certainly possible to extend it to include more and
more of the declaration syntax.
But before you've extended the \s-1BNF\s0 very much,
you will notice it is becoming a lot harder to write.
You will also notice that, as quickly as it is becoming hard to
write, it is even more quickly becoming \*(L"write-only\*(R" \*(--
impossible to read.
In making your \s-1BNF\s0
whitespace-aware, you are more than doubling its size.
And you are burying
what intuition sees as the structure of the language
under a deep pile of special cases.
.PP
Long before you finish, I expect you will realize
that the \*(L"unified\*(R" approach is simply not workable.
The authors of the C language
relegated lexical issues to their own brief section,
and ignored them in
most of their language description.
This was clearly the only practical approach.
.SH "Interweaving the two levels"
.IX Header "Interweaving the two levels"
The scanless interface
interweaves the \*(L"split\*(R" and \*(L"unified\*(R" approaches
and, I hope, preserves the best features of each.
Here is the full syntax of
the example whitespace-and-digit language,
described using Marpa::R2's scanless interface:
.PP
.Vb 6
\&    :start ::= <number sequence>
\&    <number sequence> ::= <number>+ action => add_sequence
\&    number ~ digit+
\&    digit ~ [0\-9]
\&    :discard ~ whitespace
\&    whitespace ~ [\es]+
.Ve
.SS "A new operator"
.IX Subsection "A new operator"
In this example, three of the scanless interface's extensions
to the Stuifzand interface are used.
First, the tilde ("\f(CW\*(C`~\*(C'\fR") is used to separate \s-1LHS\s0 and \s-1RHS\s0 of rules at the lexical
(L0) level.
Rules whose \s-1LHS\s0 and \s-1RHS\s0 are separated by the traditional \s-1BNF\s0 operator ("\f(CW\*(C`::=\*(C'\fR")
are at the structural (G1) level.
.PP
The programmer must decide when to use the "\f(CW\*(C`~\*(C'\fR" operator
and when to use the "\f(CW\*(C`::=\*(C'\fR" operator,
but the choice will usually be easy:
If you want Marpa to \*(L"do what I mean\*(R" with whitespace, you use the
\&"\f(CW\*(C`::=\*(C'\fR" operator.
If you want Marpa to do exactly what you say on a character-by-character basis,
then you use the "\f(CW\*(C`~\*(C'\fR" operator.
.SS "Character classes"
.IX Subsection "Character classes"
Perl character classes are now allowed on the \s-1RHS\s0 of prioritized and quantified
rules.
The example shows character classes only in L0 rules,
but character classes can also be used in G1 rules.
When a character class is used
in a G1 rule, it still must be implemented at
the L0 level.
Marpa knows this and \*(L"does what you mean.\*(R"
.SS "Discard pseudo-rules"
.IX Subsection "Discard pseudo-rules"
A new type of rule is introduced:
a \*(L"discard\*(R" pseudo-rule.
A discard pseudo-rule has a \f(CW\*(C`:discard\*(C'\fR pseudo-symbol on its \s-1LHS\s0
and one symbol name on its \s-1RHS.\s0
It indicates that, when the \s-1RHS\s0 symbol is recognized,
it should not be passed on as usual to the structural (G1) level.
Instead, the lexical (L0) level will simply \*(L"discard\*(R" what it has
found.
In the example, whitespace is discarded.
.SH "Lexemes"
.IX Header "Lexemes"
Tokens at the boundary between L0 and G1 have special
significance.
The top-level undiscarded symbols in L0,
which will be called \*(L"L0 lexemes\*(R",
go on to become the terminals in G1.
G1's terminals are called \*(L"G1 lexemes\*(R".
To find the \*(L"L0 lexemes\*(R",
Marpa looks for symbols which are on
the \s-1LHS\s0 of an L0 rule, but not on the \s-1RHS\s0 of any L0 rule.
To find the \*(L"G1 lexemes\*(R",
Marpa looks for symbols on the \s-1RHS\s0 of at least one G1 rule,
but not on the \s-1LHS\s0 of any G1 rule.
.PP
L0 and G1 should agree on what is a lexeme and what is not.
If they do not,
the programmer receives a fatal message which describes the
problem and the symbols involved.
So in practice I will usually simply refer to \*(L"lexemes\*(R".
.SH "Longest acceptable tokens match"
.IX Header "Longest acceptable tokens match"
If you specify
"\f(CW\*(C`latm => 1\*(C'\fR"
as the default,
which you almost always should,
the L0 grammar looks for tokens on a
longest acceptable tokens match (\s-1LATM\s0) basis.
Tokens which the structural grammar would reject
are thrown away.
So are tokens in discard pseudo-rules.
The rest are passed on to the G1 grammar.
.PP
Note that the match is longest \s-1TOKENS.\s0
Several tokens may have the same length,
so several tokens may be \*(L"longest\*(R".
When that happens, Marpa
uses the full set of longest tokens
in looking for possible parses.
For more about \s-1LATM\s0 and its alternative, \s-1LTM,\s0
see the detailed description of the \f(CW\*(C`latm\*(C'\fR
adverb.
.SH "Semantics"
.IX Header "Semantics"
The value of an L0 rule is always the string it matches,
and the value of a lexeme from the G1 point of view is the
same as its value from the L0 point of view.
This means that it makes no sense to specify semantic
actions for L0 rules, and that is not allowed.
.PP
With the exception of lexeme values,
the semantics of the G1 grammar are exactly the
same as for ordinary grammars.
Actions may be specified for G1 rules and will
behave as described in
Marpa::R2::Semantics.
.SH "Implementation"
.IX Header "Implementation"
The scanless interface uses two co-operating Marpa grammars,
an approach pioneered by Andrew Rodland.
There are separate Marpa grammars for the L0 and G1 levels,
as well as separate parsers.
The details of their interaction are hidden from the user.
Typically, the L0 parser finds tokens and passes them up to the
G1 parser.
.PP
The interface described in
this document is surprisingly implementation-agnostic.
The author developed the basics of this
interface while trying an implementation approach
that used a single Marpa grammar,
before changing to the dual grammar implementation.
.SH "Copyright and License"
.IX Header "Copyright and License"
.Vb 5
\&  Copyright 2014 Jeffrey Kegler
\&  This file is part of Marpa::R2.  Marpa::R2 is free software: you can
\&  redistribute it and/or modify it under the terms of the GNU Lesser
\&  General Public License as published by the Free Software Foundation,
\&  either version 3 of the License, or (at your option) any later version.
\&
\&  Marpa::R2 is distributed in the hope that it will be useful,
\&  but WITHOUT ANY WARRANTY; without even the implied warranty of
\&  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
\&  Lesser General Public License for more details.
\&
\&  You should have received a copy of the GNU Lesser
\&  General Public License along with Marpa::R2.  If not, see
\&  http://www.gnu.org/licenses/.
.Ve