.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.28)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is turned on, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{
.    if \nF \{
.        de IX
.        tm Index:\\$1\t\\n%\t"\\$2"
..
.        if !\nF==2 \{
.            nr % 0
.            nr F 2
.        \}
.    \}
.\}
.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "Unicode::UTF8 3pm"
.TH Unicode::UTF8 3pm "2013-09-04" "perl v5.20.0" "User Contributed Perl Documentation"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
Unicode::UTF8 \- Encoding and decoding of UTF\-8 encoding form
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
.Vb 1
\&    use Unicode::UTF8 qw[decode_utf8 encode_utf8];
\&    
\&    use warnings FATAL => \*(Aqutf8\*(Aq; # fatalize encoding glitches
\&    $string = decode_utf8($octets);
\&    $octets = encode_utf8($string);
.Ve
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
This module provides functions to encode and decode \s-1UTF\-8\s0 encoding form as 
specified by Unicode and \s-1ISO/IEC 10646:2011.\s0
.SH "FUNCTIONS"
.IX Header "FUNCTIONS"
.SS "decode_utf8"
.IX Subsection "decode_utf8"
.Vb 2
\&    $string = decode_utf8($octets);
\&    $string = decode_utf8($octets, $fallback);
.Ve
.PP
Returns a decoded representation of \f(CW$octets\fR in \s-1UTF\-8\s0 encoding as a character
string.
.PP
\&\f(CW$fallback\fR is an optional \f(CW\*(C`CODE\*(C'\fR reference which provides an error-handling 
mechanism, allowing customization of error handling. The default error-handling 
mechanism is to replace any ill-formed \s-1UTF\-8\s0 sequences or encoded code points 
which can't be interchanged with \s-1REPLACEMENT CHARACTER \s0(U+FFFD).
.PP
.Vb 1
\&    $string = $fallback\->($octets, $is_usv, $position);
.Ve
.PP
\&\f(CW$fallback\fR is invoked with three arguments: \f(CW$octets\fR, \f(CW$is_usv\fR and 
\&\f(CW$position\fR. \f(CW$octets\fR is a sequence of one or more octets containing the 
maximal subpart of the ill-formed subsequence or encoded code point which 
can't be interchanged. \f(CW$is_usv\fR is a boolean indicating whether or not 
\&\f(CW$octets\fR represent an encoded Unicode scalar value. \f(CW$position\fR is an 
unsigned integer containing the zero-based octet position at which the error 
occurred within the octets provided to \f(CW\*(C`decode_utf8()\*(C'\fR. \f(CW$fallback\fR must 
return a character string consisting of zero or more Unicode scalar values. 
Unicode scalar values consist of code points in the range U+0000..U+D7FF and 
U+E000..U+10FFFF.
.SS "encode_utf8"
.IX Subsection "encode_utf8"
.Vb 2
\&    $octets = encode_utf8($string);
\&    $octets = encode_utf8($string, $fallback);
.Ve
.PP
Returns an encoded representation of \f(CW$string\fR in \s-1UTF\-8\s0 encoding as an octet
string.
.PP
\&\f(CW$fallback\fR is an optional \f(CW\*(C`CODE\*(C'\fR reference which provides an error-handling 
mechanism, allowing customization of error handling. The default error-handling 
mechanism is to replace any code points which can't be interchanged or represented 
in \s-1UTF\-8\s0 encoding form with \s-1REPLACEMENT CHARACTER \s0(U+FFFD).
.PP
.Vb 1
\&    $string = $fallback\->($codepoint, $is_usv, $position);
.Ve
.PP
\&\f(CW$fallback\fR is invoked with three arguments: \f(CW$codepoint\fR, \f(CW$is_usv\fR and 
\&\f(CW$position\fR. \f(CW$codepoint\fR is an unsigned integer containing the code point 
which can't be interchanged or represented in \s-1UTF\-8\s0 encoding form. \f(CW$is_usv\fR 
is a boolean indicating whether or not \f(CW$codepoint\fR is a Unicode scalar value. 
\&\f(CW$position\fR is an unsigned integer containing the zero-based character position 
at which the error occurred within the string provided to \f(CW\*(C`encode_utf8()\*(C'\fR. 
\&\f(CW$fallback\fR must return a character string consisting of zero or more Unicode 
scalar values. Unicode scalar values consist of code points in the range 
U+0000..U+D7FF and U+E000..U+10FFFF.
.SS "valid_utf8"
.IX Subsection "valid_utf8"
.Vb 1
\&    $boolean = valid_utf8($octets);
.Ve
.PP
Returns a boolean indicating whether or not the given \f(CW$octets\fR consist of 
well-formed \s-1UTF\-8\s0 sequences.
.SH "EXPORTS"
.IX Header "EXPORTS"
None by default. All functions can be exported using the \f(CW\*(C`:all\*(C'\fR tag or individually.
.SH "DIAGNOSTICS"
.IX Header "DIAGNOSTICS"
.IP "Can't decode a wide character string" 4
.IX Item "Can't decode a wide character string"
(F) Wide character in octets.
.IP "Can't validate a wide character string" 4
.IX Item "Can't validate a wide character string"
(F) Wide character in octets.
.ie n .IP "Can't decode ill-formed \s-1UTF\-8\s0 octet sequence <%s> in position %u" 4
.el .IP "Can't decode ill-formed \s-1UTF\-8\s0 octet sequence <%s> in position \f(CW%u\fR" 4
.IX Item "Can't decode ill-formed UTF-8 octet sequence <%s> in position %u"
(W utf8) Encountered an ill-formed \s-1UTF\-8\s0 octet sequence. <%s> contains a 
hexadecimal representation of the maximal subpart of the ill-formed subsequence.
.ie n .IP "Can't interchange noncharacter code point U+%X in position %u" 4
.el .IP "Can't interchange noncharacter code point U+%X in position \f(CW%u\fR" 4
.IX Item "Can't interchange noncharacter code point U+%X in position %u"
(W utf8, nonchar) Noncharacters are code points that are permanently reserved 
in the Unicode Standard for internal use. They are forbidden for use in open 
interchange of Unicode text data. Noncharacters consist of the values U+nFFFE 
and U+nFFFF (where n is from 0 to 10 hexadecimal) and the values U+FDD0..U+FDEF.
.ie n .IP "Can't represent surrogate code point U+%X in position %u" 4
.el .IP "Can't represent surrogate code point U+%X in position \f(CW%u\fR" 4
.IX Item "Can't represent surrogate code point U+%X in position %u"
(W utf8, surrogate) Surrogate code points are designated only for surrogate code 
units in the \s-1UTF\-16\s0 character encoding form. Surrogates consist of code points 
in the range U+D800 to U+DFFF.
.ie n .IP "Can't represent super code point \ex{%X} in position %u" 4
.el .IP "Can't represent super code point \ex{%X} in position \f(CW%u\fR" 4
.IX Item "Can't represent super code point x{%X} in position %u"
(W utf8, non_unicode) Code points greater than U+10FFFF. Perl's extended codespace.
.ie n .IP "Can't decode ill-formed UTF-X octet sequence <%s> in position %u" 4
.el .IP "Can't decode ill-formed UTF-X octet sequence <%s> in position \f(CW%u\fR" 4
.IX Item "Can't decode ill-formed UTF-X octet sequence <%s> in position %u"
(F) Encountered an ill-formed octet sequence in Perl's internal representation 
of wide characters.
.PP
The sub-categories \f(CW\*(C`nonchar\*(C'\fR, \f(CW\*(C`surrogate\*(C'\fR and \f(CW\*(C`non_unicode\*(C'\fR are only available 
on Perl 5.14 or greater. See perllexwarn for available categories and hierarchies.
.SH "COMPARISON"
.IX Header "COMPARISON"
Here is a summary of features for comparison with Encode's \s-1UTF\-8\s0 implementation:
.IP "\(bu" 4
Simple \s-1API\s0 which makes use of Perl's standard warning categories.
.IP "\(bu" 4
Recognizes all noncharacters regardless of Perl version
.IP "\(bu" 4
Implements Unicode's recommended practice for using U+FFFD.
.IP "\(bu" 4
Better diagnostics in warning messages
.IP "\(bu" 4
Detects and reports inconsistency in Perl's internal representation of 
wide characters (UTF-X)
.IP "\(bu" 4
Preserves taintedness of decoded \f(CW$octets\fR or encoded \f(CW$string\fR
.IP "\(bu" 4
Better performance ~ 600% \- 1200% (\s-1JA:\s0 600%, \s-1AR:\s0 700%, \s-1SV:\s0 900%, \s-1EN:\s0 1200%, 
see benchmarks directory in git repository)
.SH "CONFORMANCE"
.IX Header "CONFORMANCE"
It is the author's belief that this \s-1UTF\-8\s0 implementation is conformant with 
the Unicode Standard Version 6.0. Any deviation from the Unicode Standard 
is to be considered a bug.
.SH "SEE ALSO"
.IX Header "SEE ALSO"
.IP "Encode" 4
.IX Item "Encode"
.PD 0
.IP "<http://www.unicode.org/>" 4
.IX Item "<http://www.unicode.org/>"
.PD
.SH "SUPPORT"
.IX Header "SUPPORT"
.SS "\s-1BUGS\s0"
.IX Subsection "BUGS"
Please report any bugs by email to \f(CW\*(C`bug\-unicode\-utf8 at rt.cpan.org\*(C'\fR, or 
through the web interface at <http://rt.cpan.org/Public/Dist/Display.html?Name=Unicode\-UTF8>. 
You will be automatically notified of any progress on the request by the system.
.SS "\s-1SOURCE CODE\s0"
.IX Subsection "SOURCE CODE"
This is open source software. The code repository is available for public 
review and contribution under the terms of the license.
.PP
<http://github.com/chansen/p5\-unicode\-utf8>
.PP
.Vb 1
\&    git clone http://github.com/chansen/p5\-unicode\-utf8
.Ve
.SH "AUTHOR"
.IX Header "AUTHOR"
Christian Hansen \f(CW\*(C`chansen@cpan.org\*(C'\fR
.SH "COPYRIGHT"
.IX Header "COPYRIGHT"
Copyright 2011\-2012 by Christian Hansen.
.PP
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.