.\" Automatically generated by Pod::Man 4.14 (Pod::Simple 3.43)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is >0, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{\
.    if \nF \{\
.        de IX
.        tm Index:\\$1\t\\n%\t"\\$2"
..
.        if !\nF==2 \{\
.            nr % 0
.            nr F 2
.        \}
.    \}
.\}
.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "Encode::ZapCP1252 3pm"
.TH Encode::ZapCP1252 3pm "2022-12-11" "perl v5.36.0" "User Contributed Perl Documentation"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "Name"
.IX Header "Name"
Encode::ZapCP1252 \- Zap Windows Western Gremlins
.SH "Synopsis"
.IX Header "Synopsis"
.Vb 1
\&  use Encode::ZapCP1252;
\&
\&  # Zap or fix in\-place.
\&  zap_cp1252 $latin1_text;
\&  fix_cp1252 $utf8_text;
\&
\&  # Zap or fix copy.
\&  my $clean_latin1 = zap_cp1252 $latin1_text;
\&  my $fixed_utf8   = fix_cp1252 $utf8_text;
.Ve
.SH "Description"
.IX Header "Description"
Have you ever been processing a Web form submission or a feed, assuming that the
incoming text was encoded as specified in the Content-Type header, or in the
\&\s-1XML\s0 declaration, only to end up with a bunch of junk because someone pasted in
content from Microsoft Word? Well, this is because Microsoft uses a superset
of the Latin\-1 encoding called \*(L"Windows Western\*(R" or \*(L"\s-1CP1252\*(R".\s0 If the specified
encoding is Latin\-1, mostly things will come out right, but a few things\*(--like
curly quotes, m\-dashes, ellipses, and the like\*(--may not. The differences are
well-known; you can see a nice chart documenting the differences on
Wikipedia <https://en.wikipedia.org/wiki/Windows-1252>.
.PP
Of course, that won't really help you. What will help you is to quit using
Latin\-1 and switch to \s-1UTF\-8.\s0 Then you can just convert from \s-1CP1252\s0 to \s-1UTF\-8\s0
without losing a thing, just like this:
.PP
.Vb 2
\&  use Encode;
\&  $text = decode \*(Aqcp1252\*(Aq, $text, 1;
.Ve
.PP
But I know that there are those of you out there stuck with Latin\-1 and who
don't want any junk characters from Word users. That's where this module comes
in. Its \f(CW\*(C`zap_cp1252\*(C'\fR function will zap those \s-1CP1252\s0 gremlins for you, turning
them into their appropriate \s-1ASCII\s0 approximations.
.PP
Another case that can occasionally come up is when you're reading in
text that \fIclaims\fR to be \s-1UTF\-8,\s0 but it \fIstill\fR ends up with some \s-1CP1252\s0
gremlins mixed in with properly encoded characters. I've seen examples of just
this sort of thing when processing GMail messages and attempting to insert
them into a \s-1UTF\-8\s0 database, as well as in some feeds processed by, say,
Yahoo! Pipes. Doesn't work so well. For such cases, there's \f(CW\*(C`fix_cp1252\*(C'\fR,
which converts those \s-1CP1252\s0 gremlins into their \s-1UTF\-8\s0 equivalents.
.SH "Usage"
.IX Header "Usage"
This module exports two subroutines: \f(CW\*(C`zap_cp1252()\*(C'\fR and \f(CW\*(C`fix_cp1252()\*(C'\fR,
each of which accepts a single argument:
.PP
.Vb 2
\&  zap_cp1252 $text;
\&  fix_cp1252 $text;
.Ve
.PP
When called in a void context, as in these examples, \f(CW\*(C`zap_cp1252()\*(C'\fR and
\&\f(CW\*(C`fix_cp1252()\*(C'\fR subroutines perform \fIin place\fR conversions of any \s-1CP1252\s0
gremlins into their appropriate \s-1ASCII\s0 approximations or \s-1UTF\-8\s0 equivalents,
respectively. Note that because the conversion happens in place, the data to
be converted \fIcannot\fR be a string constant; it must be a scalar variable.
.PP
When called in a scalar or list context, on the other hand, a copy will be
modified and returned. The original string will be unchanged:
.PP
.Vb 2
\&  my $clean_latin1 = zap_cp1252 $latin1_text;
\&  my $fixed_utf8   = fix_cp1252 $utf8_text;
.Ve
.PP
In this case, even constant values can be processed. Either way, \f(CW\*(C`undef\*(C'\fRs
will be ignored.
.PP
In Perl 5.10 and higher, the functions may optionally be called with no
arguments, in which case \f(CW$_\fR will be converted, instead:
.PP
.Vb 4
\&  zap_cp1252; # Modify $_ in\-place.
\&  fix_cp1252; # Modify $_ in\-place.
\&  my $zapped = zap_cp1252; # Copy $_ and return zapped
\&  my $fixed = fix_cp1252; # Copy $_ and return fixed
.Ve
.PP
In Perl 5.8.8 and higher, the conversion will work even when the string is
decoded to Perl's internal form (usually via \f(CW\*(C`decode \*(AqISO\-8859\-1\*(Aq, $text\*(C'\fR) or
the string is encoded (and thus simply processed by Perl as a series of
bytes). The conversion will even work on a string that has not been decoded
but has had its \f(CW\*(C`utf8\*(C'\fR flag flipped anyway (usually by an injudicious use of
\&\f(CW\*(C`Encode::_utf8_on()\*(C'\fR). This is to enable the highest possible likelihood of
removing those \s-1CP1252\s0 gremlins no matter what kind of processing has already
been executed on the string.
.PP
That said, although \f(CW\*(C`fix_cp1252()\*(C'\fR takes a conservative approach to replacing
text in Unicode strings, it should be used as a very last option. Really,
avoid that situation if you can.
.SH "Conversion Table"
.IX Header "Conversion Table"
Here's how the characters are converted to \s-1ASCII\s0 and \s-1UTF\-8.\s0 The \s-1ASCII\s0
conversions are not perfect, but they should be good enough for general
cleanup. If you want perfect, switch to \s-1UTF\-8\s0 and be done with it!
.PP
.Vb 10
\&   Hex | Char  | ASCII | UTF\-8 Name
\&  \-\-\-\-\-+\-\-\-\-\-\-\-+\-\-\-\-\-\-\-+\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-
\&  0x80 |   \[u20AC]   |   e   | EURO SIGN
\&  0x82 |   \[u201A]   |   ,   | SINGLE LOW\-9 QUOTATION MARK
\&  0x83 |   \[u0192]   |   f   | LATIN SMALL LETTER F WITH HOOK
\&  0x84 |   \[u201E]   |   ,,  | DOUBLE LOW\-9 QUOTATION MARK
\&  0x85 |   \[u2026]   |  ...  | HORIZONTAL ELLIPSIS
\&  0x86 |   \[u2020]   |   +   | DAGGER
\&  0x87 |   \[u2021]   |   ++  | DOUBLE DAGGER
\&  0x88 |   \[u02C6]   |   ^   | MODIFIER LETTER CIRCUMFLEX ACCENT
\&  0x89 |   \[u2030]   |   %   | PER MILLE SIGN
\&  0x8a |   \[u0160]   |   S   | LATIN CAPITAL LETTER S WITH CARON
\&  0x8b |   \[u2039]   |   <   | SINGLE LEFT\-POINTING ANGLE QUOTATION MARK
\&  0x8c |   \[u0152]   |   OE  | LATIN CAPITAL LIGATURE OE
\&  0x8e |   \[u017D]   |   Z   | LATIN CAPITAL LETTER Z WITH CARON
\&  0x91 |   \[u2018]   |   \*(Aq   | LEFT SINGLE QUOTATION MARK
\&  0x92 |   \[u2019]   |   \*(Aq   | RIGHT SINGLE QUOTATION MARK
\&  0x93 |   \[u201C]   |   "   | LEFT DOUBLE QUOTATION MARK
\&  0x94 |   \[u201D]   |   "   | RIGHT DOUBLE QUOTATION MARK
\&  0x95 |   \[u2022]   |   *   | BULLET
\&  0x96 |   \[u2013]   |   \-   | EN DASH
\&  0x97 |   \[u2014]   |   \-\-  | EM DASH
\&  0x98 |   \[u02DC]   |   ~   | SMALL TILDE
\&  0x99 |   \[u2122]   |  (tm) | TRADE MARK SIGN
\&  0x9a |   \[u0161]   |   s   | LATIN SMALL LETTER S WITH CARON
\&  0x9b |   \[u203A]   |   >   | SINGLE RIGHT\-POINTING ANGLE QUOTATION MARK
\&  0x9c |   \[u0153]   |   oe  | LATIN SMALL LIGATURE OE
\&  0x9e |   \[u017E]   |   z   | LATIN SMALL LETTER Z WITH CARON
\&  0x9f |   \[u0178]   |   Y   | LATIN CAPITAL LETTER Y WITH DIAERESIS
.Ve
.SS "Changing the Tables"
.IX Subsection "Changing the Tables"
Don't like these conversions? You can modify them to your heart's content by
accessing this module's internal conversion tables. For example, if you wanted
\&\f(CW\*(C`zap_cp1252()\*(C'\fR to use an uppercase \*(L"E\*(R" for the euro sign, just do this:
.PP
.Vb 1
\&  local $Encode::ZapCP1252::ascii_for{"\ex80"} = \*(AqE\*(Aq;
.Ve
.PP
Or if, for some reason, you wanted the \s-1UTF\-8\s0 equivalent for a bullet
converted by \f(CW\*(C`fix_cp1252()\*(C'\fR to be a black square, you can assign the
bytes (never a Unicode string) like so:
.PP
.Vb 1
\&  local $Encode::ZapCP1252::utf8_for{"\ex95"} = Encode::encode_utf8("\ex{25A0}");
.Ve
.PP
Just remember, without \f(CW\*(C`local\*(C'\fR this would be a global change. In that case,
be careful if your code zaps \s-1CP1252\s0 elsewhere. Of course, it shouldn't really
be doing that. These functions are just for cleaning up messes in one spot in
your code, not for making a fundamental part of your text handling. For that,
use Encode.
.SH "See Also"
.IX Header "See Also"
.IP "Encode" 4
.IX Item "Encode"
.PD 0
.IP "Encoding::FixLatin" 4
.IX Item "Encoding::FixLatin"
.IP "Wikipedia: Windows\-1252 <https://en.wikipedia.org/wiki/Windows-1252>" 4
.IX Item "Wikipedia: Windows-1252 <https://en.wikipedia.org/wiki/Windows-1252>"
.PD
.SH "Support"
.IX Header "Support"
This module is stored in an open GitHub
repository <https://github.com/theory/encode-zapcp1252/>. Feel free to fork
and contribute!
.PP
Please file bug reports via GitHub
Issues <https://github.com/theory/encode-zapcp1252/issues/> or by sending mail to
bug\-Encode\-ZapCP1252@rt.cpan.org <mailto:bug-Encode-ZapCP1252@rt.cpan.org>.
.SH "Author"
.IX Header "Author"
David E. Wheeler <david@justatheory.com>
.SH "Acknowledgments"
.IX Header "Acknowledgments"
My thanks to Sean Burke for sending me his original method for converting
\&\s-1CP1252\s0 gremlins to more-or-less appropriate \s-1ASCII\s0 characters, and to Karl
Williamson for more correct handling of Unicode strings.
.SH "Copyright and License"
.IX Header "Copyright and License"
Copyright (c) 2005\-2020 David E. Wheeler. Some Rights Reserved.
.PP
This module is free software; you can redistribute it and/or modify it under the
same terms as Perl itself.