'\" '\" Copyright (c) 1998 Scriptics Corporation. '\" Copyright (c) 2023 Nathan Coulter '\" '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" .TH encoding 3tcl "8.1" Tcl "Tcl Built-In Commands" .\" The -*- nroff -*- definitions below are for supplemental macros used .\" in Tcl/Tk manual entries. .\" .\" .AP type name in/out ?indent? .\" Start paragraph describing an argument to a library procedure. .\" type is type of argument (int, etc.), in/out is either "in", "out", .\" or "in/out" to describe whether procedure reads or modifies arg, .\" and indent is equivalent to second arg of .IP (shouldn't ever be .\" needed; use .AS below instead) .\" .\" .AS ?type? ?name? .\" Give maximum sizes of arguments for setting tab stops. Type and .\" name are examples of largest possible arguments that will be passed .\" to .AP later. If args are omitted, default tab stops are used. .\" .\" .BS .\" Start box enclosure. From here until next .BE, everything will be .\" enclosed in one large box. .\" .\" .BE .\" End of box enclosure. .\" .\" .CS .\" Begin code excerpt. .\" .\" .CE .\" End code excerpt. .\" .\" .VS ?version? ?br? .\" Begin vertical sidebar, for use in marking newly-changed parts .\" of man pages. The first argument is ignored and used for recording .\" the version when the .VS was added, so that the sidebars can be .\" found and removed when they reach a certain age. If another argument .\" is present, then a line break is forced before starting the sidebar. .\" .\" .VE .\" End of vertical sidebar. .\" .\" .DS .\" Begin an indented unfilled display. .\" .\" .DE .\" End of indented unfilled display. .\" .\" .SO ?manpage? .\" Start of list of standard options for a Tk widget. The manpage .\" argument defines where to look up the standard options; if .\" omitted, defaults to "options". The options follow on successive .\" lines, in three columns separated by tabs. .\" .\" .SE .\" End of list of standard options for a Tk widget. .\" .\" .OP cmdName dbName dbClass .\" Start of description of a specific option. cmdName gives the .\" option's name as specified in the class command, dbName gives .\" the option's name in the option database, and dbClass gives .\" the option's class in the option database. .\" .\" .UL arg1 arg2 .\" Print arg1 underlined, then print arg2 normally. .\" .\" .QW arg1 ?arg2? .\" Print arg1 in quotes, then arg2 normally (for trailing punctuation). .\" .\" .PQ arg1 ?arg2? .\" Print an open parenthesis, arg1 in quotes, then arg2 normally .\" (for trailing punctuation) and then a closing parenthesis. .\" .\" # Set up traps and other miscellaneous stuff for Tcl/Tk man pages. .if t .wh -1.3i ^B .nr ^l \n(.l .ad b .\" # Start an argument description .de AP .ie !"\\$4"" .TP \\$4 .el \{\ . ie !"\\$2"" .TP \\n()Cu . el .TP 15 .\} .ta \\n()Au \\n()Bu .ie !"\\$3"" \{\ \&\\$1 \\fI\\$2\\fP (\\$3) .\".b .\} .el \{\ .br .ie !"\\$2"" \{\ \&\\$1 \\fI\\$2\\fP .\} .el \{\ \&\\fI\\$1\\fP .\} .\} .. .\" # define tabbing values for .AP .de AS .nr )A 10n .if !"\\$1"" .nr )A \\w'\\$1'u+3n .nr )B \\n()Au+15n .\" .if !"\\$2"" .nr )B \\w'\\$2'u+\\n()Au+3n .nr )C \\n()Bu+\\w'(in/out)'u+2n .. .AS Tcl_Interp Tcl_CreateInterp in/out .\" # BS - start boxed text .\" # ^y = starting y location .\" # ^b = 1 .de BS .br .mk ^y .nr ^b 1u .if n .nf .if n .ti 0 .if n \l'\\n(.lu\(ul' .if n .fi .. .\" # BE - end boxed text (draw box now) .de BE .nf .ti 0 .mk ^t .ie n \l'\\n(^lu\(ul' .el \{\ .\" Draw four-sided box normally, but don't draw top of .\" box if the box started on an earlier page. .ie !\\n(^b-1 \{\ \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul' .\} .el \}\ \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\l'|0u-1.5n\(ul' .\} .\} .fi .br .nr ^b 0 .. .\" # VS - start vertical sidebar .\" # ^Y = starting y location .\" # ^v = 1 (for troff; for nroff this doesn't matter) .de VS .if !"\\$2"" .br .mk ^Y .ie n 'mc \s12\(br\s0 .el .nr ^v 1u .. .\" # VE - end of vertical sidebar .de VE .ie n 'mc .el \{\ .ev 2 .nf .ti 0 .mk ^t \h'|\\n(^lu+3n'\L'|\\n(^Yu-1v\(bv'\v'\\n(^tu+1v-\\n(^Yu'\h'-|\\n(^lu+3n' .sp -1 .fi .ev .\} .nr ^v 0 .. .\" # Special macro to handle page bottom: finish off current .\" # box/sidebar if in box/sidebar mode, then invoked standard .\" # page bottom macro. .de ^B .ev 2 'ti 0 'nf .mk ^t .if \\n(^b \{\ .\" Draw three-sided box if this is the box's first page, .\" draw two sides but no top otherwise. .ie !\\n(^b-1 \h'-1.5n'\L'|\\n(^yu-1v'\l'\\n(^lu+3n\(ul'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c .el \h'-1.5n'\L'|\\n(^yu-1v'\h'\\n(^lu+3n'\L'\\n(^tu+1v-\\n(^yu'\h'|0u'\c .\} .if \\n(^v \{\ .nr ^x \\n(^tu+1v-\\n(^Yu \kx\h'-\\nxu'\h'|\\n(^lu+3n'\ky\L'-\\n(^xu'\v'\\n(^xu'\h'|0u'\c .\} .bp 'fi .ev .if \\n(^b \{\ .mk ^y .nr ^b 2 .\} .if \\n(^v \{\ .mk ^Y .\} .. .\" # DS - begin display .de DS .RS .nf .sp .. .\" # DE - end display .de DE .fi .RE .sp .. .\" # SO - start of list of standard options .de SO 'ie '\\$1'' .ds So \\fBoptions\\fR 'el .ds So \\fB\\$1\\fR .SH "STANDARD OPTIONS" .LP .nf .ta 5.5c 11c .ft B .. .\" # SE - end of list of standard options .de SE .fi .ft R .LP See the \\*(So manual entry for details on the standard options. .. .\" # OP - start of full description for a single option .de OP .LP .nf .ta 4c Command-Line Name: \\fB\\$1\\fR Database Name: \\fB\\$2\\fR Database Class: \\fB\\$3\\fR .fi .IP .. .\" # CS - begin code excerpt .de CS .RS .nf .ta .25i .5i .75i 1i .. .\" # CE - end code excerpt .de CE .fi .RE .. .\" # UL - underline word .de UL \\$1\l'|0\(ul'\\$2 .. .\" # QW - apply quotation marks to word .de QW .ie '\\*(lq'"' ``\\$1''\\$2 .\"" fix emacs highlighting .el \\*(lq\\$1\\*(rq\\$2 .. .\" # PQ - apply parens and quotation marks to word .de PQ .ie '\\*(lq'"' (``\\$1''\\$2)\\$3 .\"" fix emacs highlighting .el (\\*(lq\\$1\\*(rq\\$2)\\$3 .. .\" # QR - quoted range .de QR .ie '\\*(lq'"' ``\\$1''\\-``\\$2''\\$3 .\"" fix emacs highlighting .el \\*(lq\\$1\\*(rq\\-\\*(lq\\$2\\*(rq\\$3 .. .\" # MT - "empty" string .de MT .QW "" .. .BS .SH NAME encoding \- Work with encodings .SH SYNOPSIS \fBencoding \fIoperation\fR ?\fIarg arg ...\fR? .BE .SH INTRODUCTION .PP In Tcl every string is composed of Unicode values. Text may be encoded into an encoding such as cp1252, iso8859-1, Shitf\-JIS, utf-8, utf-16, etc. Not every Unicode vealue is encodable in every encoding, and some encodings can encode values that are not available in Unicode. .PP Even though Unicode is for encoding the written texts of human languages, any sequence of bytes can be encoded as the first 255 Unicode values. iso8859-1 an encoding for a subset of Unicode in which each byte is a Unicode value of 255 or less. Thus, any sequence of bytes can be considered to be a Unicode string encoded in iso8859-1. To work with binary data in Tcl, decode it from iso8859-1 when reading it in, and encode it into iso8859-1 when writing it out, ensuring that each character in the string has a value of 255 or less. Decoding such a string does nothing, and encoding encoding such a string also does nothing. .PP For example, the following is true: .CS set text {In Tcl binary data is treated as Unicode text and it just works.} set encoded [encoding convertto iso8859-1 $text] expr {$text eq $encoded}; #-> 1 .CE The following is also true: .CS set decoded [encoding convertfrom iso8859-1 $text] expr {$text eq $decoded}; #-> 1 .CE .SH DESCRIPTION .PP Performs one of the following encoding \fIoperations\fR: .TP \fBencoding convertfrom\fR ?\fIencoding\fR? \fIdata\fR .TP \fBencoding convertfrom\fR ?\fB-profile \fIprofile\fR? ?\fB-failindex var\fR? \fIencoding\fR \fIdata\fR . Decodes \fIdata\fR encoded in \fIencoding\fR. If \fIencoding\fR is not specified the current system encoding is used. .VS "TCL8.7 TIP607, TIP656" \fB-profile\fR determines how invalid data for the encoding are handled. See the \fBPROFILES\fR section below for details. Returns an error if decoding fails. However, if \fB-failindex\fR given, returns the result of the conversion up to the point of termination, and stores in \fBvar\fR the index of the character that could not be converted. If no errors are encountered the entire result of the conversion is returned and the value \fB-1\fR is stored in \fBvar\fR. .VE "TCL8.7 TIP607, TIP656" .TP \fBencoding convertto\fR ?\fIencoding\fR? \fIdata\fR .TP \fBencoding convertto\fR ?\fB-profile \fIprofile\fR? ?\fB-failindex var\fR? \fIencoding\fR \fIdata\fR . Converts \fIstring\fR to \fIencoding\fR. If \fIencoding\fR is not given, the current system encoding is used. .VS "TCL8.7 TIP607, TIP656" See \fBencoding convertfrom\fR for the meaning of \fB-profile\fR and \fB-failindex\fR. .VE "TCL8.7 TIP607, TIP656" .TP \fBencoding dirs\fR ?\fIdirectoryList\fR? . Sets the search path for \fB*.enc\fR encoding data files to the list of directories given by \fIdirectoryList\fR. If \fIdirectoryList\fR is not given, returns the current list of directories that make up the search path. It is not an error for an item in \fIdirectoryList\fR to not refer to a readable, searchable directory. .TP \fBencoding names\fR . Returns a list of the names of available encodings. The encodings .QW utf-8 and .QW iso8859-1 are guaranteed to be present in the list. .VS "TCL8.7 TIP656" .TP \fBencoding profiles\fR Returns a list of names of available encoding profiles. See \fBPROFILES\fR below. .VE "TCL8.7 TIP656" .TP \fBencoding system\fR ?\fIencoding\fR? . Sets the system encoding to \fIencoding\fR. If \fIencoding\fR is not given, returns the current system encoding. The system encoding is used to pass strings to system calls. .\" Do not put .VS on whole section as that messes up the bullet list alignment .SH PROFILES .PP .VS "TCL8.7 TIP656" Each \fIprofile\fR is a distinct strategy for dealing with invalid data for an encoding. .PP The following profiles are currently implemented. .VS "TCL8.7 TIP656" .TP \fBstrict\fR . The default profile. The operation fails when invalid data for the encoding are encountered. .TP \fBtcl8\fR . Provides for behaviour identical to that of Tcl 8.6: When decoding, for encodings \fBother than utf-8\fR, each invalid byte is interpreted as the Unicode value given by that one byte. For example, the byte 0x80, which is invalid in the ASCII encoding would be mapped to the Unicode value U+0080. For \fButf-8\fR, each invalid byte that is a valid CP1252 character is interpreted as the Unicode value for that character, while each byte that is not is treated as the Unicode value given by that one byte. For example, byte 0x80 is defined by CP1252 and is therefore mapped to its Unicode equivalent U+20AC while byte 0x81 which is not defined by CP1252 is mapped to U+0081. As an additional special case, the sequence 0xC0 0x80 is mapped to U+0000. When encoding, each character that cannot be represented in the encoding is replaced by an encoding-dependent character, usually the question mark \fB?\fR. .TP \fBreplace\fR . When decoding, invalid bytes are replaced by U+FFFD, the Unicode REPLACEMENT CHARACTER. When encoding, Unicode values that cannot be represented in the target encoding are transformed to an encoding-specific fallback character, U+FFFD REPLACEMENT CHARACTER for UTF targets, and generally `?` for other encodings. .VE "TCL8.7 TIP656" .SH EXAMPLES .PP These examples use the utility proc below that prints the Unicode value for each character in a string. .PP .CS proc codepoints s {join [lmap c [split $s {}] { string cat U+ [format %.6X [scan $c %c]]}] } .CE .PP Example 1: Convert from euc-jp: .PP .CS % codepoints [\fBencoding convertfrom\fR euc-jp \exA4\exCF] U+00306F .CE .PP The result is the Unicode value .QW "\eu306F" , which is the Hiragana letter HA. .VS "TCL8.7 TIP607, TIP656" .PP Example 2: Error handling based on profiles: .PP The letter \fBA\fR is Unicode character U+0041 and the byte "\ex80" is invalid in ASCII encoding. .PP .CS % codepoints [encoding convertfrom -profile tcl8 ascii A\ex80] U+000041 U+000080 % codepoints [encoding convertfrom -profile replace ascii A\ex80] U+000041 U+00FFFD % codepoints [encoding convertfrom -profile strict ascii A\ex80] unexpected byte sequence starting at index 1: '\ex80' .CE .PP Example 3: Get partial data and the error location: .PP .CS % codepoints [encoding convertfrom -failindex idx ascii AB\ex80] U+000041 U+000042 % set idx 2 .CE .PP Example 4: Encode a character that is not representable in ISO8859-1: .PP .CS % encoding convertto iso8859-1 A\eu0141 A? % encoding convertto -profile strict iso8859-1 A\eu0141 unexpected character at index 1: 'U+000141' % encoding convertto -failindex idx iso8859-1 A\eu0141 A % set idx 1 .CE .VE "TCL8.7 TIP607, TIP656" .PP .SH "SEE ALSO" Tcl_GetEncoding(3tcl), fconfigure(3tcl) .SH KEYWORDS encoding, unicode .\" Local Variables: .\" mode: nroff .\" End: