1 ;;; code-pages.el --- coding systems for assorted codepages -*-coding: utf-8;-*-
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
5 ;; Author: Dave Love <fx@gnu.org>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
27 ;; Definitions of miscellaneous 8-bit coding systems based on ASCII
28 ;; (we can't cope properly with EBCDIC, for instance), mainly for PC
29 ;; `code pages'. They are decoded into Latin-1 and mule-unicode
30 ;; charsets rather than (lossily) into single iso8859 charsets à la
31 ;; codepage.el. The utility `cp-make-coding-system' derives them from
34 ;; Those covered are: cp437, cp737, cp720, cp775, cp850, cp851, cp852,
35 ;; cp855, cp857, cp860, cp861, cp862, cp863, cp864, cp865, cp866,
36 ;; cp869, cp874, cp1125, windows-1250, windows-1253, windows-1254,
37 ;; windows-1255, windows-1256, windows-1257, windows-1258, next,
38 ;; iso-8859-6, iso-8859-10, iso-8859-11, iso-8859-16, koi8-t,
39 ;; georgian-ps. This is meant to include all the single-byte ones
40 ;; relevant to GNU (used in glibc-defined locales); we don't yet get
41 ;; all the multibyte ones in base Emacs.
43 ;; Note that various of these can clash with definitions in
44 ;; codepage.el; we try to avoid damage from that. A few CPs from
45 ;; codepage.el (770, 773, 774) aren't covered (in the absence of
46 ;; translation tables to Unicode).
48 ;; Compile this, to avoid loading `ccl' at runtime.
50 ;; Although the tables used here aren't very big, it might be worth
51 ;; splitting the file and autoloading the coding systems if/when my
52 ;; (or similar) autoloading code is installed.
56 ;; The defsubsts here are just so that language files can use
57 ;; `cp-make-coding-system' and not require functions from this file
;; Build the translation table for an 8-bit coding system from vector V.
;; Visible steps: a 256-slot `encoding-vector' is allocated, slot I is
;; set to I and slot I+128 to element I of V; the vector is converted to
;; a translation table TAB, whose extra slot 0 is taken as the table
;; used for encoding.  `ucs-mule-to-mule-unicode' is then walked so that
;; a character C whose mapped value V already has an encoding is given
;; that same encoding in the encode table.
;; NOTE(review): the loop heads around the two `aset' forms and the tail
;; of the function are not present in this view -- presumably `dotimes'
;; loops over the 128 elements and a final return of TAB; confirm
;; against the complete file.
60 (defsubst cp-make-translation-table
(v)
61 "Return a translation table made from 128-long vector V.
62 V comprises characters encodable by mule-utf-8."
63 (let ((encoding-vector (make-vector 256 0)))
65 (aset encoding-vector i i
))
67 (aset encoding-vector
(+ i
128) (aref v i
)))
68 ;; Add equivalent characters to the encoder so that we can unify
70 (let* ((tab (make-translation-table-from-vector encoding-vector
))
71 ;; Translation table used for encoding:
72 (encode-table (char-table-extra-slot tab
0)))
73 (map-char-table (lambda (c v
)
75 (let ((c1 (aref encode-table v
)))
76 (if c1
; we encode that unicode
77 (aset encode-table c c1
)))))
78 ucs-mule-to-mule-unicode
)
;; Compute (START . END) code ranges for which vector V has a mapping.
;; Visible state: I starts at 128 and indexes V as (- I 128); START and
;; END begin as 0 and 127.  A non-nil element of V starts or extends the
;; current range (START is set to I when no range is open); finished
;; ranges are pushed onto PAIRS as (START . END) conses, and a range
;; still open after the scan is pushed as well.
;; NOTE(review): the enclosing binding form, the loop construct, the
;; updates of END and the returned expression are not present in this
;; view; confirm against the complete file.
81 (defsubst cp-valid-codes
(v)
82 "Derive a valid-codes list for translation vector V.
83 See `make-coding-system'."
85 (i 128) ; index into v
86 (start 0) ; start of a valid range
87 (end 127)) ; end of a valid range
89 (if (aref v
(- i
128)) ; start or extend range
92 (unless start
(setq start i
)))
94 (push (cons start end
) pairs
))
97 (if start
(push (cons start end
) pairs
))
100 ;; Fix things that have been, or might be, done by codepage.el.
101 (eval-after-load "codepage"
104 ;; Semi-dummy version for the stuff in codepage.el which we don't
105 ;; define here. (Used by mule-diag.)
(defun cp-supported-codepages ()
  "Return an alist of supported codepages.

Each association in the alist has the form (NNN . CHARSET), where NNN is the
codepage number, and CHARSET is the MULE charset which is the closest match
for the character set supported by that codepage.

A codepage NNN is supported if a variable called `cpNNN-decode-table' exists,
is a vector, and has a charset property."
  ;; This version does no lookup: it returns a fixed alist of three
  ;; codepages, each paired with `latin-iso8859-4'.
  '(("774" . latin-iso8859-4)
    ("770" . latin-iso8859-4)
    ("773" . latin-iso8859-4)))
118 ;; A version which doesn't override the coding systems set up by this
119 ;; file. It could still be used for the few missing ones from
;; Create coding system cpCODEPAGE unless one of that name already
;; exists.  The prompt code reads a codepage with completion over
;; `cp-supported-codepages', ignoring case, with "437" as the default.
;; NOTE(review): the form that wraps the prompt code (presumably
;; `interactive') is not present in this view; confirm against the
;; complete file.
121 (defun codepage-setup (codepage)
122 "Create a coding system cpCODEPAGE to support the IBM codepage CODEPAGE.
124 These coding systems are meant for encoding and decoding 8-bit non-ASCII
125 characters used by the IBM codepages, typically in conjunction with files
126 read/written by MS-DOS software, or for display on the MS-DOS terminal."
128 (let ((completion-ignore-case t
)
129 (candidates (cp-supported-codepages)))
130 (list (completing-read "Setup DOS Codepage: (default 437) " candidates
131 nil t nil nil
"437"))))
;; Build the name "cpCODEPAGE"; only when no coding system of that name
;; exists are the codepage.el-style helpers called to create it.
132 (let ((cp (format "cp%s" codepage
)))
133 (unless (coding-system-p (intern cp
))
134 (cp-make-coding-systems-for-codepage
135 cp
(cp-charset-for-codepage cp
) (cp-offset-for-codepage cp
))))))
138 ;; Macro to allow ccl compilation at byte-compile time, avoiding
;; Macro that expands into the definition of an 8-bit coding system NAME
;; whose upper half is described by vector V.
;; At expansion time it interns the symbols `encode-NAME' and
;; `decode-NAME'; these are referenced from the CCL fragments below and
;; are the names under which the expansion defines its two translation
;; tables.  The expansion binds `translation-table' to
;; (cp-make-translation-table V) and `codes' to (cp-valid-codes V), and
;; builds a property list with `safe-chars', `valid-codes',
;; `mime-charset' (NAME itself) and `translation-table-for-input'.
;; NOTE(review): several lines are absent from this view -- the heads of
;; the CCL program definitions bound to `ccl-decoder'/`ccl-encoder', the
;; head of the `make-coding-system' call, and the body of the `dolist'
;; that updates `non-iso-charset-alist'.  The default mnemonic is
;; presumably `?*' (the character literal is split across two lines
;; here); confirm against the complete file.
141 (defmacro cp-make-coding-system
(name v
&optional doc-string mnemonic
)
142 "Make coding system NAME for and 8-bit, extended-ASCII character set.
143 V is a 128-long vector of characters to translate the upper half of
144 the character set. DOC-STRING and MNEMONIC are used as the
145 corresponding args of `make-coding-system'. If MNEMONIC isn't given,
147 Return an updated `non-iso-charset-alist'."
148 (let* ((encoder (intern (format "encode-%s" name
)))
149 (decoder (intern (format "decode-%s" name
)))
;; Decoding fragment: r0 receives a charset id (ASCII when r1 < 128,
;; otherwise eight-bit-control or eight-bit-graphic -- the test choosing
;; between those two is not visible), then the pair r0/r1 is translated
;; through the decoder table and `utf-translation-table-for-decode'
;; before being written as a multibyte character.
155 (if (r1 < 128) ;; ASCII
156 (r0 = ,(charset-id 'ascii
))
158 (r0 = ,(charset-id 'eight-bit-control
))
159 (r0 = ,(charset-id 'eight-bit-graphic
))))
160 (translate-character ,decoder r0 r1
)
161 ;; Allow fragmentation on decoding -- relevant for
162 ;; Cyrillic, Greek and, possibly Arabic and Hebrew.
163 (translate-character utf-translation-table-for-decode r0 r1
)
164 (write-multibyte-character r0 r1
)
;; Encoding fragment: a multibyte character is read into r0/r1 and
;; translated through the encoder table; r0 is then compared against the
;; ascii, eight-bit-graphic and eight-bit-control charset ids (the
;; action taken when all three differ is not visible) before r1 is
;; written.
170 (read-multibyte-character r0 r1
)
171 (translate-character ,encoder r0 r1
)
172 (if (r0 != ,(charset-id 'ascii
))
173 (if (r0 != ,(charset-id 'eight-bit-graphic
))
174 (if (r0 != ,(charset-id 'eight-bit-control
))
176 (write-repeat r1
)))))))
;; Expansion proper: compute the tables from V, then register them; the
;; encoder table is extra slot 0 of the decoder's translation table.
177 `(let ((translation-table (cp-make-translation-table ,v
))
178 (codes (cp-valid-codes ,v
)))
179 (define-translation-table ',decoder translation-table
)
180 (define-translation-table ',encoder
181 (char-table-extra-slot translation-table
0))
183 ',name
4 ,(or mnemonic ?
*)
184 (or ,doc-string
(format "%s encoding" ',name
))
185 (cons ,ccl-decoder
,ccl-encoder
)
186 (list (cons 'safe-chars
(get ',encoder
'translation-table
))
187 (cons 'valid-codes codes
)
188 (cons 'mime-charset
',name
)
189 ;; For Quail translation. Fixme: this should really be
190 ;; a separate table that only translates the coding
191 ;; system's safe-chars.
192 (cons 'translation-table-for-input
,ucs-mule-to-mule-unicode
)))
196 (let (l) ; code range
197 (dolist (elt (reverse codes
))
201 non-iso-charset-alist
))))
;; Declare `non-iso-charset-alist' for the byte compiler only; no value
;; is assigned here.
(eval-when-compile (defvar non-iso-charset-alist))
205 ;; These tables were mostly derived by running something like
206 ;; `recode -f cpxxx/..utf-8' on a binary file filled by
207 ;; `(dotimes (i 128) (insert ?? ?\\ (+ 128 i) ?\n))' and then
208 ;; exchanging the ?\� entries for nil. iconv was used instead in some
211 ;; Fixme: Do better for mode-line mnemonics?
213 (cp-make-coding-system
344 (cp-make-coding-system
;; Clear the `mime-charset' property of `cp737'.
(coding-system-put 'cp737 'mime-charset nil) ; not in IANA list
476 (cp-make-coding-system
607 (cp-make-coding-system
738 (cp-make-coding-system
869 (cp-make-coding-system
1000 (cp-make-coding-system
1131 (cp-make-coding-system
1262 (cp-make-coding-system
1393 (cp-make-coding-system
1524 (cp-make-coding-system
1655 (cp-make-coding-system
1786 (cp-make-coding-system
1917 (cp-make-coding-system
2048 (cp-make-coding-system
2181 (cp-make-coding-system
2312 (cp-make-coding-system
2443 ;;;###autoload(autoload-coding-system 'windows-1250 '(require 'code-pages))
2444 (cp-make-coding-system
2575 ;;;###autoload(autoload-coding-system 'windows-1253 '(require 'code-pages))
2576 (cp-make-coding-system
2708 ;;;###autoload(autoload-coding-system 'windows-1254 '(require 'code-pages))
2709 (cp-make-coding-system
2841 ;;;###autoload(autoload-coding-system 'windows-1255 '(require 'code-pages))
2842 (cp-make-coding-system
2974 ;;;###autoload(autoload-coding-system 'windows-1256 '(require 'code-pages))
2975 (cp-make-coding-system
3107 ;;;###autoload(autoload-coding-system 'windows-1257 '(require 'code-pages))
3108 (cp-make-coding-system
3239 ;;;###autoload(autoload-coding-system 'windows-1258 '(require 'code-pages))
3240 (cp-make-coding-system
3371 (cp-make-coding-system
3501 "NeXTstep encoding." ?N
)
3503 (cp-make-coding-system
3504 koi8-t
; used by glibc for tg_TJ
3633 "Unicode-based KOI8-T encoding for Cyrillic")
;; Clear the `mime-charset' property of `koi8-t' and make
;; `cyrillic-koi8-t' an alias for it.
(coding-system-put 'koi8-t 'mime-charset nil) ; not in the IANA list
(define-coding-system-alias 'cyrillic-koi8-t 'koi8-t)
3637 ;; Online final ISO draft:
3639 ;; http://www.evertype.com/standards/iso8859/fdis8859-16-en.pdf
3641 ;; Equivalent National Standard:
3642 ;; Romanian Standard SR 14111:1998, Romanian Standards Institution
3647 ;; "This set of coded graphic characters is intended for use in data and
3648 ;; text processing applications and also for information interchange. The
3649 ;; set contains graphic characters used for general purpose applications in
3650 ;; typical office environments in at least the following languages:
3651 ;; Albanian, Croatian, English, Finnish, French, German, Hungarian, Irish
3652 ;; Gaelic (new orthography), Italian, Latin, Polish, Romanian, and
3653 ;; Slovenian. This set of coded graphic characters may be regarded as a
3654 ;; version of an 8-bit code according to ISO/IEC 2022 or ISO/IEC 4873 at
3655 ;; level 1." [ISO 8859-16:2001(E), p. 1]
3657 ;; This charset is suitable for use in MIME text body parts.
3659 ;; ISO 8859-16 was primarily designed for single-byte encoding the Romanian
3660 ;; language. The UTF-8 charset is the preferred and in today's MIME software
3661 ;; more widely implemented encoding suitable for Romanian.
3662 ;;;###autoload(autoload-coding-system 'iso-8859-16 '(require 'code-pages))
3663 (cp-make-coding-system
3664 iso-latin-10
; consistent with, e.g. Latin-1
3665 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3666 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3763 "Unicode-based encoding for Latin-10 (MIME: ISO-8859-16)"
;; Advertise `iso-latin-10' under the MIME name `iso-8859-16', and make
;; `iso-8859-16' and `latin-10' aliases for it.
(coding-system-put 'iso-latin-10 'mime-charset 'iso-8859-16)
(define-coding-system-alias 'iso-8859-16 'iso-latin-10)
(define-coding-system-alias 'latin-10 'iso-latin-10)
3769 ;; Unicode-based alternative which has the possible advantage of
3770 ;; having its relative sparseness specified.
3771 (cp-make-coding-system
3772 ;; The base system uses arabic-iso-8bit, but that's not a MIME charset.
3774 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3775 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3827 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3828 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3829 nil nil nil nil nil nil nil nil nil nil nil
]
3830 "Unicode-based Arabic ISO/IEC 8859-6 (MIME: ISO-8859-6)"
;; Make `arabic-iso-8bit' an alias for `iso-8859-6'.
(define-coding-system-alias 'arabic-iso-8bit 'iso-8859-6)
3834 ;;;###autoload(autoload-coding-system 'iso-8859-10 '(require 'code-pages))
3835 (cp-make-coding-system
3837 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3838 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3935 "Unicode-based encoding for Latin-6 (MIME: ISO-8859-10)")
;; Advertise `iso-latin-6' under the MIME name `iso-8859-10', and make
;; `iso-8859-10' and `latin-6' aliases for it.
(coding-system-put 'iso-latin-6 'mime-charset 'iso-8859-10)
(define-coding-system-alias 'iso-8859-10 'iso-latin-6)
(define-coding-system-alias 'latin-6 'iso-latin-6)
3940 ;; used by lt_LT, lv_LV, mi_NZ
3941 ;;;###autoload(autoload-coding-system 'iso-8859-13 '(require 'code-pages))
3942 (cp-make-coding-system
3944 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3945 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4043 "Unicode-based encoding for Latin-7 (MIME: ISO-8859-13)"
4044 ?l
) ;; Lithuanian/Latvian
;; Advertise `iso-latin-7' under the MIME name `iso-8859-13', and make
;; `iso-8859-13' and `latin-7' aliases for it.
(coding-system-put 'iso-latin-7 'mime-charset 'iso-8859-13)
(define-coding-system-alias 'iso-8859-13 'iso-latin-7)
(define-coding-system-alias 'latin-7 'iso-latin-7)
4049 ;; Fixme: check on the C1 characters which libiconv includes. They
4050 ;; are reproduced below, but are probably wrong. I can't find an
4051 ;; official definition of georgian-ps.
4052 ;;;###autoload(autoload-coding-system 'georgian-ps '(require 'code-pages))
4053 (cp-make-coding-system
4054 georgian-ps
; used by glibc for ka_GE
;; Clear the `mime-charset' property of `georgian-ps'.
(coding-system-put 'georgian-ps 'mime-charset nil) ; not in IANA list
4186 ;; From http://www.microsoft.com/globaldev/reference/oem/720.htm
4187 (cp-make-coding-system
;; Clear the `mime-charset' property of `cp720'.
(coding-system-put 'cp720 'mime-charset nil) ; not in IANA list
4319 ;; http://oss.software.ibm.com/cvs/icu/charset/data/ucm/ibm-1125_P100-2000.ucm
4320 (cp-make-coding-system
;; Make `ruscii' and `cp866u' aliases for `cp1125', then clear its
;; `mime-charset' property.
(define-coding-system-alias 'ruscii 'cp1125)
;; Original name for cp1125, says Serhii Hlodin <hlodin@lutsk.bank.gov.ua>
(define-coding-system-alias 'cp866u 'cp1125)
(coding-system-put 'cp1125 'mime-charset nil)
4455 ;; Suggested by Anton Zinoviev <anton@lml.bas.bg>: Bulgarian DOS
4456 ;; codepage. Table at
4457 ;; <URL:http://czyborra.com/charsets/bulgarian-mik.txt.gz>.
4458 (cp-make-coding-system
4460 [?А ?Б ?В ?Г ?Д ?Е ?Ж ?З ?И ?Й ?К ?Л ?М ?Н ?О ?П ?Р ?С ?Т ?У ?Ф ?Х ?Ц
4461 ?Ч ?Ш ?Щ ?Ъ ?Ы ?Ь ?Э ?Ю ?Я ?а ?б ?в ?г ?д ?е ?ж ?з ?и ?й ?к ?л ?м ?н
4462 ?о ?п ?р ?с ?т ?у ?ф ?х ?ц ?ч ?ш ?щ ?ъ ?ы ?ь ?э ?ю ?я ?└ ?┴ ?┬ ?├ ?─
4463 ?┼ ?╣ ?║ ?╚ ?╔ ?╩ ?╦ ?╠ ?═ ?╬ ?┐ ?░ ?▒ ?▓ ?│ ?┤ ?№ ?§ ?╗ ?╝ ?┘ ?┌ ?█
4464 ?▄ ?▌ ?▐ ?▀ ?α ?β ?Γ ?π ?Σ ?σ ?μ ?τ ?Φ ?Θ ?Ω ?δ ?∞ ?∅ ?∈ ?∩ ?≡ ?± ?≥
4465 ?≤ ?⌠ ?⌡ ?÷ ?≈ ?° ?∙ ?· ?√ ?ⁿ ?² ?■ ?
])
;; Clear the `mime-charset' property of `mik'.
(coding-system-put 'mik 'mime-charset nil)
4468 ;; Suggested by Anton Zinoviev <anton@lml.bas.bg>: similar to CP1251
4469 ;; and used for some non-Slavic Cyrillic languages. Table found at
4470 ;; <URL:ftp://ftp.logic.ru/pub/logic/linux/cyr-asian/PT154>. See also
4471 ;; <URL:http://lists.w3.org/Archives/Public/ietf-charsets/2002AprJun/0092.html>,
4472 ;; which suggests it's used in an Asian Cyrillic context.
4473 ;;;###autoload(autoload-coding-system 'pt154 '(require 'code-pages))
4474 (cp-make-coding-system
4476 [?Җ ?Ғ ?Ӯ ?ғ ?„ ?… ?Ҷ ?Ү ?Ҳ ?ү ?Ҡ ?Ӣ ?Ң ?Қ ?Һ ?Ҹ ?җ ?‘ ?’ ?“ ?” ?• ?–
4477 ?— ?ҳ ?ҷ ?ҡ ?ӣ ?ң ?қ ?һ ?ҹ ? ?Ў ?ў ?Ј ?Ө ?Ҙ ?Ұ ?§ ?Ё ?© ?Ә ?\« ?¬ ?ӯ
4478 ?® ?Ҝ ?° ?ұ ?І ?і ?ҙ ?ө ?¶ ?· ?ё ?№ ?ә ?» ?ј ?Ҫ ?ҫ ?ҝ ?А ?Б ?В ?Г ?Д
4479 ?Е ?Ж ?З ?И ?Й ?К ?Л ?М ?Н ?О ?П ?Р ?С ?Т ?У ?Ф ?Х ?Ц ?Ч ?Ш ?Щ ?Ъ ?Ы
4480 ?Ь ?Э ?Ю ?Я ?а ?б ?в ?г ?д ?е ?ж ?з ?и ?й ?к ?л ?м ?н ?о ?п ?р ?с ?т
4481 ?у ?ф ?х ?ц ?ч ?ш ?щ ?ъ ?ы ?ь ?э ?ю ?я
])
4483 ;;;###autoload(autoload-coding-system 'iso-8859-11 '(require 'code-pages))
4484 (cp-make-coding-system
4486 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4487 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4488 ? ?ก ?ข ?ฃ ?ค ?ฅ ?ฆ ?ง ?จ ?ฉ ?ช ?ซ ?ฌ ?ญ ?ฎ ?ฏ
4489 ?ฐ ?ฑ ?ฒ ?ณ ?ด ?ต ?ถ ?ท ?ธ ?น ?บ ?ป ?ผ ?ฝ ?พ ?ฟ
4490 ?ภ ?ม ?ย ?ร ?ฤ ?ล ?ฦ ?ว ?ศ ?ษ ?ส ?ห ?ฬ ?อ ?ฮ ?ฯ
4491 ?ะ ?ั ?า ?ำ ?ิ ?ี ?ึ ?ื ?ุ ?ู ?ฺ nil nil nil nil ?฿
4492 ?เ ?แ ?โ ?ใ ?ไ ?ๅ ?ๆ ?็ ?่ ?้ ?๊ ?๋ ?์ ?ํ ?๎ ?๏
4493 ?๐ ?๑ ?๒ ?๓ ?๔ ?๕ ?๖ ?๗ ?๘ ?๙ ?๚ ?๛ nil nil nil nil
]
4494 "ISO-8859-11. This is `thai-tis620' with the addition of no-break-space.")
;; Body of a loop over index I (the loop head is not present in this
;; view).  W is the symbol `windows-125I' and C the symbol `cp125I'.
4497 (let ((w (intern (format "windows-125%d" i
)))
4498 (c (intern (format "cp125%d" i
))))
4499 ;; Define cp125* as aliases for all windows-125*, so on Windows
4500 ;; we can just concat "cp" to the ANSI codepage we get from the system
4501 ;; and not have to worry about whether it should be "cp" or "windows-".
4502 (if (coding-system-p w
)
4503 (define-coding-system-alias c w
))
4504 ;; Compatibility with codepage.el, though cp... are not the
;; NOTE(review): the form below pushes the entry looked up under W (nil
;; when `assoc' finds none) back onto `non-iso-charset-alist'; it does
;; not create an entry keyed by C.  Confirm this is intended.
4506 (push (assoc w non-iso-charset-alist
) non-iso-charset-alist
)))
;; Announce the `code-pages' feature so that (require 'code-pages)
;; succeeds.
(provide 'code-pages)
4510 ;;; arch-tag: 8b6e3c73-b271-4198-866d-ea6d0ceff1b2
4511 ;;; code-pages.el ends here