1 ;;; code-pages.el --- coding systems for assorted codepages -*-coding: utf-8;-*-
3 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 ;; Author: Dave Love <fx@gnu.org>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
27 ;; Definitions of miscellaneous 8-bit coding systems based on ASCII
28 ;; (we can't cope properly with EBCDIC, for instance), mainly for PC
29 ;; `code pages'. They are decoded into Latin-1 and mule-unicode
30 ;; charsets rather than (lossily) into single iso8859 charsets à la
31 ;; codepage.el. The utility `cp-make-coding-system' derives them from
34 ;; Those covered are: cp437, cp737, cp720, cp775, cp850, cp851, cp852,
35 ;; cp855, cp857, cp860, cp861, cp862, cp863, cp864, cp865, cp866,
36 ;; cp869, cp874, cp1125, windows-1250, windows-1251, windows-1252,
37 ;; windows-1253, windows-1254, windows-1255, windows-1256,
38 ;; windows-1257, windows-1258, next, koi8-u, iso-8859-6,
39 ;; iso-8859-10, iso-8859-11, iso-8859-16, koi8-t, georgian-ps. This
40 ;; is meant to include all the single-byte ones relevant to GNU (used
41 ;; in glibc-defined locales); we don't yet get all the multibyte ones
44 ;; Note that various of these can clash with definitions in
45 ;; codepage.el; we try to avoid damage from that. A few CPs from
46 ;; codepage.el (770, 773, 774) aren't covered (in the absence of
47 ;; translation tables to Unicode).
49 ;; Compile this, to avoid loading `ccl' at runtime.
51 ;; Although the tables used here aren't very big, it might be worth
52 ;; splitting the file and autoloading the coding systems if/when my
53 ;; (or similar) autoloading code is installed.
;; Summary of the visible steps: a 256-slot vector is created, slot I
;; is set to I and slot I+128 to element I of V, a translation table
;; is made from that vector, and the table held in its extra slot 0
;; (the one used for encoding) is extended while mapping over
;; `ucs-mule-to-mule-unicode'.
57 (defun cp-make-translation-table (v)
58 "Return a translation table made from 128-long vector V.
59 V comprises characters encodable by mule-utf-8."
60 (let ((encoding-vector (make-vector 256 0)))
62 (aset encoding-vector i i
))
64 (aset encoding-vector
(+ i
128) (aref v i
)))
65 ;; Add equivalent characters to the encoder so that we can unify
67 (let* ((tab (make-translation-table-from-vector encoding-vector
))
68 ;; Translation table used for encoding:
69 (encode-table (char-table-extra-slot tab
0)))
70 (map-char-table (lambda (c v
)
72 (let ((c1 (aref encode-table v
)))
;; C1 is the encode-table entry for V; when it is non-nil the same
;; entry is stored under C as well.
73 (if c1
; we encode that unicode
74 (aset encode-table c c1
)))))
75 ucs-mule-to-mule-unicode
)
;; Visible logic: V is indexed with I - 128.  A non-nil entry starts a
;; range (START is set to I when it is nil) or extends one, and
;; (START . END) conses are pushed onto `pairs'.  The trailing
;; `(if start ...)' form pushes one more cons when START is non-nil.
78 (defun cp-valid-codes (v)
79 "Derive a valid-codes list for translation vector V.
80 See `make-coding-system'."
;; START and END begin as 0 and 127; I begins at 128, the code that
;; corresponds to element 0 of V.
82 (i 128) ; index into v
83 (start 0) ; start of a valid range
84 (end 127)) ; end of a valid range
86 (if (aref v
(- i
128)) ; start or extend range
89 (unless start
(setq start i
)))
91 (push (cons start end
) pairs
))
94 (if start
(push (cons start end
) pairs
))
;; Does nothing unless CS already names a coding system.  Otherwise it
;; reads the `safe-chars' property of CS and, for entries whose value V
;; is non-nil and not t, stores back into `char-coding-system-table'
;; the entry at K with CS removed by `remq'.
;; NOTE(review): the form that binds K and V is not among the lines
;; shown here -- presumably a mapping over CHARS; confirm.
97 (defun cp-fix-safe-chars (cs)
98 "Remove `char-coding-system-table' entries from previous definition of CS.
99 CS is a base coding system or alias."
100 (when (coding-system-p cs
)
101 (let ((chars (coding-system-get cs
'safe-chars
)))
104 (if (and v
(not (eq v t
)))
105 (aset char-coding-system-table
107 (remq cs
(aref char-coding-system-table k
)))))
110 ;; Fix things that have been, or might be, done by codepage.el.
;; The form is deferred with `eval-after-load' and calls
;; `cp-fix-safe-chars' on each coding system name in the quoted list.
111 (eval-after-load "codepage"
114 (dolist (cs '(cp857 cp861 cp1253 cp852 cp866 cp437 cp855 cp869 cp775
115 cp862 cp864 cp1250 cp863 cp865 cp1251 cp737 cp1257 cp850
117 (cp-fix-safe-chars cs
))
119 ;; Semi-dummy version for the stuff in codepage.el which we don't
120 ;; define here. (Used by mule-diag.)
;; The value is a constant quoted alist: the string keys 774, 770 and
;; 773, each paired with `latin-iso8859-4'.  Nothing in this body
;; inspects `cpNNN-decode-table' variables, so the last paragraph of
;; the docstring does not describe what this version does.
121 (defun cp-supported-codepages ()
122 "Return an alist of supported codepages.
124 Each association in the alist has the form (NNN . CHARSET), where NNN is the
125 codepage number, and CHARSET is the MULE charset which is the closest match
126 for the character set supported by that codepage.
128 A codepage NNN is supported if a variable called `cpNNN-decode-table' exists,
129 is a vector, and has a charset property."
130 '(("774" . latin-iso8859-4
) ("770" . latin-iso8859-4
)
131 ("773" . latin-iso8859-4
)))
133 ;; A version which doesn't override the coding systems set up by this
134 ;; file. It could still be used for the few missing ones from
;; Argument reading: `completing-read' over the alist returned by
;; `cp-supported-codepages', with case ignored, a match required and
;; the string 437 as the default.
;; Body: the name cpCODEPAGE is formatted, and only if it is not
;; already a coding system is `cp-make-coding-systems-for-codepage'
;; called with that name plus its charset and offset.
;; NOTE(review): `cp-make-coding-systems-for-codepage',
;; `cp-charset-for-codepage' and `cp-offset-for-codepage' are not
;; defined in the lines shown here -- presumably they come from
;; codepage.el; confirm they are loaded before this can run.
136 (defun codepage-setup (codepage)
137 "Create a coding system cpCODEPAGE to support the IBM codepage CODEPAGE.
139 These coding systems are meant for encoding and decoding 8-bit non-ASCII
140 characters used by the IBM codepages, typically in conjunction with files
141 read/written by MS-DOS software, or for display on the MS-DOS terminal."
143 (let ((completion-ignore-case t
)
144 (candidates (cp-supported-codepages)))
145 (list (completing-read "Setup DOS Codepage: (default 437) " candidates
146 nil t nil nil
"437"))))
147 (let ((cp (format "cp%s" codepage
)))
148 (unless (coding-system-p (intern cp
))
149 (cp-make-coding-systems-for-codepage
150 cp
(cp-charset-for-codepage cp
) (cp-offset-for-codepage cp
))))))
153 ;; For `non-iso-charset-alist'. Do this after redefining
154 ;; `cp-supported-codepages', which is called through loading
158 ;; Macro to allow ccl compilation at byte-compile time, avoiding
;; Expansion overview, from the visible template: the generated code
;; builds a translation table and a valid-codes list from V, defines
;; the `decode-NAME' and `encode-NAME' translation tables (the encoder
;; is extra slot 0 of the decoder's table), calls `cp-fix-safe-chars'
;; on NAME, and supplies the `safe-chars', `valid-codes',
;; `mime-charset' and `translation-table-for-input' properties.
;; The decoding CCL fragment chooses a charset id for r0 (ASCII when
;; r1 < 128) and then translates with the decoder table; the encoding
;; fragment reads a multibyte character and translates it with the
;; encoder table.
161 (defmacro cp-make-coding-system
(name v
&optional doc-string mnemonic
)
162 "Make coding system NAME for an 8-bit, extended-ASCII character set.
163 V is a 128-long vector of characters to translate the upper half of
164 the character set. DOC-STRING and MNEMONIC are used as the
165 corresponding args of `make-coding-system'. If MNEMONIC isn't given,
167 (let* ((encoder (intern (format "encode-%s" name
)))
168 (decoder (intern (format "decode-%s" name
)))
174 (if (r1 < 128) ;; ASCII
175 (r0 = ,(charset-id 'ascii
))
177 (r0 = ,(charset-id 'eight-bit-control
))
178 (r0 = ,(charset-id 'eight-bit-graphic
))))
179 (translate-character ,decoder r0 r1
)
180 ;; Allow fragmentation on decoding -- relevant for
181 ;; Cyrillic, Greek and, possibly Arabic and Hebrew.
182 (translate-character utf-translation-table-for-decode r0 r1
)
183 (write-multibyte-character r0 r1
)
189 (read-multibyte-character r0 r1
)
190 (translate-character ,encoder r0 r1
)
191 (if (r0 != ,(charset-id 'ascii
))
192 (if (r0 != ,(charset-id 'eight-bit-graphic
))
193 (if (r0 != ,(charset-id 'eight-bit-control
))
195 (write-repeat r1
)))))))
196 `(let ((translation-table (cp-make-translation-table ,v
))
197 (codes (cp-valid-codes ,v
)))
198 (define-translation-table ',decoder translation-table
)
199 (define-translation-table ',encoder
200 (char-table-extra-slot translation-table
0))
201 (cp-fix-safe-chars ',name
)
203 ',name
4 ,(or mnemonic ?
*)
204 (or ,doc-string
(format "%s encoding" ',name
))
205 (cons ,ccl-decoder
,ccl-encoder
)
206 (list (cons 'safe-chars
(get ',encoder
'translation-table
))
207 (cons 'valid-codes codes
)
208 (cons 'mime-charset
',name
)
209 ;; For Quail translation. Fixme: this should really be
210 ;; a separate table that only translates the coding
211 ;; system's safe-chars.
212 (cons 'translation-table-for-input
,ucs-mule-to-mule-unicode
)))
216 (let (l) ; code range
217 (dolist (elt (reverse codes
))
221 non-iso-charset-alist
))))
224 ;; These tables were mostly derived by running something like
225 ;; `recode -f cpxxx/..utf-8' on a binary file filled by
226 ;; `(dotimes (i 128) (insert ?? ?\\ (+ 128 i) ?\n))' and then
227 ;; exchanging the ?\� entries for nil. iconv was used instead in some
230 ;; Fixme: Do better for mode-line mnemonics?
232 (cp-make-coding-system
363 (cp-make-coding-system
493 (coding-system-put 'cp737
'mime-charset nil
) ; not in IANA list
495 (cp-make-coding-system
626 (cp-make-coding-system
757 (cp-make-coding-system
888 (cp-make-coding-system
1019 (cp-make-coding-system
1150 (cp-make-coding-system
1281 (cp-make-coding-system
1412 (cp-make-coding-system
1543 (cp-make-coding-system
1674 (cp-make-coding-system
1805 (cp-make-coding-system
1936 (cp-make-coding-system
2067 (cp-make-coding-system
2200 (cp-make-coding-system
2331 (cp-make-coding-system
2462 (cp-make-coding-system
2594 (cp-make-coding-system
2726 (cp-make-coding-system
2857 (cp-make-coding-system
2989 (cp-make-coding-system
3121 (cp-make-coding-system
3253 (cp-make-coding-system
3385 (cp-make-coding-system
3516 (cp-make-coding-system
3647 (cp-make-coding-system
3777 "NeXTstep encoding." ?N
)
3779 (cp-make-coding-system
3780 koi8-t
; used by glibc for tg_TJ
3909 "Unicode-based KOI8-T encoding for Cyrillic")
3910 (coding-system-put 'koi8-t
'mime-charset nil
) ; not in the IANA list
3912 ;; Online final ISO draft:
3914 ;; http://www.evertype.com/standards/iso8859/fdis8859-16-en.pdf
3916 ;; Equivalent National Standard:
3917 ;; Romanian Standard SR 14111:1998, Romanian Standards Institution
3922 ;; "This set of coded graphic characters is intended for use in data and
3923 ;; text processing applications and also for information interchange. The
3924 ;; set contains graphic characters used for general purpose applications in
3925 ;; typical office environments in at least the following languages:
3926 ;; Albanian, Croatian, English, Finnish, French, German, Hungarian, Irish
3927 ;; Gaelic (new orthography), Italian, Latin, Polish, Romanian, and
3928 ;; Slovenian. This set of coded graphic characters may be regarded as a
3929 ;; version of an 8-bit code according to ISO/IEC 2022 or ISO/IEC 4873 at
3930 ;; level 1." [ISO 8859-16:2001(E), p. 1]
3932 ;; This charset is suitable for use in MIME text body parts.
3934 ;; ISO 8859-16 was primarily designed for single-byte encoding the Romanian
3935 ;; language. The UTF-8 charset is the preferred and in today's MIME software
3936 ;; more widely implemented encoding suitable for Romanian.
3937 (cp-make-coding-system
3938 iso-latin-10
; consistent with, e.g. Latin-1
3939 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3940 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4037 "Unicode-based encoding for Latin-10 (MIME: ISO-8859-16)"
4039 (coding-system-put 'iso-latin-10
'mime-charset
'iso-8859-16
)
4040 (define-coding-system-alias 'iso-8859-16
'iso-latin-10
)
4041 (define-coding-system-alias 'latin-10
'iso-latin-10
)
4043 ;; Unicode-based alternative which has the possible advantage of
4044 ;; having its relative sparseness specified.
4045 (cp-make-coding-system
4046 ;; The base system uses arabic-iso-8bit, but that's not a MIME charset.
4048 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4049 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4101 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4102 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4103 nil nil nil nil nil nil nil nil nil nil nil
]
4104 "Unicode-based Arabic ISO/IEC 8859-6 (MIME: ISO-8859-6)"
4106 (define-coding-system-alias 'arabic-iso-8bit
'iso-8859-6
)
4108 (cp-make-coding-system
4110 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4111 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4208 "Unicode-based encoding for Latin-6 (MIME: ISO-8859-10)")
4209 (coding-system-put 'iso-latin-6
'mime-charset
'iso-8859-10
)
4210 (define-coding-system-alias 'iso-8859-10
'iso-latin-6
)
4211 (define-coding-system-alias 'latin-6
'iso-latin-6
)
4213 ;; used by lt_LT, lv_LV, mi_NZ
4214 (cp-make-coding-system
4216 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4217 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4315 "Unicode-based encoding for Latin-7 (MIME: ISO-8859-13)"
4316 ?l
) ;; Lithuanian/Latvian
4317 (coding-system-put 'iso-latin-7
'mime-charset
'iso-8859-13
)
4318 (define-coding-system-alias 'iso-8859-13
'iso-latin-7
)
4319 (define-coding-system-alias 'latin-7
'iso-latin-7
)
4321 (cp-make-coding-system
4322 georgian-ps
; used by glibc for ka_GE
4452 (coding-system-put 'georgian-ps
'mime-charset nil
) ; not in IANA list
4454 ;; From http://www.microsoft.com/globaldev/reference/oem/720.htm
4455 (cp-make-coding-system
4585 (coding-system-put 'cp720
'mime-charset nil
) ; not in IANA list
4587 ;; http://oss.software.ibm.com/cvs/icu/charset/data/ucm/ibm-1125_P100-2000.ucm
4588 (cp-make-coding-system
4718 (define-coding-system-alias 'ruscii
'cp1125
)
4719 ;; Original name for cp1125, says Serhii Hlodin <hlodin@lutsk.bank.gov.ua>
4720 (define-coding-system-alias 'cp866u
'cp1125
)
4722 ;; Suggested by Anton Zinoviev <anton@lml.bas.bg>: Bulgarian DOS
4723 ;; codepage. Table at
4724 ;; <URL:http://czyborra.com/charsets/bulgarian-mik.txt.gz>.
4725 (cp-make-coding-system
4727 [?А ?Б ?В ?Г ?Д ?Е ?Ж ?З ?И ?Й ?К ?Л ?М ?Н ?О ?П ?Р ?С ?Т ?У ?Ф ?Х ?Ц
4728 ?Ч ?Ш ?Щ ?Ъ ?Ы ?Ь ?Э ?Ю ?Я ?а ?б ?в ?г ?д ?е ?ж ?з ?и ?й ?к ?л ?м ?н
4729 ?о ?п ?р ?с ?т ?у ?ф ?х ?ц ?ч ?ш ?щ ?ъ ?ы ?ь ?э ?ю ?я ?└ ?┴ ?┬ ?├ ?─
4730 ?┼ ?╣ ?║ ?╚ ?╔ ?╩ ?╦ ?╠ ?═ ?╬ ?┐ ?░ ?▒ ?▓ ?│ ?┤ ?№ ?§ ?╗ ?╝ ?┘ ?┌ ?█
4731 ?▄ ?▌ ?▐ ?▀ ?α ?β ?Γ ?π ?Σ ?σ ?μ ?τ ?Φ ?Θ ?Ω ?δ ?∞ ?∅ ?∈ ?∩ ?≡ ?± ?≥
4732 ?≤ ?⌠ ?⌡ ?÷ ?≈ ?° ?∙ ?· ?√ ?ⁿ ?² ?■ ?
])
4734 ;; Suggested by Anton Zinoviev <anton@lml.bas.bg>: similar to CP1251
4735 ;; and used for some non-Slavic Cyrillic languages. Table found at
4736 ;; <URL:ftp://ftp.logic.ru/pub/logic/linux/cyr-asian/PT154>. See also
4737 ;; <URL:http://lists.w3.org/Archives/Public/ietf-charsets/2002AprJun/0092.html>,
4738 ;; which suggests it's used in an Asian Cyrillic context.
4739 (cp-make-coding-system
4741 [?Җ ?Ғ ?Ӯ ?ғ ?„ ?… ?Ҷ ?Ү ?Ҳ ?ү ?Ҡ ?Ӣ ?Ң ?Қ ?Һ ?Ҹ ?җ ?‘ ?’ ?“ ?” ?• ?–
4742 ?— ?ҳ ?ҷ ?ҡ ?ӣ ?ң ?қ ?һ ?ҹ ? ?Ў ?ў ?Ј ?Ө ?Ҙ ?Ұ ?§ ?Ё ?© ?Ә ?\« ?¬ ?ӯ
4743 ?® ?Ҝ ?° ?ұ ?І ?і ?ҙ ?ө ?¶ ?· ?ё ?№ ?ә ?» ?ј ?Ҫ ?ҫ ?ҝ ?А ?Б ?В ?Г ?Д
4744 ?Е ?Ж ?З ?И ?Й ?К ?Л ?М ?Н ?О ?П ?Р ?С ?Т ?У ?Ф ?Х ?Ц ?Ч ?Ш ?Щ ?Ъ ?Ы
4745 ?Ь ?Э ?Ю ?Я ?а ?б ?в ?г ?д ?е ?ж ?з ?и ?й ?к ?л ?м ?н ?о ?п ?р ?с ?т
4746 ?у ?ф ?х ?ц ?ч ?ш ?щ ?ъ ?ы ?ь ?э ?ю ?я
])
4748 (cp-make-coding-system
4750 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4751 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4752 ? ?ก ?ข ?ฃ ?ค ?ฅ ?ฆ ?ง ?จ ?ฉ ?ช ?ซ ?ฌ ?ญ ?ฎ ?ฏ
4753 ?ฐ ?ฑ ?ฒ ?ณ ?ด ?ต ?ถ ?ท ?ธ ?น ?บ ?ป ?ผ ?ฝ ?พ ?ฟ
4754 ?ภ ?ม ?ย ?ร ?ฤ ?ล ?ฦ ?ว ?ศ ?ษ ?ส ?ห ?ฬ ?อ ?ฮ ?ฯ
4755 ?ะ ?ั ?า ?ำ ?ิ ?ี ?ึ ?ื ?ุ ?ู ?ฺ nil nil nil nil ?฿
4756 ?เ ?แ ?โ ?ใ ?ไ ?ๅ ?ๆ ?็ ?่ ?้ ?๊ ?๋ ?์ ?ํ ?๎ ?๏
4757 ?๐ ?๑ ?๒ ?๓ ?๔ ?๕ ?๖ ?๗ ?๘ ?๙ ?๚ ?๛ nil nil nil nil
]
4758 "ISO-8859-11. This is `thai-tis620' with the addition of no-break-space.")
;; For the current I: W is the symbol windows-125I and C is cp125I.
;; C is made an alias of W, and the entry for W found in
;; `non-iso-charset-alist' is pushed onto the front of that list.
;; NOTE(review): the loop that binds I is not among the lines shown
;; here; confirm its range against the windows-125x systems defined.
4761 (let ((w (intern (format "windows-125%d" i
)))
4762 (c (intern (format "cp125%d" i
))))
4763 (define-coding-system-alias c w
)
4764 ;; Compatibility with codepage.el, though cp... are not the
4766 (push (assoc w non-iso-charset-alist
) non-iso-charset-alist
)))
4768 ;; Use Unicode font under Windows. Jason Rumney fecit.
;; Guarded call: it runs only where `w32-add-charset-info' is defined
;; as a function and `w32-unicode-charset-defined' is unbound, so it
;; is skipped everywhere else.
4769 (if (and (fboundp 'w32-add-charset-info
)
4770 (not (boundp 'w32-unicode-charset-defined
)))
4771 (w32-add-charset-info "iso10646-1" 'w32-charset-ansi t
))
4773 (provide 'code-pages
)
4775 ;;; code-pages.el ends here