;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*- ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN. ;; Licensed to the Free Software Foundation. ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc. ;; Author: TAKAHASHI Naoto ;; Maintainer: FSF ;; Keywords: multilingual, Unicode, UTF-8, i18n ;; This file is part of GNU Emacs. ;; GNU Emacs is free software; you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation; either version 2, or (at your option) ;; any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to the ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, ;; Boston, MA 02111-1307, USA. ;;; Commentary: ;; The coding-system `mule-utf-8' basically supports encoding/decoding ;; of the following character sets to and from UTF-8: ;; ;; ascii ;; eight-bit-control ;; latin-iso8859-1 ;; mule-unicode-0100-24ff ;; mule-unicode-2500-33ff ;; mule-unicode-e000-ffff ;; ;; On decoding, Unicode characters that do not fit into the above ;; character sets are handled as `eight-bit-control' or ;; `eight-bit-graphic' characters to retain the information about the ;; original byte sequence and text properties record the corresponding ;; unicode. ;; ;; Fixme: note that reading and writing invalid utf-8 may not be ;; idempotent -- to represent the bytes to fix that needs a new charset. ;; ;; Characters from other character sets can be encoded with ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and ;; registering the translation with `register-char-codings'. Hash ;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to ;; support encoding and decoding of about a quarter of the CJK space ;; between U+3400 and U+DFFF. ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is: ;; scalar | utf-8 ;; value | 1st byte | 2nd byte | 3rd byte ;; --------------------+-----------+-----------+---------- ;; 0000 0000 0xxx xxxx | 0xxx xxxx | | ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx | ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx ;;; Code: (defvar ucs-mule-to-mule-unicode (make-translation-table) "Translation table for encoding to `mule-utf-8'.") (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode) (defvar utf-8-subst-table (make-hash-table :test 'eq)) (defvar utf-8-subst-rev-table (make-hash-table :test 'eq)) (define-translation-hash-table 'utf-8-subst-table utf-8-subst-table) (define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table) (defvar utf-8-translation-table-for-decode (make-translation-table) "Translation table applied after decoding utf-8 to mule-unicode. This is only actually applied to characters which would normally be decoded into mule-unicode-0100-24ff.") (define-translation-table 'utf-8-translation-table-for-decode utf-8-translation-table-for-decode) ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the ;; space of mule-unicode. For Latin scripts this isn't very ;; important. Hebrew and Arabic might go here too when there's proper ;; support for them. (defvar utf-8-fragmentation-table (make-translation-table) "Char table normally mapping non-Latin mule-unicode-... characters to iso8859. Used as the value of `utf-8-translation-table-for-decode' in `utf-8-fragment-on-decoding' mode.") (mapc (lambda (pair) (aset utf-8-fragmentation-table (car pair) (cdr pair))) '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B) (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B) (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B) (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B) (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B) (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B) (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B) (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B) (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B) (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B) (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B) (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B) (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B) (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B) (?$,1'N(B . ?,F~(B) (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B) (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B) (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B) (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B) (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B) (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B) (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B) (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B) (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B) (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B) (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B) (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B) (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B) (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B) (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B) (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B) (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B) (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B) (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B))) (defcustom utf-8-fragment-on-decoding nil "Whether or not to decode some scripts in UTF-8 text into iso8859 charsets. Setting this means that the relevant Cyrillic and Greek characters are decoded into the iso8859 charsets rather than into mule-unicode-0100-24ff. The iso8859 charsets take half as much space in the buffer, but using them may affect how the buffer can be re-encoded and may require a different input method to search for them, for instance. See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode' for mechanisms to make this largely transparent. Setting this variable outside customize has no effect." :set (lambda (s v) (setq utf-8-translation-table-for-decode (if v utf-8-fragmentation-table (make-translation-table))) (define-translation-table 'utf-8-translation-table-for-decode utf-8-translation-table-for-decode) (set-default s v)) :version "21.4" :type 'boolean :group 'mule) (defcustom utf-8-translate-cjk nil "Whether the `mule-utf-8' coding system should encode many CJK characters. Enabling this loads tables which enable the coding system to encode characters in the charsets `korean-ksc5601', `chinese-gb2312' and `japanese-jisx0208', and to decode the corresponding unicodes into such characters. This works by loading the library `utf-8-subst'; see its commentary. The tables are fairly large (about 33000 entries), so this option is not the default." :link '(emacs-commentary-link "utf-8-subst") :set (lambda (s v) (when v (require 'utf-8-subst) (let ((table (make-char-table 'translation-table))) (coding-system-put 'mule-utf-8 'safe-charsets (append (coding-system-get 'mule-utf-8 'safe-charsets) '(korean-ksc5601 chinese-gb2312 japanese-jisx0208))) (maphash (lambda (k v) (aset table k v)) utf-8-subst-rev-table) (register-char-codings 'mule-utf-8 table))) (set-default s v)) :version "21.4" :type 'boolean :group 'mule) (define-ccl-program ccl-decode-mule-utf-8 ;; ;; charset | bytes in utf-8 | bytes in emacs ;; -----------------------+----------------+--------------- ;; ascii | 1 | 1 ;; -----------------------+----------------+--------------- ;; eight-bit-control | 2 | 2 ;; eight-bit-graphic | 2 | 1 ;; latin-iso8859-1 | 2 | 2 ;; -----------------------+----------------+--------------- ;; mule-unicode-0100-24ff | 2 | 4 ;; (< 0800) | | ;; -----------------------+----------------+--------------- ;; mule-unicode-0100-24ff | 3 | 4 ;; (>= 8000) | | ;; mule-unicode-2500-33ff | 3 | 4 ;; mule-unicode-e000-ffff | 3 | 4 ;; ;; Thus magnification factor is two. ;; `(2 ((r5 = ,(charset-id 'eight-bit-control)) (r6 = ,(charset-id 'eight-bit-graphic)) (loop (read r0) ;; 1byte encoding, i.e., ascii (if (r0 < #x80) (write r0) (if (r0 < #xc0) ; continuation byte (invalid here) (if (r0 < #xa0) (write-multibyte-character r5 r0) (write-multibyte-character r6 r0)) ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx (if (r0 < #xe0) ((read r1) (if ((r1 & #b11000000) != #b10000000) ;; Invalid 2-byte sequence ((if (r0 < #xa0) (write-multibyte-character r5 r0) (write-multibyte-character r6 r0)) (if (r1 < #x80) (write r1) (if (r1 < #xa0) (write-multibyte-character r5 r1) (write-multibyte-character r6 r1)))) ((r3 = r0) ; save in case of overlong sequence (r2 = r1) (r0 &= #x1f) (r0 <<= 6) (r2 = r1) ; save in case of overlong sequence (r1 &= #x3f) (r1 += r0) ;; Now r1 holds scalar value (if (r1 < 128) ; `overlong sequence' ((if (r3 < #xa0) (write-multibyte-character r5 r3) (write-multibyte-character r6 r3)) (if (r2 < #x80) (write r2) (if (r2 < #xa0) (write-multibyte-character r5 r2) (write-multibyte-character r6 r2)))) ;; eight-bit-control (if (r1 < 160) ((write-multibyte-character r5 r1)) ;; latin-iso8859-1 (if (r1 < 256) ((r0 = ,(charset-id 'latin-iso8859-1)) (r1 -= 128) (write-multibyte-character r0 r1)) ;; mule-unicode-0100-24ff (< 0800) ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) (r1 -= #x0100) (r2 = (((r1 / 96) + 32) << 7)) (r1 %= 96) (r1 += (r2 + 32)) (translate-character utf-8-translation-table-for-decode r0 r1) (write-multibyte-character r0 r1)))))))) ;; 3byte encoding ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx (if (r0 < #xf0) ((read r1 r2) ;; This is set to 1 if the encoding is invalid. (r4 = 0) (r3 = (r1 & #b11000000)) (r3 |= ((r2 >> 2) & #b00110000)) (if (r3 != #b10100000) (r4 = 1) ((r3 = ((r0 & #x0f) << 12)) (r3 += ((r1 & #x3f) << 6)) (r3 += (r2 & #x3f)) (if (r3 < #x0800) (r4 = 1)))) (if (r4 != 0) ;; Invalid 3-byte sequence ((if (r0 < #xa0) (write-multibyte-character r5 r0) (write-multibyte-character r6 r0)) (if (r1 < #x80) (write r1) (if (r1 < #xa0) (write-multibyte-character r5 r1) (write-multibyte-character r6 r1))) (if (r2 < #x80) (write r2) (if (r2 < #xa0) (write-multibyte-character r5 r2) (write-multibyte-character r6 r2)))) ;; mule-unicode-0100-24ff (>= 0800) ((if (r3 < #x2500) ((r0 = ,(charset-id 'mule-unicode-0100-24ff)) (r3 -= #x0100) (r3 //= 96) (r1 = (r7 + 32)) (r1 += ((r3 + 32) << 7)) (translate-character utf-8-translation-table-for-decode r0 r1) (write-multibyte-character r0 r1)) ;; mule-unicode-2500-33ff ;; Fixme: Perhaps allow translation via ;; utf-8-subst-table for #x2e80 up, so that we use ;; consistent charsets for all of CJK. Would need ;; corresponding change to encoding tables. (if (r3 < #x3400) ((r0 = ,(charset-id 'mule-unicode-2500-33ff)) (r3 -= #x2500) (r3 //= 96) (r1 = (r7 + 32)) (r1 += ((r3 + 32) << 7)) (write-multibyte-character r0 r1)) ;; U+3400 .. U+D7FF ;; Try to convert to CJK chars, else keep ;; them as eight-bit-{control|graphic}. (if (r3 < #xd800) ((r4 = r3) ; don't zap r3 (lookup-integer utf-8-subst-table r4 r5) (if r7 ;; got a translation ((write-multibyte-character r4 r5) ;; Zapped through register starvation. (r5 = ,(charset-id 'eight-bit-control))) ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic ((r3 = r6) (write-multibyte-character r3 r0) (if (r1 < #xa0) (r3 = r5)) (write-multibyte-character r3 r1) (if (r2 < #xa0) (r3 = r5) (r3 = r6)) (write-multibyte-character r3 r2)))) ;; Surrogates, U+D800 .. U+DFFF (if (r3 < #xe000) ((r3 = r6) (write-multibyte-character r3 r0) ; eight-bit-graphic (if (r1 < #xa0) (r3 = r5)) (write-multibyte-character r3 r1) (if (r2 < #xa0) (r3 = r5) (r3 = r6)) (write-multibyte-character r3 r2)) ;; mule-unicode-e000-ffff ;; Fixme: fffe and ffff are invalid. ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) (r3 -= #xe000) (r3 //= 96) (r1 = (r7 + 32)) (r1 += ((r3 + 32) << 7)) (write-multibyte-character r0 r1))))))))) (if (r0 < #xfe) ;; 4byte encoding ;; keep those bytes as eight-bit-{control|graphic} ;; Fixme: allow lookup in utf-8-subst-table. ((read r1 r2 r3) ;; r0 > #xf0, thus eight-bit-graphic (write-multibyte-character r6 r0) (if (r1 < #xa0) (if (r1 < #x80) ; invalid byte (write r1) (write-multibyte-character r5 r1)) (write-multibyte-character r6 r1)) (if (r2 < #xa0) (if (r2 < #x80) ; invalid byte (write r2) (write-multibyte-character r5 r2)) (write-multibyte-character r6 r2)) (if (r3 < #xa0) (if (r3 < #x80) ; invalid byte (write r3) (write-multibyte-character r5 r3)) (write-multibyte-character r6 r3)) (if (r0 >= #xf8) ; 5- or 6-byte encoding ((read r1) (if (r1 < #xa0) (if (r1 < #x80) ; invalid byte (write r1) (write-multibyte-character r5 r1)) (write-multibyte-character r6 r1)) (if (r0 >= #xfc) ; 6-byte ((read r1) (if (r1 < #xa0) (if (r1 < #x80) ; invalid byte (write r1) (write-multibyte-character r5 r1)) (write-multibyte-character r6 r1))))))) ;; else invalid byte >= #xfe (write-multibyte-character r6 r0)))))) (repeat)))) "CCL program to decode UTF-8. Basic decoding is done into the charsets ascii, latin-iso8859-1 and mule-unicode-*, but see also `utf-8-translation-table-for-decode' and `utf-8-subst-table'. Encodings of un-representable Unicode characters are decoded asis into eight-bit-control and eight-bit-graphic characters.") (define-ccl-program ccl-encode-mule-utf-8 `(1 ((r5 = -1) (loop (if (r5 < 0) ((r1 = -1) (read-multibyte-character r0 r1) (translate-character ucs-mule-to-mule-unicode r0 r1)) (;; We have already done read-multibyte-character. (r0 = r5) (r1 = r6) (r5 = -1))) (if (r0 == ,(charset-id 'ascii)) (write r1) (if (r0 == ,(charset-id 'latin-iso8859-1)) ;; r1 scalar utf-8 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx ;; 20 0000 0000 1010 0000 1100 0010 1010 0000 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111 ((r0 = (((r1 & #x40) >> 6) | #xc2)) (r1 &= #x3f) (r1 |= #x80) (write r0 r1)) (if (r0 == ,(charset-id 'mule-unicode-0100-24ff)) ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) ;; #x3f80 == (0011 1111 1000 0000)b (r1 &= #x7f) (r1 += (r0 + 224)) ; 240 == -32 + #x0100 ;; now r1 holds scalar value (if (r1 < #x0800) ;; 2byte encoding ((r0 = (((r1 & #x07c0) >> 6) | #xc0)) ;; #x07c0 == (0000 0111 1100 0000)b (r1 &= #x3f) (r1 |= #x80) (write r0 r1)) ;; 3byte encoding ((r0 = (((r1 & #xf000) >> 12) | #xe0)) (r2 = ((r1 & #x3f) | #x80)) (r1 &= #x0fc0) (r1 >>= 6) (r1 |= #x80) (write r0 r1 r2)))) (if (r0 == ,(charset-id 'mule-unicode-2500-33ff)) ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) (r1 &= #x7f) (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500 (r0 = (((r1 & #xf000) >> 12) | #xe0)) (r2 = ((r1 & #x3f) | #x80)) (r1 &= #x0fc0) (r1 >>= 6) (r1 |= #x80) (write r0 r1 r2)) (if (r0 == ,(charset-id 'mule-unicode-e000-ffff)) ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96)) (r1 &= #x7f) (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000 (r0 = (((r1 & #xf000) >> 12) | #xe0)) (r2 = ((r1 & #x3f) | #x80)) (r1 &= #x0fc0) (r1 >>= 6) (r1 |= #x80) (write r0 r1 r2)) (if (r0 == ,(charset-id 'eight-bit-control)) ;; r1 scalar utf-8 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx ;; 80 0000 0000 1000 0000 1100 0010 1000 0000 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111 ((write #xc2) (write r1)) (if (r0 == ,(charset-id 'eight-bit-graphic)) ;; r1 scalar utf-8 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx ;; a0 0000 0000 1010 0000 1100 0010 1010 0000 ;; ff 0000 0000 1111 1111 1101 1111 1011 1111 ((write r1) (r1 = -1) (read-multibyte-character r0 r1) (if (r0 != ,(charset-id 'eight-bit-graphic)) (if (r0 != ,(charset-id 'eight-bit-control)) ((r5 = r0) (r6 = r1)))) (if (r5 < 0) ((read-multibyte-character r0 r2) (if (r0 != ,(charset-id 'eight-bit-graphic)) (if (r0 != ,(charset-id 'eight-bit-control)) ((r5 = r0) (r6 = r2)))) (if (r5 < 0) (write r1 r2) (if (r1 < #xa0) (write r1) ((write #xc2) (write r1))))))) ((lookup-character utf-8-subst-rev-table r0 r1) (if r7 ; lookup succeeded ((r1 = (((r0 & #xf000) >> 12) | #xe0)) (r2 = ((r0 & #x3f) | #x80)) (r0 &= #x0fc0) (r0 >>= 6) (r0 |= #x80) (write r1 r0 r2)) ;; Unsupported character. ;; Output U+FFFD, which is `ef bf bd' in UTF-8. ((write #xef) (write #xbf) (write #xbd))))))))))) (repeat))) (if (r1 >= #xa0) (write r1) (if (r1 >= #x80) ((write #xc2) (write r1))))) "CCL program to encode into UTF-8.") ;; Dummy definition so that the CCL can be checked correctly; the ;; actual data are loaded on demand. (unless (boundp 'ucs-mule-8859-to-mule-unicode) ; don't zap it (define-translation-table 'ucs-mule-8859-to-mule-unicode)) (define-ccl-program ccl-untranslated-to-ucs `(0 (if (r0 < #xf0) ; 3-byte encoding, as above ((r4 = 0) (r3 = (r1 & #b11000000)) (r3 |= ((r2 >> 2) & #b00110000)) (if (r3 != #b10100000) (r4 = 1) ((r3 = ((r0 & #x0f) << 12)) (r3 += ((r1 & #x3f) << 6)) (r3 += (r2 & #x3f)) (if (r3 < #x0800) (r4 = 1)))) (if (r4 != 0) (r0 = 0) (r0 = r3))) (if (r0 < #xf8) ; 4-byte (Mule-UCS recipe) ((r4 = (r1 >> 6)) (if (r4 != #b10) (r0 = 0) ((r4 = (r2 >> 6)) (if (r4 != #b10) (r0 = 0) ((r4 = (r3 >> 6)) (if (r4 != #b10) (r0 = 0) ((r1 = ((r1 & #x3F) << 12)) (r2 = ((r2 & #x3F) << 6)) (r3 &= #x3F) (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3))))))))) (r0 = 0)))) "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0. r0 == 0 for invalid sequence.") (defvar utf-8-ccl-regs (make-vector 8 0)) (defsubst utf-8-untranslated-to-ucs () "Return the UCS code for an untranslated sequence of raw bytes t point. Only for 3- or 4-byte sequences." (aset utf-8-ccl-regs 0 (or (char-after) 0)) (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0)) (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0)) (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0)) (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs) (aref utf-8-ccl-regs 0)) (defun utf-8-help-echo (window object position) (format "Untranslated Unicode U+%04X" (get-char-property position 'untranslated-utf-8 object))) ;; We compose the untranslatable sequences into a single character. ;; This is infelicitous for editing, because there's currently no ;; mechanism for treating compositions as atomic, but is OK for ;; display. They are composed to U+FFFD with help-echo which ;; indicates the unicodes they represent. This function GCs too much. (defsubst utf-8-compose () "Put a suitable composition on an untranslatable sequence. Return the sequence's length." (let* ((u (utf-8-untranslated-to-ucs)) (l (unless (zerop u) (if (>= u #x10000) 4 3)))) (when l (put-text-property (point) (min (point-max) (+ l (point))) 'untranslated-utf-8 u) (put-text-property (point) (min (point-max) (+ l (point))) 'help-echo 'utf-8-help-echo) (compose-region (point) (+ l (point)) ?$,3u=(B) l))) (defcustom utf-8-compose-scripts nil "*Non-nil means compose various scripts on decoding utf-8 text." :group 'mule :version "21.4" :type 'boolean) (defun utf-8-post-read-conversion (length) "Compose untranslated utf-8 sequences into single characters. Also compose particular scripts if `utf-8-compose-scripts' is non-nil." (save-excursion ;; Can't do eval-when-compile to insert a multibyte constant ;; version of the string in the loop, since it's always loaded as ;; unibyte from a byte-compiled file. (let ((range (string-as-multibyte "^\xe1-\xf7"))) (while (and (skip-chars-forward range) (not (eobp))) (forward-char (utf-8-compose))))) ;; Fixme: Takahashi-san implies it may not work this easily. I ;; asked why but didn't get a reply. -- fx (when (and utf-8-compose-scripts (> length 1)) ;; These currently have definitions which cover the relevant ;; unicodes. We could avoid loading thai-util &c by checking ;; whether the region contains any characters with the appropriate ;; categories. There aren't yet Unicode-based rules for Tibetan. (save-excursion (setq length (diacritic-post-read-conversion length))) (save-excursion (setq length (thai-post-read-conversion length))) (save-excursion (setq length (lao-post-read-conversion length))) (save-excursion (setq length (in-is13194-devanagari-post-read-conversion length)))) length) ;; ucs-tables is preloaded ;; (defun utf-8-pre-write-conversion (beg end) ;; "Semi-dummy pre-write function effectively to autoload ucs-tables." ;; ;; Ensure translation table is loaded. ;; (require 'ucs-tables) ;; ;; Don't do this again. ;; (coding-system-put 'mule-utf-8 'pre-write-conversion nil) ;; nil) (make-coding-system 'mule-utf-8 4 ?u "UTF-8 encoding for Emacs-supported Unicode characters. The supported Emacs character sets are the following, plus any other characters included in the tables `ucs-mule-to-mule-unicode' and `utf-8-subst-rev-table': ascii eight-bit-control eight-bit-graphic latin-iso8859-1 latin-iso8859-2 latin-iso8859-3 latin-iso8859-4 cyrillic-iso8859-5 greek-iso8859-7 hebrew-iso8859-8 latin-iso8859-9 latin-iso8859-14 latin-iso8859-15 mule-unicode-0100-24ff mule-unicode-2500-33ff mule-unicode-e000-ffff Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208 \(see user option `utf-8-translate-cjk'); otherwise, sequences of eight-bit-control and eight-bit-graphic characters are used to preserve their byte sequences, and these are composed to display as a single character. Emacs characters that otherwise can't be encoded are encoded as U+FFFD." '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8) '((safe-charsets ascii eight-bit-control eight-bit-graphic latin-iso8859-1 latin-iso8859-15 latin-iso8859-14 latin-iso8859-9 hebrew-iso8859-8 greek-iso8859-7 cyrillic-iso8859-5 latin-iso8859-4 latin-iso8859-3 latin-iso8859-2 vietnamese-viscii-lower vietnamese-viscii-upper thai-tis620 ipa ethiopic indian-is13194 katakana-jisx0201 chinese-sisheng lao mule-unicode-0100-24ff mule-unicode-2500-33ff mule-unicode-e000-ffff) (mime-charset . utf-8) (coding-category . coding-category-utf-8) (valid-codes (0 . 255)) ;; (pre-write-conversion . utf-8-pre-write-conversion) (post-read-conversion . utf-8-post-read-conversion))) (define-coding-system-alias 'utf-8 'mule-utf-8) ;; I think this needs special private charsets defined for the ;; untranslated sequences, if it's going to work well. ;;; (defun utf-8-compose-function (pos to pattern &optional string) ;;; (let* ((prop (get-char-property pos 'composition string)) ;;; (l (and prop (- (cadr prop) (car prop))))) ;;; (cond ((and l (> l (- to pos))) ;;; (delete-region pos to)) ;;; ((and (> (char-after pos) 224) ;;; (< (char-after pos) 256) ;;; (save-restriction ;;; (narrow-to-region pos to) ;;; (utf-8-compose))) ;;; t)))) ;;; (dotimes (i 96) ;;; (aset composition-function-table ;;; (+ 128 i) ;;; `((,(string-as-multibyte "[\200-\237\240-\377]") ;;; . utf-8-compose-function)))) ;;; utf-8.el ends here