lisp/international/utf-16.el

   1 ;;; utf-16.el --- UTF-16 encoding/decoding
   2
   3 ;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   4
   5 ;; Author: Dave Love <fx@gnu.org>
   6 ;; Keywords: Unicode, UTF-16, i18n
   7
   8 ;; This file is part of GNU Emacs.
   9
  10 ;; GNU Emacs is free software; you can redistribute it and/or modify
  11 ;; it under the terms of the GNU General Public License as published by
  12 ;; the Free Software Foundation; either version 2, or (at your option)
  13 ;; any later version.
  14
  15 ;; GNU Emacs is distributed in the hope that it will be useful,
  16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18 ;; GNU General Public License for more details.
  19
  20 ;; You should have received a copy of the GNU General Public License
  21 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  23 ;; Boston, MA 02111-1307, USA.
  24
  25 ;;; Commentary:
  26
  27 ;; Support for UTF-16, which is a two-byte encoding (modulo
  28 ;; surrogates) of Unicode, written either in little or big endian
  29 ;; order: coding-systems `mule-utf-16-le' and `mule-utf-16-be'.
  30 ;; (utf-16-le is used by the DozeN'T clipboard, for instance.)  The
  31 ;; data are preceded by a two-byte signature which identifies their
  32 ;; byte sex.  These are used by the coding-category-utf-16-{b,l}e code
  33 ;; to identify the coding, but ignored on decoding.
  34
  35 ;; Note that un-decodable sequences aren't (yet?) preserved as raw
  36 ;; bytes, as they are with utf-8, so reading and writing as utf-16 can
  37 ;; corrupt data.
  38
  39 ;;; Code:
  40
  41 ;; We end up with trivially different -le and -be versions of most
  42 ;; things below, sometimes with commonality abstracted into a let
  43 ;; binding for maintenance convenience.
  44
  45 ;; We'd need new charsets distinct from ascii and eight-bit-control to
  46 ;; deal with untranslated sequences, since we can't otherwise
  47 ;; distinguish the bytes, as we can with utf-8.
  48
  49 ;; ;; Do a multibyte write for bytes in r3 and r4.
  50 ;; ;; Intended for untranslatable utf-16 sequences.
  51 ;; (define-ccl-program ccl-mule-utf-16-untrans
  52 ;;   `(0
  53 ;;      (if (r3 < 128)
  54 ;;       (r0 = ,(charset-id 'ascii))
  55 ;;        (if (r3 < 160)
  56 ;;         (r0 = ,(charset-id 'eight-bit-control))
  57 ;;       (r0 = ,(charset-id 'eight-bit-graphic))))
  58 ;;      (if (r4 < 128)
  59 ;;       (r0 = ,(charset-id 'ascii))
  60 ;;        (if (r4 < 160)
  61 ;;         (r0 = ,(charset-id 'eight-bit-control))
  62 ;;       (r0 = ,(charset-id 'eight-bit-graphic))))
  63 ;;      (r1 = r4)))
  64 ;;   "Do a multibyte write for bytes in r3 and r4.
  65 ;; First swap them if we're big endian, indicated by r5==0.
  66 ;; Intended for untranslatable utf-16 sequences.")
  67
  68 ;; Needed in macro expansion, so can't be let-bound.  Zapped after use.
  69 (eval-and-compile
  70 (defconst utf-16-decode-ucs
  71   ;; CCL fragment spliced into both decoders.  In: Unicode value in r1.
  72   ;; Out: charset ID in r0, code point in r1.  May clobber r2, r3 and r4.
  73   `((lookup-integer utf-subst-table-for-decode r1 r3)
  74     (if r7                              ; got a translation
  75         ((r0 = r1) (r1 = r3))           ; move lookup result into r0/r1
  76       (if (r1 < 128)                    ; U+0000..U+007F
  77           (r0 = ,(charset-id 'ascii))
  78         (if (r1 < 160)                  ; U+0080..U+009F
  79             (r0 = ,(charset-id 'eight-bit-control))
  80           (if (r1 < 256)                ; U+00A0..U+00FF
  81               ((r0 = ,(charset-id 'latin-iso8859-1))
  82                (r1 -= 128))             ; code point is Unicode - 128
  83             (if (r1 < #x2500)           ; U+0100..U+24FF
  84                 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
  85                  (r1 -= #x100)          ; offset within the charset
  86                  (r2 = (((r1 / 96) + 32) << 7)) ; offset/96 + 32, in bits 7 and up
  87                  (r1 %= 96)
  88                  (r1 += (r2 + 32)))     ; offset%96 + 32 in the low 7 bits
  89               (if (r1 < #x3400)         ; U+2500..U+33FF
  90                   ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
  91                    (r1 -= #x2500)       ; offset within the charset
  92                    (r2 = (((r1 / 96) + 32) << 7)) ; same packing as for 0100-24ff
  93                    (r1 %= 96)
  94                    (r1 += (r2 + 32)))
  95                 (if (r1 < #xd800)       ; 2 untranslated bytes
  96                     ;;           ;; Assume this is rare, so don't worry about the
  97                     ;;           ;; overhead of the call.
  98                     ;;           (call mule-utf-16-untrans)
  99                     ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
 100                      (r1 = 15037))      ; U+fffd
 101                   (if (r1 < #xe000)     ; surrogate
 102                       ;;                        ((call mule-utf-16-untrans)
 103                       ;;                         (write-multibyte-character r0 r1)
 104                       ;;                         (read r3 r4)
 105                       ;;                         (call mule-utf-16-untrans))
 106                       ((read r3 r4)     ; consume two more input bytes, unused
 107                        (r0 = ,(charset-id 'mule-unicode-e000-ffff))
 108                        (r1 = 15037))    ; U+fffd
 109                     ((r0 = ,(charset-id 'mule-unicode-e000-ffff)) ; U+E000 and up
 110                      (r1 -= #xe000)     ; offset within the charset
 111                      (r2 = (((r1 / 96) + 32) << 7)) ; same packing as for 0100-24ff
 112                      (r1 %= 96)
 113                      (r1 += (r2 + 32))))))))))))))
 114
 115 (define-ccl-program ccl-decode-mule-utf-16-le
 116   `(2                                   ; 2 bytes -> 1 to 4 bytes
 117     ((read r0 r1)                       ; signature, read and then overwritten
 118      (loop
 119       (read r3 r4)                      ; next two input bytes
 120       (r1 = (r4 <8 r3))                 ; second byte (r4) is the high one
 121       ,utf-16-decode-ucs                ; r1 -> charset ID in r0, code point in r1
 122       (translate-character utf-translation-table-for-decode r0 r1)
 123       (write-multibyte-character r0 r1)
 124       (repeat))))                       ; go round the loop again
 125   "Decode little endian UTF-16 (ignoring signature bytes).
 126 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
 127 mule-unicode-*.  Un-representable Unicode characters are decoded as
 128 U+fffd.  The result is run through the translation-table named
 129 `utf-translation-table-for-decode'.")
 130
 131 (define-ccl-program ccl-decode-mule-utf-16-be
 132   `(2                                   ; 2 bytes -> 1 to 4 bytes
 133     ((read r0 r1)                       ; signature, read and then overwritten
 134      (loop
 135       (read r3 r4)                      ; next two input bytes
 136       (r1 = (r3 <8 r4))                 ; first byte (r3) is the high one
 137       ,utf-16-decode-ucs                ; r1 -> charset ID in r0, code point in r1
 138       (translate-character utf-translation-table-for-decode r0 r1)
 139       (write-multibyte-character r0 r1)
 140       (repeat))))                       ; go round the loop again
 141   "Decode big endian UTF-16 (ignoring signature bytes).
 142 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
 143 mule-unicode-*.  Un-representable Unicode characters are
 144 decoded as U+fffd.  The result is run through the translation-table of
 145 name `utf-translation-table-for-decode'.")
 146
 147 (makunbound 'utf-16-decode-ucs)         ; only needed to expand the decoders
 148
 149 (eval-and-compile
 150 (defconst utf-16-decode-to-ucs
 151   ;; CCL fragment spliced into both encoders.  In: result of a multibyte
 152   ;; read, charset ID in r0 and code point in r1.  Out: r0 holds the
 153   ;; Unicode, or U+fffd for any other charset.  May clobber r1, r2 and r3.
 154   `(if (r0 == ,(charset-id 'ascii))
 155        (r0 = r1)
 156      (if (r0 == ,(charset-id 'latin-iso8859-1))
 157          (r0 = (r1 + 128))              ; undo the -128 applied when decoding
 158        (if (r0 == ,(charset-id 'eight-bit-control))
 159            (r0 = r1)
 160          (if (r0 == ,(charset-id 'eight-bit-graphic))
 161              (r0 = r1)
 162            ((r2 = (r1 & #x7f))          ; low 7 bits of the code point
 163             (r1 >>= 7)                  ; remaining high bits
 164             (r3 = ((r1 - 32) * 96))     ; r3 = (high - 32) * 96 + (low - 32),
 165             (r3 += (r2 - 32))           ; the offset within a mule-unicode charset
 166             (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
 167                 (r0 = (r3 + #x100))
 168               (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
 169                   (r0 = (r3 + #x2500))
 170                 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
 171                     (r0 = (r3 + #xe000))
 172                   (r0 = #xfffd)))))))))))  ; none of the charsets tested above
 173
 174 (define-ccl-program ccl-encode-mule-utf-16-le
 175   `(1
 176     ((write #xff)                       ; two-byte signature, written once
 177      (write #xfe)
 178      (loop
 179       (read-multibyte-character r0 r1)  ; charset ID in r0, code point in r1
 180       (lookup-character utf-subst-table-for-encode r0 r1)
 181       (if (r7 == 0)                     ; no entry in the substitution table
 182           ((translate-character utf-translation-table-for-encode r0 r1)
 183            ,utf-16-decode-to-ucs))      ; charset/code point -> Unicode in r0
 184       (write (r0 & 255))                ; low byte first
 185       (write (r0 >> 8))                 ; then the high byte
 186       (repeat))))
 187   "Encode to little endian UTF-16 with signature.
 188 Characters from the charsets ascii, eight-bit-control,
 189 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
 190 after translation through the translation-table of name
 191 `utf-translation-table-for-encode'.
 192 Others are encoded as U+FFFD.")
 193
 194 (define-ccl-program ccl-encode-mule-utf-16-be
 195   `(1
 196     ((write #xfe)                       ; two-byte signature, written once
 197      (write #xff)
 198      (loop
 199       (read-multibyte-character r0 r1)  ; charset ID in r0, code point in r1
 200       (lookup-character utf-subst-table-for-encode r0 r1)
 201       (if (r7 == 0)                     ; no entry in the substitution table
 202           ((translate-character utf-translation-table-for-encode r0 r1)
 203            ,utf-16-decode-to-ucs))      ; charset/code point -> Unicode in r0
 204       (write (r0 >> 8))                 ; high byte first
 205       (write (r0 & 255))                ; then the low byte
 206       (repeat))))
 207   "Encode to big endian UTF-16 with signature.
 208 Characters from the charsets ascii, eight-bit-control,
 209 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
 210 after translation through the translation-table named
 211 `utf-translation-table-for-encode'.
 212 Others are encoded as U+FFFD.")
 213
 214 (makunbound 'utf-16-decode-to-ucs)      ; only needed to expand the encoders
 215
 216 (let ((doc "
 217
 218 Assumes and ignores the leading two-byte signature.
 219
 220 It supports Unicode characters of these ranges:
 221     U+0000..U+33FF, U+E000..U+FFFF.
 222 They correspond to these Emacs character sets:
 223     ascii, latin-iso8859-1, mule-unicode-0100-24ff,
 224     mule-unicode-2500-33ff, mule-unicode-e000-ffff
 225
 226 On decoding (e.g. reading a file), Unicode characters not in the above
 227 ranges are decoded as U+FFFD, effectively corrupting the data
 228 if they are re-encoded.
 229
 230 On encoding (e.g. writing a file), Emacs characters not belonging to
 231 any of the character sets listed above are encoded into the byte
 232 sequence representing U+FFFD (REPLACEMENT CHARACTER)."))
 233   (make-coding-system
 234    'mule-utf-16-le 4                  ; type 4: implemented by CCL programs
 235    ?u         ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
 236    (concat
 237     "Little endian UTF-16 encoding for Emacs-supported Unicode characters."
 238     doc)                              ; description text shared by both systems
 239
 240    '(ccl-decode-mule-utf-16-le . ccl-encode-mule-utf-16-le) ; (decoder . encoder)
 241    '((safe-charsets                   ; NOTE(review): no eight-bit-graphic, though the encoder handles it -- confirm
 242       ascii
 243       eight-bit-control
 244       latin-iso8859-1
 245       mule-unicode-0100-24ff
 246       mule-unicode-2500-33ff
 247       mule-unicode-e000-ffff)
 248      (mime-charset . utf-16le)
 249      (coding-category . coding-category-utf-16-le)
 250      (valid-codes (0 . 255))
 251      (dependency unify-8859-on-encoding-mode
 252                  unify-8859-on-decoding-mode
 253                  utf-fragment-on-decoding
 254                  utf-translate-cjk)))
 255
 256   (make-coding-system
 257    'mule-utf-16-be 4 ?u               ; same type and mnemonic as the -le system
 258    (concat
 259     "Big endian UTF-16 encoding for Emacs-supported Unicode characters."
 260     doc)                              ; description text shared by both systems
 261
 262    '(ccl-decode-mule-utf-16-be . ccl-encode-mule-utf-16-be) ; (decoder . encoder)
 263    '((safe-charsets                   ; NOTE(review): no eight-bit-graphic, though the encoder handles it -- confirm
 264       ascii
 265       eight-bit-control
 266       latin-iso8859-1
 267       mule-unicode-0100-24ff
 268       mule-unicode-2500-33ff
 269       mule-unicode-e000-ffff)
 270      (mime-charset . utf-16be)
 271      (coding-category . coding-category-utf-16-be)
 272      (valid-codes (0 . 255))
 273      (dependency unify-8859-on-encoding-mode
 274                  unify-8859-on-decoding-mode
 275                  utf-fragment-on-decoding
 276                  utf-translate-cjk))))
 277
 278 (define-coding-system-alias 'utf-16-le 'mule-utf-16-le) ; alias without the mule- prefix
 279 (define-coding-system-alias 'utf-16-be 'mule-utf-16-be) ; likewise for big endian
 280
 281 ;;; utf-16.el ends here