* lisp/emacs-lisp/bytecomp.el (byte-compile-disable-print-circle): Obsolete.
[bpt/emacs.git] / lisp / international / ja-dic-cnv.el
CommitLineData
60370d40 1;;; ja-dic-cnv.el --- convert a Japanese dictionary (SKK-JISYO.L) to Emacs Lisp
b47a7926 2
7976eda0 3;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5df4f04c 4;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
2fd125a3
KH
5;; National Institute of Advanced Industrial Science and Technology (AIST)
6;; Registration Number H14PRO021
b47a7926 7
49e64228 8;; Keywords: i18n, mule, multilingual, Japanese
b47a7926
KH
9
10;; This file is part of GNU Emacs.
11
4936186e 12;; GNU Emacs is free software: you can redistribute it and/or modify
b47a7926 13;; it under the terms of the GNU General Public License as published by
4936186e
GM
14;; the Free Software Foundation, either version 3 of the License, or
15;; (at your option) any later version.
b47a7926
KH
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
4936186e 23;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
b47a7926
KH
24
25;;; Commentary:
26
27;; SKK is a Japanese input method running on Mule created by Masahiko
28;; Sato <masahiko@sato.riec.tohoku.ac.jp>. Here we provide utilities
29;; to handle a dictionary distributed with SKK so that a different
30;; input method (e.g. quail-japanese) can utilize the dictionary.
31
32;; The format of SKK dictionary is quite simple. Each line has the
33;; form "KANASTRING /CONV1/CONV2/.../" which means KANASTRING (\e$B2>L>J8\e(B
34;; \e$B;zNs\e(B) can be converted to one of CONVi. CONVi is a Kanji (\e$B4A;z\e(B)
35;; and Kana (\e$B2>L>\e(B) mixed string.
36;;
37;; KANASTRING may have a trailing ASCII letter for Okurigana (\e$BAw$j2>L>\e(B)
38;; information. For instance, the trailing letter `k' means that one
39;; of the following Okurigana is allowed: \e$B$+$-$/$1$3\e(B. So, in that
40;; case, the string "KANASTRING\e$B$/\e(B" can be converted to one of "CONV1\e$B$/\e(B",
41;; CONV2\e$B$/\e(B, ...
42
43;;; Code:
44
45;; Name of a file to generate from SKK dictionary.
46(defvar ja-dic-filename "ja-dic.el")
47
b47a7926
KH
48(defun skkdic-convert-okuri-ari (skkbuf buf)
49 (message "Processing OKURI-ARI entries ...")
50 (goto-char (point-min))
9a529312 51 (with-current-buffer buf
b47a7926
KH
52 (insert ";; Setting okuri-ari entries.\n"
53 "(skkdic-set-okuri-ari\n"))
54 (while (not (eobp))
fa6ea913
KH
55 (if (/= (following-char) ?>)
56 (let ((from (point))
57 (to (line-end-position)))
58 (with-current-buffer buf
59 (insert-buffer-substring skkbuf from to)
60 (beginning-of-line)
61 (insert "\"")
62 (search-forward " ")
63 (delete-char 1) ; delete the first '/'
64 (let ((p (point)))
65 (end-of-line)
66 (delete-char -1) ; delete the last '/'
67 (subst-char-in-region p (point) ?/ ? 'noundo))
68 (insert "\"\n"))))
69
70 (forward-line 1))
9a529312 71 (with-current-buffer buf
b47a7926
KH
72 (insert ")\n\n")))
73
74(defconst skkdic-postfix-list '(skkdic-postfix-list))
75
76(defconst skkdic-postfix-data
77 '(("\e$B$$$-\e(B" "\e$B9T\e(B")
78 ("\e$B$,$+$j\e(B" "\e$B78\e(B")
79 ("\e$B$,$/\e(B" "\e$B3X\e(B")
80 ("\e$B$,$o\e(B" "\e$B@n\e(B")
81 ("\e$B$7$c\e(B" "\e$B<R\e(B")
82 ("\e$B$7$e$&\e(B" "\e$B=8\e(B")
83 ("\e$B$7$g$&\e(B" "\e$B>^\e(B" "\e$B>k\e(B")
84 ("\e$B$8$g$&\e(B" "\e$B>k\e(B")
85 ("\e$B$;$s\e(B" "\e$B@~\e(B")
86 ("\e$B$@$1\e(B" "\e$B3Y\e(B")
87 ("\e$B$A$c$/\e(B" "\e$BCe\e(B")
88 ("\e$B$F$s\e(B" "\e$BE9\e(B")
89 ("\e$B$H$&$2\e(B" "\e$BF=\e(B")
90 ("\e$B$I$*$j\e(B" "\e$BDL$j\e(B")
91 ("\e$B$d$^\e(B" "\e$B;3\e(B")
92 ("\e$B$P$7\e(B" "\e$B66\e(B")
93 ("\e$B$O$D\e(B" "\e$BH/\e(B")
94 ("\e$B$b$/\e(B" "\e$BL\\e(B")
95 ("\e$B$f$-\e(B" "\e$B9T\e(B")))
96
97(defun skkdic-convert-postfix (skkbuf buf)
98 (message "Processing POSTFIX entries ...")
99 (goto-char (point-min))
9a529312 100 (with-current-buffer buf
b47a7926
KH
101 (insert ";; Setting postfix entries.\n"
102 "(skkdic-set-postfix\n"))
103
104 ;; Initialize SKKDIC-POSTFIX-LIST by predefined data
105 ;; SKKDIC-POSTFIX-DATA.
9a529312 106 (with-current-buffer buf
b47a7926
KH
107 (let ((l skkdic-postfix-data)
108 kana candidates entry)
109 (while l
110 (setq kana (car (car l)) candidates (cdr (car l)))
111 (insert "\"" kana)
112 (while candidates
113 (insert " " (car candidates))
114 (setq entry (lookup-nested-alist (car candidates)
115 skkdic-postfix-list nil nil t))
116 (if (consp (car entry))
117 (setcar entry (cons kana (car entry)))
118 (set-nested-alist (car candidates) (list kana)
119 skkdic-postfix-list))
120 (setq candidates (cdr candidates)))
121 (insert "\"\n")
122 (setq l (cdr l)))))
123
124 ;; Search postfix entries.
125 (while (re-search-forward "^[#<>?]\\(\\(\\cH\\|\e$B!<\e(B\\)+\\) " nil t)
126 (let ((kana (match-string 1))
127 str candidates)
128 (while (looking-at "/[#0-9 ]*\\([^/\n]*\\)/")
129 (setq str (match-string 1))
130 (if (not (member str candidates))
131 (setq candidates (cons str candidates)))
132 (goto-char (match-end 1)))
9a529312 133 (with-current-buffer buf
b47a7926
KH
134 (insert "\"" kana)
135 (while candidates
136 (insert " " (car candidates))
137 (let ((entry (lookup-nested-alist (car candidates)
138 skkdic-postfix-list nil nil t)))
139 (if (consp (car entry))
140 (if (not (member kana (car entry)))
141 (setcar entry (cons kana (car entry))))
142 (set-nested-alist (car candidates) (list kana)
143 skkdic-postfix-list)))
144 (setq candidates (cdr candidates)))
145 (insert "\"\n"))))
9a529312 146 (with-current-buffer buf
b47a7926 147 (insert ")\n\n")))
a1506d29 148
b47a7926
KH
149(defconst skkdic-prefix-list '(skkdic-prefix-list))
150
151(defun skkdic-convert-prefix (skkbuf buf)
152 (message "Processing PREFIX entries ...")
153 (goto-char (point-min))
9a529312 154 (with-current-buffer buf
b47a7926
KH
155 (insert ";; Setting prefix entries.\n"
156 "(skkdic-set-prefix\n"))
157 (save-excursion
158 (while (re-search-forward "^\\(\\(\\cH\\|\e$B!<\e(B\\)+\\)[<>?] " nil t)
159 (let ((kana (match-string 1))
160 str candidates)
161 (while (looking-at "/\\([^/\n]+\\)/")
162 (setq str (match-string 1))
163 (if (not (member str candidates))
164 (setq candidates (cons str candidates)))
165 (goto-char (match-end 1)))
9a529312 166 (with-current-buffer buf
b47a7926
KH
167 (insert "\"" kana)
168 (while candidates
169 (insert " " (car candidates))
170 (set-nested-alist (car candidates) kana skkdic-prefix-list)
171 (setq candidates (cdr candidates)))
172 (insert "\"\n")))))
9a529312 173 (with-current-buffer buf
b47a7926 174 (insert ")\n\n")))
a1506d29 175
b47a7926
KH
176;; FROM and TO point the head and tail of "/J../J../.../".
177(defun skkdic-get-candidate-list (from to)
178 (let (candidates)
179 (goto-char from)
38d777a8 180 (while (re-search-forward "/[^/ \n]+" to t)
b47a7926
KH
181 (setq candidates (cons (buffer-substring (1+ (match-beginning 0))
182 (match-end 0))
183 candidates)))
184 candidates))
185
186;; Return entry for STR from nested alist ALIST.
187(defsubst skkdic-get-entry (str alist)
188 (car (lookup-nested-alist str alist nil nil t)))
189
190
191(defconst skkdic-word-list '(skkdic-word-list))
192
193;; Return t if substring of STR (between FROM and TO) can be broken up
194;; to chunks all of which can be derived from another entry in SKK
195;; dictionary. SKKBUF is the buffer where the original SKK dictionary
3ecd3a56
GM
196;; is visited, KANA is the current entry for STR. FIRST is t only if
197;; this is called at top level.
b47a7926
KH
198
199(defun skkdic-breakup-string (skkbuf kana str from to &optional first)
200 (let ((len (- to from)))
201 (or (and (>= len 2)
202 (let ((min-idx (+ from 2))
203 (idx (if first (1- to ) to))
204 (found nil))
205 (while (and (not found) (>= idx min-idx))
206 (let ((kana2-list (skkdic-get-entry
207 (substring str from idx)
208 skkdic-word-list)))
209 (if (or (and (consp kana2-list)
210 (let ((kana-len (length kana))
211 kana2)
212 (catch 'skkdic-tag
213 (while kana2-list
214 (setq kana2 (car kana2-list))
215 (if (string-match kana2 kana)
216 (throw 'skkdic-tag t))
217 (setq kana2-list (cdr kana2-list)))))
218 (or (= idx to)
219 (skkdic-breakup-string skkbuf kana str
220 idx to)))
221 (and (stringp kana2-list)
222 (string-match kana2-list kana)))
223 (setq found t)
224 (setq idx (1- idx)))))
225 found))
226 (and first
227 (> len 2)
228 (let ((kana2 (skkdic-get-entry
229 (substring str from (1+ from))
230 skkdic-prefix-list)))
231 (and (stringp kana2)
232 (eq (string-match kana2 kana) 0)))
233 (skkdic-breakup-string skkbuf kana str (1+ from) to))
234 (and (not first)
235 (>= len 1)
236 (let ((kana2-list (skkdic-get-entry
237 (substring str from to)
238 skkdic-postfix-list)))
239 (and (consp kana2-list)
240 (let (kana2)
241 (catch 'skkdic-tag
242 (while kana2-list
243 (setq kana2 (car kana2-list))
244 (if (string= kana2
245 (substring kana (- (length kana2))))
246 (throw 'skkdic-tag t))
247 (setq kana2-list (cdr kana2-list)))))))))))
248
249;; Return list of candidates which excludes some from CANDIDATES.
250;; Excluded candidates can be derived from another entry.
251
252(defun skkdic-reduced-candidates (skkbuf kana candidates)
253 (let (elt l)
254 (while candidates
255 (setq elt (car candidates))
256 (if (or (= (length elt) 1)
257 (and (string-match "^\\cj" elt)
258 (not (skkdic-breakup-string skkbuf kana elt 0 (length elt)
259 'first))))
260 (setq l (cons elt l)))
261 (setq candidates (cdr candidates)))
262 (nreverse l)))
263
bb2ed48e
JPW
264(defvar skkdic-okuri-nasi-entries (list nil))
265(defvar skkdic-okuri-nasi-entries-count 0)
b47a7926
KH
266
267(defun skkdic-collect-okuri-nasi ()
268 (message "Collecting OKURI-NASI entries ...")
269 (save-excursion
270 (let ((prev-ratio 0)
271 ratio)
272 (while (re-search-forward "^\\(\\(\\cH\\|\e$B!<\e(B\\)+\\) \\(/\\cj.*\\)/$"
273 nil t)
274 (let ((kana (match-string 1))
275 (candidates (skkdic-get-candidate-list (match-beginning 3)
276 (match-end 3))))
277 (setq skkdic-okuri-nasi-entries
278 (cons (cons kana candidates) skkdic-okuri-nasi-entries)
279 skkdic-okuri-nasi-entries-count
280 (1+ skkdic-okuri-nasi-entries-count))
281 (setq ratio (floor (/ (* (point) 100.0) (point-max))))
282 (if (/= ratio prev-ratio)
283 (progn
284 (message "collected %2d%% %s ..." ratio kana)
285 (setq prev-ratio ratio)))
286 (while candidates
287 (let ((entry (lookup-nested-alist (car candidates)
288 skkdic-word-list nil nil t)))
289 (if (consp (car entry))
290 (setcar entry (cons kana (car entry)))
291 (set-nested-alist (car candidates) (list kana)
292 skkdic-word-list)))
293 (setq candidates (cdr candidates))))))))
294
295(defun skkdic-convert-okuri-nasi (skkbuf buf)
296 (message "Processing OKURI-NASI entries ...")
9a529312 297 (with-current-buffer buf
b47a7926
KH
298 (insert ";; Setting okuri-nasi entries.\n"
299 "(skkdic-set-okuri-nasi\n")
300 (let ((l (nreverse skkdic-okuri-nasi-entries))
301 (count 0)
302 (prev-ratio 0)
303 ratio)
304 (while l
305 (let ((kana (car (car l)))
306 (candidates (cdr (car l))))
307 (setq ratio (/ (* count 1000) skkdic-okuri-nasi-entries-count)
308 count (1+ count))
309 (if (/= prev-ratio (/ ratio 10))
310 (progn
311 (message "processed %2d%% %s ..." (/ ratio 10) kana)
312 (setq prev-ratio (/ ratio 10))))
313 (if (setq candidates
314 (skkdic-reduced-candidates skkbuf kana candidates))
315 (progn
316 (insert "\"" kana)
317 (while candidates
318 (insert " " (car candidates))
319 (setq candidates (cdr candidates)))
320 (insert "\"\n"))))
321 (setq l (cdr l))))
322 (insert ")\n\n")))
323
324(defun skkdic-convert (filename &optional dirname)
32ef9469 325 "Generate Emacs Lisp file form Japanese dictionary file FILENAME.
b47a7926
KH
326The format of the dictionary file should be the same as SKK dictionaries.
327Optional argument DIRNAME if specified is the directory name under which
328the generated Emacs Lisp is saved.
329The name of generated file is specified by the variable `ja-dic-filename'."
330 (interactive "FSKK dictionary file: ")
331 (message "Reading file \"%s\" ..." filename)
332 (let* ((coding-system-for-read 'euc-japan)
333 (skkbuf(find-file-noselect (expand-file-name filename)))
334 (buf (get-buffer-create "*skkdic-work*")))
9a529312
SM
335 ;; Setup and generate the header part of working buffer.
336 (with-current-buffer buf
b47a7926
KH
337 (erase-buffer)
338 (buffer-disable-undo)
c4b4e56c 339 (insert ";;; ja-dic.el --- dictionary for Japanese input method"
53cfe624 340 " -*-coding: euc-japan; -*-\n"
b47a7926
KH
341 ";;\tGenerated by the command `skkdic-convert'\n"
342 ";;\tDate: " (current-time-string) "\n"
343 ";;\tOriginal SKK dictionary file: "
091fee44 344 (file-relative-name (expand-file-name filename) dirname)
b47a7926 345 "\n\n"
381194d0
PJ
346 ";; This file is part of GNU Emacs.\n\n"
347 ";;; Commentary:\n\n"
b47a7926
KH
348 ";; Do byte-compile this file again after any modification.\n\n"
349 ";;; Start of the header of the original SKK dictionary.\n\n")
350 (set-buffer skkbuf)
351 (widen)
352 (goto-char 1)
353 (let (pos)
354 (search-forward ";; okuri-ari")
355 (forward-line 1)
356 (setq pos (point))
357 (set-buffer buf)
358 (insert-buffer-substring skkbuf 1 pos))
359 (insert "\n"
c9ef1229 360 ";;; Code:\n\n(eval-when-compile (require 'ja-dic-cnv))\n\n")
b47a7926
KH
361
362 ;; Generate the body part of working buffer.
363 (set-buffer skkbuf)
364 (let ((from (point))
365 to)
366 ;; Convert okuri-ari entries.
367 (search-forward ";; okuri-nasi")
368 (beginning-of-line)
369 (setq to (point))
370 (narrow-to-region from to)
371 (skkdic-convert-okuri-ari skkbuf buf)
372 (widen)
373
374 ;; Convert okuri-nasi postfix entries.
375 (goto-char to)
376 (forward-line 1)
377 (setq from (point))
378 (re-search-forward "^\\cH")
379 (setq to (match-beginning 0))
380 (narrow-to-region from to)
381 (skkdic-convert-postfix skkbuf buf)
382 (widen)
383
384 ;; Convert okuri-nasi prefix entries.
385 (goto-char to)
386 (skkdic-convert-prefix skkbuf buf)
387
a1506d29 388 ;;
b47a7926
KH
389 (skkdic-collect-okuri-nasi)
390
391 ;; Convert okuri-nasi general entries.
392 (skkdic-convert-okuri-nasi skkbuf buf)
393
394 ;; Postfix
9a529312 395 (with-current-buffer buf
b47a7926 396 (goto-char (point-max))
381194d0 397 (insert ";;\n(provide 'ja-dic)\n\n;;; ja-dic.el ends here\n")))
b47a7926
KH
398
399 ;; Save the working buffer.
400 (set-buffer buf)
401 (set-visited-file-name (expand-file-name ja-dic-filename dirname) t)
fa6ea913 402 (set-buffer-file-coding-system 'euc-japan)
b47a7926
KH
403 (save-buffer 0))
404 (kill-buffer skkbuf)
405 (switch-to-buffer buf)))
406
407(defun batch-skkdic-convert ()
408 "Run `skkdic-convert' on the files remaining on the command line.
409Use this from the command line, with `-batch';
410it won't work in an interactive Emacs.
411For example, invoke:
412 % emacs -batch -l ja-dic-cnv -f batch-skkdic-convert SKK-JISYO.L
413to generate \"ja-dic.el\" from SKK dictionary file \"SKK-JISYO.L\".
414To get complete usage, invoke:
415 % emacs -batch -l ja-dic-cnv -f batch-skkdic-convert -h"
416 (defvar command-line-args-left) ; Avoid compiler warning.
417 (if (not noninteractive)
418 (error "`batch-skkdic-convert' should be used only with -batch"))
419 (if (string= (car command-line-args-left) "-h")
420 (progn
421 (message "To convert SKK-JISYO.L into skkdic.el:")
422 (message " %% emacs -batch -l ja-dic-cnv -f batch-skkdic-convert SKK-JISYO.L")
423 (message "To convert SKK-JISYO.L into DIR/ja-dic.el:")
424 (message " %% emacs -batch -l ja-dic-cnv -f batch-skkdic-convert -dir DIR SKK-JISYO.L"))
425 (let (targetdir filename)
426 (if (string= (car command-line-args-left) "-dir")
427 (progn
428 (setq command-line-args-left (cdr command-line-args-left))
429 (setq targetdir (expand-file-name (car command-line-args-left)))
430 (setq command-line-args-left (cdr command-line-args-left))))
431 (setq filename (expand-file-name (car command-line-args-left)))
432 (message "Converting %s to %s ..." filename ja-dic-filename)
433 (message "It takes around 10 minutes even on Sun SS20.")
434 (skkdic-convert filename targetdir)
435 (message "Do byte-compile the created file by:")
436 (message " %% emacs -batch -f batch-byte-compile %s" ja-dic-filename)
437 ))
438 (kill-emacs 0))
439
440
441;; The following macros are expanded at byte-compiling time so that
442;; compiled code can be loaded quickly.
443
444(defun skkdic-get-kana-compact-codes (kana)
445 (let* ((len (length kana))
446 (vec (make-vector len 0))
447 (i 0)
448 ch)
449 (while (< i len)
450 (setq ch (aref kana i))
451 (aset vec i
452 (if (< ch 128) ; CH is an ASCII letter for OKURIGANA,
453 (- ch) ; represented by a negative code.
454 (if (= ch ?\e$B!<\e(B) ; `\e$B!<\e(B' is represented by 0.
455 0
a10201c8 456 (- (logand (encode-char ch 'japanese-jisx0208) #xFF) 32))))
b47a7926
KH
457 (setq i (1+ i)))
458 vec))
459
460(defun skkdic-extract-conversion-data (entry)
461 (string-match "^\\cj+[a-z]* " entry)
462 (let ((kana (substring entry (match-beginning 0) (1- (match-end 0))))
463 (i (match-end 0))
464 candidates)
465 (while (string-match "[^ ]+" entry i)
466 (setq candidates (cons (match-string 0 entry) candidates))
467 (setq i (match-end 0)))
468 (cons (skkdic-get-kana-compact-codes kana) candidates)))
469
470(defmacro skkdic-set-okuri-ari (&rest entries)
471 `(defconst skkdic-okuri-ari
472 ',(let ((l entries)
473 (map '(skkdic-okuri-ari))
474 entry)
475 (while l
476 (setq entry (skkdic-extract-conversion-data (car l)))
477 (set-nested-alist (car entry) (cdr entry) map)
478 (setq l (cdr l)))
479 map)))
480
481(defmacro skkdic-set-postfix (&rest entries)
482 `(defconst skkdic-postfix
483 ',(let ((l entries)
484 (map '(nil))
485 (longest 1)
486 len entry)
487 (while l
488 (setq entry (skkdic-extract-conversion-data (car l)))
489 (setq len (length (car entry)))
490 (if (> len longest)
491 (setq longest len))
492 (let ((entry2 (lookup-nested-alist (car entry) map nil nil t)))
493 (if (consp (car entry2))
494 (let ((conversions (cdr entry)))
495 (while conversions
496 (if (not (member (car conversions) (car entry2)))
497 (setcar entry2 (cons (car conversions) (car entry2))))
498 (setq conversions (cdr conversions))))
499 (set-nested-alist (car entry) (cdr entry) map)))
500 (setq l (cdr l)))
501 (setcar map longest)
502 map)))
503
504(defmacro skkdic-set-prefix (&rest entries)
505 `(defconst skkdic-prefix
506 ',(let ((l entries)
507 (map '(nil))
508 (longest 1)
509 len entry)
510 (while l
511 (setq entry (skkdic-extract-conversion-data (car l)))
512 (setq len (length (car entry)))
513 (if (> len longest)
514 (setq longest len))
515 (let ((entry2 (lookup-nested-alist (car entry) map len nil t)))
516 (if (consp (car entry2))
517 (let ((conversions (cdr entry)))
518 (while conversions
519 (if (not (member (car conversions) (car entry2)))
520 (setcar entry2 (cons (car conversions) (car entry2))))
521 (setq conversions (cdr conversions))))
522 (set-nested-alist (car entry) (cdr entry) map len)))
523 (setq l (cdr l)))
524 (setcar map longest)
525 map)))
526
527(defmacro skkdic-set-okuri-nasi (&rest entries)
528 `(defconst skkdic-okuri-nasi
529 ',(let ((l entries)
530 (map '(skdic-okuri-nasi))
531 (count 0)
532 entry)
533 (while l
534 (setq count (1+ count))
50e3a346 535 (if (= (% count 10000) 0)
274f1353 536 (message "%d entries" count))
b47a7926
KH
537 (setq entry (skkdic-extract-conversion-data (car l)))
538 (set-nested-alist (car entry) (cdr entry) map)
539 (setq l (cdr l)))
540 map)))
541
542(provide 'ja-dic-cnv)
543
5bbba7b2
KH
544;; Local Variables:
545;; coding: iso-2022-7bit
546;; End:
60370d40
PJ
547
548;;; ja-dic-cnv.el ends here