;;; Code:
+;; We must set utf-translate-cjk-mode to nil while loading this file
+;; to avoid translating CJK characters in decode-char.
+;; The value in effect beforehand is remembered in
+;; saved-utf-translate-cjk-mode; it is put back, and that temporary
+;; variable is unbound again, at the end of this file.
+(defvar saved-utf-translate-cjk-mode utf-translate-cjk-mode)
+(setq utf-translate-cjk-mode nil)
+
;;; Predefined categories.
;; For each character set.
(modify-syntax-entry ?\\e$A!:\e(B "(\e$A!;\e(B")
(modify-syntax-entry ?\\e$A!<\e(B "(\e$A!=\e(B")
(modify-syntax-entry ?\\e$A!>\e(B "(\e$A!?\e(B")
+(modify-syntax-entry ?\\e$A#(\e(B "(\e$A#)\e(B")
+(modify-syntax-entry ?\\e$A#{\e(B "(\e$A#}\e(B")
+(modify-syntax-entry ?\\e$A#[\e(B "(\e$A#]\e(B")
(modify-syntax-entry ?\\e$A!3\e(B ")\e$A!2\e(B")
(modify-syntax-entry ?\\e$A!5\e(B ")\e$A!4\e(B")
(modify-syntax-entry ?\\e$A!7\e(B ")\e$A!6\e(B")
(modify-syntax-entry ?\\e$A!;\e(B ")\e$A!:\e(B")
(modify-syntax-entry ?\\e$A!=\e(B ")\e$A!<\e(B")
(modify-syntax-entry ?\\e$A!?\e(B ")\e$A!>\e(B")
-;; Unicode equivalents of above
-(modify-syntax-entry ?\\e$,2=T\e(B "(\e$,2=U\e(B")
-(modify-syntax-entry ?\\e$,2=H\e(B "(\e$,2=I\e(B")
-(modify-syntax-entry ?\\e$,2=J\e(B "(\e$,2=K\e(B")
-(modify-syntax-entry ?\\e$,2=L\e(B "(\e$,2=M\e(B")
-(modify-syntax-entry ?\\e$,2=N\e(B "(\e$,2=O\e(B")
-(modify-syntax-entry ?\\e$,2=V\e(B "(\e$,2=W\e(B")
-(modify-syntax-entry ?\\e$,2=P\e(B "(\e$,2=Q\e(B")
-(modify-syntax-entry ?\\e$,2=U\e(B ")\e$,2=T\e(B")
-(modify-syntax-entry ?\\e$,2=I\e(B ")\e$,2=H\e(B")
-(modify-syntax-entry ?\\e$,2=K\e(B ")\e$,2=J\e(B")
-(modify-syntax-entry ?\\e$,2=M\e(B ")\e$,2=L\e(B")
-(modify-syntax-entry ?\\e$,2=O\e(B ")\e$,2=N\e(B")
-(modify-syntax-entry ?\\e$,2=W\e(B ")\e$,2=V\e(B")
-(modify-syntax-entry ?\\e$,2=Q\e(B ")\e$,2=P\e(B")
+(modify-syntax-entry ?\\e$A#)\e(B ")\e$A#(\e(B")
+(modify-syntax-entry ?\\e$A#}\e(B ")\e$A#{\e(B")
+(modify-syntax-entry ?\\e$A#]\e(B ")\e$A#[\e(B")
+
+;; Give punctuation syntax (".") to every character of the string below.
+(mapc (lambda (ch) (modify-syntax-entry ch "."))
+      "\e$A#,!"!##.!$#;#:#?#!!C!-!'#|#_!.!/!0!1#"!e#`!d\e(B")
(modify-category-entry (make-char 'chinese-gb2312) ?c)
(modify-category-entry (make-char 'chinese-gb2312) ?\|)
;; Chinese character set (BIG5)
+;; Assign a syntax class to runs of consecutive characters.  Each entry
+;; is (FIRST LIMIT SYNTAX), where FIRST and LIMIT are BIG5 codes passed
+;; to `decode-big5-char'; FIRST is included and LIMIT is excluded.
+(dolist (range '((#xA141 #xA15D ".")
+                 (#xA1A5 #xA1AD ".")
+                 (#xA1AD #xA2AF "_")))
+  (let ((ch (decode-big5-char (nth 0 range)))
+        (limit (decode-big5-char (nth 1 range)))
+        (syntax (nth 2 range)))
+    (while (< ch limit)
+      (modify-syntax-entry ch syntax)
+      (setq ch (1+ ch)))))
+
+;; PARENS holds brackets as adjacent OPEN CLOSE pairs.  Walk it two
+;; characters at a time and make each one a paren matching its partner.
+(let ((parens "\e$(0!>!?!@!A!B!C!D!E!F!G!H!I!J!K!L!M!N!O!P!Q!R!S!T!U!V!W!X!Y!Z![!\!]!^!_!`!a!b!c\e(B")
+      (idx 0)
+      open close)
+  ;; Stop as soon as no complete pair is left.
+  (while (< (1+ idx) (length parens))
+    (setq open (aref parens idx)
+          close (aref parens (1+ idx))
+          idx (+ idx 2))
+    (modify-syntax-entry open (string ?\( close))
+    (modify-syntax-entry close (string ?\) open))))
+
(let ((generic-big5-1-char (make-char 'chinese-big5-1))
(generic-big5-2-char (make-char 'chinese-big5-2)))
;; (modify-syntax-entry generic-big5-1-char "w")
(modify-category-entry generic-char ?|)
(setq cns-list (cdr cns-list))))
+;; PARENS holds brackets as adjacent OPEN CLOSE pairs.  Walk it two
+;; characters at a time and make each one a paren matching its partner.
+(let ((parens "\e$(G!>!?!@!A!B!C!D!E!F!G!H!I!J!K!L!M!N!O!P!Q!R!S!T!U!V!W!X!Y!Z![!\!]!^!_!`!a!b!c\e(B")
+      (idx 0)
+      open close)
+  ;; Stop as soon as no complete pair is left.
+  (while (< (1+ idx) (length parens))
+    (setq open (aref parens idx)
+          close (aref parens (1+ idx))
+          idx (+ idx 2))
+    (modify-syntax-entry open (string ?\( close))
+    (modify-syntax-entry close (string ?\) open))))
+
;; Cyrillic character set (ISO-8859-5)
(modify-category-entry (make-char 'cyrillic-iso8859-5) ?y)
(set-case-syntax-pair ?\e,FO\e(B ?\e,Fo\e(B tbl)
(set-case-syntax-pair ?\e,FP\e(B ?\e,Fp\e(B tbl)
(set-case-syntax-pair ?\e,FQ\e(B ?\e,Fq\e(B tbl)
+ (set-upcase-syntax ?\e,FS\e(B ?\e,Fr\e(B tbl)
(set-case-syntax-pair ?\e,FS\e(B ?\e,Fs\e(B tbl)
(set-case-syntax-pair ?\e,FT\e(B ?\e,Ft\e(B tbl)
(set-case-syntax-pair ?\e,FU\e(B ?\e,Fu\e(B tbl)
(set-case-syntax-pair ?\e$,1&\7f\e(B ?\e$,1'?\e(B tbl)
(set-case-syntax-pair ?\e$,1' \e(B ?\e$,1'@\e(B tbl)
(set-case-syntax-pair ?\e$,1'!\e(B ?\e$,1'A\e(B tbl)
+ (set-upcase-syntax ?\e$,1'#\e(B ?\e$,1'B\e(B tbl)
(set-case-syntax-pair ?\e$,1'#\e(B ?\e$,1'C\e(B tbl)
(set-case-syntax-pair ?\e$,1'$\e(B ?\e$,1'D\e(B tbl)
(set-case-syntax-pair ?\e$,1'%\e(B ?\e$,1'E\e(B tbl)
(modify-category-entry (decode-char 'ucs c) ?i)
(setq c (1+ c))))
-;;; Commented out since the categories appear not to be used anywhere
-;;; and word syntax is the default.
-;; (let ((deflist ;
-;; '(;; chars syntax category
-;; ("\e(5!"#\e(B" "w" ?7) ; vowel-modifying diacritical mark
-;; ; chandrabindu, anuswar, visarga
-;; ("\e(5$\e(B-\e(52\e(B" "w" ?1) ; base (independent) vowel
-;; ("\e(53\e(B-\e(5X\e(B" "w" ?0) ; consonant
-;; ("\e(5Z\e(B-\e(5g\e(B" "w" ?8) ; matra
-;; ("\e(5q\e(B-\e(5z\e(B" "w" ?6) ; digit
-;; ))
-;; elm chars len syntax category to ch i)
-;; (while deflist
-;; (setq elm (car deflist))
-;; (setq chars (car elm)
-;; len (length chars)
-;; syntax (nth 1 elm)
-;; category (nth 2 elm)
-;; i 0)
-;; (while (< i len)
-;; (if (= (aref chars i) ?-)
-;; (setq i (1+ i)
-;; to (aref chars i))
-;; (setq ch (aref chars i)
-;; to ch))
-;; (while (<= ch to)
-;; (modify-syntax-entry ch syntax)
-;; (modify-category-entry ch category)
-;; (setq ch (1+ ch)))
-;; (setq i (1+ i)))
-;; (setq deflist (cdr deflist))))
-
+;; Assign categories to UCS characters.  RANGES gives inclusive offsets
+;; (FIRST LAST CATEGORY); the same offsets are applied relative to each
+;; of the block base code points listed in the outer loop.
+(let ((ranges '(;; FIRST LAST CATEGORY	MEANINGS
+                (#x01 #x03 ?7)	; vowel modifier
+                (#x05 #x14 ?1)	; base vowel
+                (#x15 #x39 ?0)	; consonants
+                (#x3e #x4d ?8)	; vowel modifier
+                (#x51 #x54 ?4)	; stress/tone mark
+                (#x58 #x5f ?0)	; consonants
+                (#x60 #x61 ?1)	; base vowel
+                (#x62 #x63 ?8)	; vowel modifier
+                (#x66 #x6f ?6)	; digits
+                )))
+  (dolist (block-base '(#x900 #x980 #xa00 #xa80 #xb00 #xb80 #xc00 #xc80 #xd00))
+    (dolist (range ranges)
+      (let ((offset (car range))
+            (final (nth 1 range))
+            (category (nth 2 range)))
+        ;; FINAL is part of the range, hence `<='.
+        (while (<= offset final)
+          (modify-category-entry (decode-char 'ucs (+ block-base offset))
+                                 category)
+          (setq offset (1+ offset)))))))
;; Japanese character set (JISX0201-kana, JISX0201-roman, JISX0208, JISX0212)
(while chars
(modify-syntax-entry (car chars) "w")
(setq chars (cdr chars))))
-(modify-syntax-entry ?\\e$B!J\e(B "(\e$B!K\e(B")
-(modify-syntax-entry ?\\e$B!N\e(B "(\e$B!O\e(B")
-(modify-syntax-entry ?\\e$B!P\e(B "(\e$B!Q\e(B")
-(modify-syntax-entry ?\\e$B!V\e(B "(\e$B!W\e(B")
-(modify-syntax-entry ?\\e$B!X\e(B "(\e$B!Y\e(B")
-(modify-syntax-entry ?\\e$B!K\e(B ")\e$B!J\e(B")
-(modify-syntax-entry ?\\e$B!O\e(B ")\e$B!N\e(B")
-(modify-syntax-entry ?\\e$B!Q\e(B ")\e$B!P\e(B")
-(modify-syntax-entry ?\\e$B!W\e(B ")\e$B!V\e(B")
-(modify-syntax-entry ?\\e$B!Y\e(B ")\e$B!X\e(B")
+;; PARENS holds brackets as adjacent OPEN CLOSE pairs.  Walk it two
+;; characters at a time and make each one a paren matching its partner.
+(let ((parens "\e$B!J!K!L!M!N!O!P!Q!R!S!T!U!V!W!X!Y!Z![\e(B")
+      (idx 0)
+      open close)
+  ;; Stop as soon as no complete pair is left.
+  (while (< (1+ idx) (length parens))
+    (setq open (aref parens idx)
+          close (aref parens (1+ idx))
+          idx (+ idx 2))
+    (modify-syntax-entry open (string ?\( close))
+    (modify-syntax-entry close (string ?\) open))))
(modify-category-entry (make-char 'japanese-jisx0208 35) ?A)
(modify-category-entry (make-char 'japanese-jisx0208 36) ?H)
(modify-category-entry (make-char 'korean-ksc5601 43) ?K)
(modify-category-entry (make-char 'korean-ksc5601 44) ?Y)
+;; PARENS holds brackets as adjacent OPEN CLOSE pairs.  Walk it two
+;; characters at a time and make each one a paren matching its partner.
+(let ((parens "\e$(C!2!3!4!5!6!7!8!9!:!;!<!=#(#)#[#]#{#}\e(B")
+      (idx 0)
+      open close)
+  ;; Stop as soon as no complete pair is left.
+  (while (< (1+ idx) (length parens))
+    (setq open (aref parens idx)
+          close (aref parens (1+ idx))
+          idx (+ idx 2))
+    (modify-syntax-entry open (string ?\( close))
+    (modify-syntax-entry close (string ?\) open))))
+
;; Latin character set (latin-1,2,3,4,5,8,9)
(modify-category-entry (make-char 'latin-iso8859-1) ?l)
(set-case-syntax-pair
(decode-char 'ucs (1- c)) (decode-char 'ucs c) tbl))
(setq c (1+ c)))
+ ;;(set-downcase-syntax ?\e$,1 P\e(B ?i tbl)
+ ;;(set-upcase-syntax ?I ?\e$,1 Q\e(B tbl)
(set-case-syntax-pair ?\e$,1 R\e(B ?\e$,1 S\e(B tbl)
(set-case-syntax-pair ?\e$,1 T\e(B ?\e$,1 U\e(B tbl)
(set-case-syntax-pair ?\e$,1 V\e(B ?\e$,1 W\e(B tbl)
-;;; (set-case-syntax-pair ?\e$,1!8\e(B ?\e,A\7f\e(B tbl) ; these two have different length!
+ (set-case-syntax-pair ?\e$,1!8\e(B ?\e,A\7f\e(B tbl)
(set-case-syntax-pair ?\e$,1!9\e(B ?\e$,1!:\e(B tbl)
(set-case-syntax-pair ?\e$,1!;\e(B ?\e$,1!<\e(B tbl)
(set-case-syntax-pair ?\e$,1!=\e(B ?\e$,1!>\e(B tbl)
(setq c #x2000)
(while (<= c #x200b)
(set-case-syntax (decode-char 'ucs c) " " tbl)
- (setq c (decode-char 'ucs (1+ c))))
+ (setq c (1+ c)))
(setq c #x2010)
+ (while (<= c #x2016)
+ (set-case-syntax (decode-char 'ucs c) "_" tbl)
+ (setq c (1+ c)))
+ ;; Punctuation syntax for quotation marks (like `)
+ (while (<= c #x201f)
+ (set-case-syntax (decode-char 'ucs c) "." tbl)
+ (setq c (1+ c)))
(while (<= c #x2027)
(set-case-syntax (decode-char 'ucs c) "_" tbl)
- (setq c (decode-char 'ucs (1+ c))))
+ (setq c (1+ c)))
;; Roman numerals
(setq c #x2160)
;; Fixme: syntax for symbols &c
)
+
+;; Each string in PAIRS is two characters long: an opening bracket
+;; followed by its closing partner.  Give both paren syntax, each one
+;; naming the other as its matching character.
+(let ((pairs
+       '("\e$,1sEsF\e(B"			; U+2045 U+2046
+	 "\e$,1s}s~\e(B"			; U+207D U+207E
+	 "\e$,1t-t.\e(B"			; U+208D U+208E
+	 "\e$,1{){*\e(B"			; U+2329 U+232A
+	 "\e$,1|T|U\e(B"			; U+23B4 U+23B5
+	 "\e$,2&H&I\e(B"			; U+2768 U+2769
+	 "\e$,2&J&K\e(B"			; U+276A U+276B
+	 "\e$,2&L&M\e(B"			; U+276C U+276D
+	 "\e$,2&P&Q\e(B"			; U+2770 U+2771
+	 "\e$,2&R&S\e(B"			; U+2772 U+2773
+	 "\e$,2&T&U\e(B"			; U+2774 U+2775
+	 "\e$,2'f'g\e(B"			; U+27E6 U+27E7
+	 "\e$,2'h'i\e(B"			; U+27E8 U+27E9
+	 "\e$,2'j'k\e(B"			; U+27EA U+27EB
+	 "\e$,2,#,$\e(B"			; U+2983 U+2984
+	 "\e$,2,%,&\e(B"			; U+2985 U+2986
+	 "\e$,2,',(\e(B"			; U+2987 U+2988
+	 "\e$,2,),*\e(B"			; U+2989 U+298A
+	 "\e$,2,+,,\e(B"			; U+298B U+298C
+	 "\e$,2,-,.\e(B"			; U+298D U+298E
+	 "\e$,2,/,0\e(B"			; U+298F U+2990
+	 "\e$,2,1,2\e(B"			; U+2991 U+2992
+	 "\e$,2,3,4\e(B"			; U+2993 U+2994
+	 "\e$,2,5,6\e(B"			; U+2995 U+2996
+	 "\e$,2,7,8\e(B"			; U+2997 U+2998
+	 "\e$,2-<-=\e(B"			; U+29FC U+29FD
+	 "\e$,2=H=I\e(B"			; U+3008 U+3009
+	 "\e$,2=J=K\e(B"			; U+300A U+300B
+	 "\e$,2=L=M\e(B"			; U+300C U+300D
+	 "\e$,2=N=O\e(B"			; U+300E U+300F
+	 "\e$,2=P=Q\e(B"			; U+3010 U+3011
+	 "\e$,2=T=U\e(B"			; U+3014 U+3015
+	 "\e$,2=V=W\e(B"			; U+3016 U+3017
+	 "\e$,2=X=Y\e(B"			; U+3018 U+3019
+	 "\e$,2=Z=[\e(B"			; U+301A U+301B
+	 "\e$,3m~m\7f\e(B"			; U+FD3E U+FD3F
+	 "\e$,3pUpV\e(B"			; U+FE35 U+FE36
+	 "\e$,3pWpX\e(B"			; U+FE37 U+FE38
+	 "\e$,3pYpZ\e(B"			; U+FE39 U+FE3A
+	 "\e$,3p[p\\e(B"			; U+FE3B U+FE3C
+	 "\e$,3p]p^\e(B"			; U+FE3D U+FE3E
+	 "\e$,3p_p`\e(B"			; U+FE3F U+FE40
+	 "\e$,3papb\e(B"			; U+FE41 U+FE42
+	 "\e$,3pcpd\e(B"			; U+FE43 U+FE44
+	 "\e$,3pypz\e(B"			; U+FE59 U+FE5A
+	 "\e$,3p{p|\e(B"			; U+FE5B U+FE5C
+	 "\e$,3p}p~\e(B"			; U+FE5D U+FE5E
+	 "\e$,3rhri\e(B"			; U+FF08 U+FF09
+	 "\e$,3s;s=\e(B"			; U+FF3B U+FF3D
+	 "\e$,3s[s]\e(B"			; U+FF5B U+FF5D
+	 "\e$,3s_s`\e(B"			; U+FF5F U+FF60
+	 "\e$,3sbsc\e(B"			; U+FF62 U+FF63
+	 )))
+  (dolist (pair pairs)
+    (let ((open (aref pair 0))
+          (close (aref pair 1)))
+      (modify-syntax-entry open (string ?\( close))
+      (modify-syntax-entry close (string ?\) open)))))
+
\f
;;; Setting word boundary.
(put-charset-property (car l) 'nospace-between-words t)
(setq l (cdr l))))
+\f
+;; Loading is finished: put back the value of utf-translate-cjk-mode
+;; that was saved near the top of this file, then unbind the temporary
+;; variable that held it.
+(setq utf-translate-cjk-mode saved-utf-translate-cjk-mode)
+(makunbound 'saved-utf-translate-cjk-mode)
+
;;; Local Variables:
;;; coding: iso-2022-7bit
;;; End:
+;;; arch-tag: 85889c35-9f4d-4912-9bf5-82de31b0d42d
;;; characters.el ends here