Commit | Line | Data |
---|---|---|
bc9a5003 KH |
1 | ;;; decoder-tests.el --- test for text decoder |
2 | ||
ba318903 | 3 | ;; Copyright (C) 2013-2014 Free Software Foundation, Inc. |
bc9a5003 KH |
4 | |
5 | ;; Author: Kenichi Handa <handa@gnu.org> | |
6 | ||
7 | ;; This file is part of GNU Emacs. | |
8 | ||
9 | ;; GNU Emacs is free software: you can redistribute it and/or modify | |
10 | ;; it under the terms of the GNU General Public License as published by | |
11 | ;; the Free Software Foundation, either version 3 of the License, or | |
12 | ;; (at your option) any later version. | |
13 | ||
14 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
15 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 | ;; GNU General Public License for more details. | |
18 | ||
19 | ;; You should have received a copy of the GNU General Public License | |
20 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. | |
21 | ||
22 | ;;; Code: | |
23 | ||
24 | (require 'ert) | |
25 | ||
bc9a5003 KH |
26 | ;; Directory to hold test data files. |
(defvar decoder-tests-workdir
  (let ((subdir "decoder-tests"))
    (expand-file-name subdir temporary-file-directory))
  "Directory that holds the generated test data files.
It is the subdirectory \"decoder-tests\" of `temporary-file-directory'.")
29 | ||
3e3da660 KH |
30 | ;; Remove all generated test files. |
(defun decoder-tests-remove-files ()
  "Delete `decoder-tests-workdir' together with everything inside it.
Do nothing when that directory does not exist.  This function is used
as the cleanup form of `unwind-protect'; without the existence check,
`delete-directory' would signal its own error when a test failed
before any file was generated, hiding the original failure."
  (when (file-directory-p decoder-tests-workdir)
    (delete-directory decoder-tests-workdir t)))
33 | ||
bc9a5003 KH |
34 | ;; Return the contents (specified by CONTENT-TYPE; ascii, latin, or |
35 | ;; binary) of a test file. | |
(defun decoder-tests-file-contents (content-type)
  "Return the test file contents selected by CONTENT-TYPE.
CONTENT-TYPE is one of the symbols `ascii', `latin' or `binary':
`ascii' is one line of upper-case letters, `latin' appends a line of
accented Latin letters to that, and `binary' is the byte sequence of
the `latin' text followed by the bytes #xC0 #xC1 and a newline,
converted to a multibyte string.
Signal an error for any other CONTENT-TYPE."
  (let* ((ascii-text "ABCDEFGHIJKLMNOPQRSTUVWXYZ\n")
         (latin-text (concat ascii-text "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏ\n"))
         (binary-text (string-to-multibyte
                       (concat (string-as-unibyte latin-text)
                               (unibyte-string #xC0 #xC1 ?\n))))
         ;; Lookup table from content type to the prepared string.
         (entry (assq content-type
                      (list (cons 'ascii ascii-text)
                            (cons 'latin latin-text)
                            (cons 'binary binary-text)))))
    (if entry
        (cdr entry)
      (error "Invalid file content type: %s" content-type))))
47 | ||
3e3da660 | 48 | ;; Generate FILE with CONTENTS encoded by CODING-SYSTEM. |
bc9a5003 | 49 | ;; FILE is expanded relative to `decoder-tests-workdir'; return the expanded name. |
(defun decoder-tests-gen-file (file contents coding-system)
  "Write CONTENTS to FILE using CODING-SYSTEM and return the file name.
FILE is expanded relative to `decoder-tests-workdir', which is
created first if it does not exist yet.  The returned value is the
expanded file name."
  (unless (file-directory-p decoder-tests-workdir)
    (make-directory decoder-tests-workdir t))
  (let ((target (expand-file-name file decoder-tests-workdir)))
    (with-temp-file target
      ;; The buffer's coding system determines how the text is encoded
      ;; when `with-temp-file' writes it out.
      (set-buffer-file-coding-system coding-system)
      (insert contents))
    target))
bc9a5003 KH |
58 | |
59 | ;;; The following three functions are filters for contents of a test | |
60 | ;;; file. | |
61 | ||
62 | ;; Convert all LFs to CR LF sequences in the string STR. | |
(defun decoder-tests-lf-to-crlf (str)
  "Return a copy of STR in which every LF is replaced by CR LF."
  (with-temp-buffer
    (insert str)
    (goto-char (point-min))
    ;; Replace literally (no case conversion, no special sequences);
    ;; point ends up after the replacement, so the inserted LF is not
    ;; matched again.
    (while (search-forward "\n" nil t)
      (replace-match "\r\n" t t))
    (buffer-string)))
71 | ||
72 | ;; Convert all LFs to CRs in the string STR. | |
(defun decoder-tests-lf-to-cr (str)
  "Return a copy of STR in which every LF is replaced by CR."
  (with-temp-buffer
    (insert str)
    (goto-char (point-min))
    (while (search-forward "\n" nil t)
      (replace-match "\r" t t))
    (buffer-string)))
78 | ||
79 | ;; Convert all LFs to LF LF sequences in the string STR. | |
(defun decoder-tests-lf-to-lflf (str)
  "Return a copy of STR in which every LF is doubled to LF LF."
  (with-temp-buffer
    (insert str)
    (goto-char (point-min))
    ;; Jump to the next LF; stop once only non-LF text (or nothing)
    ;; remains before the end of the buffer.
    (while (progn (skip-chars-forward "^\n")
                  (not (eobp)))
      (forward-char 1)
      ;; Point is now after the original LF; add its twin and continue
      ;; scanning behind it.
      (insert "\n"))
    (buffer-string)))
87 | ||
88 | ;; Prepend the UTF-8 BOM to STR. | |
(defun decoder-tests-add-bom (str)
  "Return STR with the character U+FEFF (the byte order mark) prepended."
  (concat (string #xFEFF) str))
91 | ||
3e3da660 KH |
92 | ;; Return the name of the test file whose contents are specified by |
93 | ;; CONTENT-TYPE and whose encoding is specified by CODING-SYSTEM. | |
(defun decoder-tests-filename (content-type coding-system &optional ext)
  "Return the name of the test file for CONTENT-TYPE and CODING-SYSTEM.
The base name is \"CONTENT-TYPE-CODING-SYSTEM\"; when EXT is non-nil,
\".EXT\" is appended.  The name is expanded relative to
`decoder-tests-workdir'."
  (let ((stem (format "%s-%s" content-type coding-system)))
    (expand-file-name (if ext
                          (format "%s.%s" stem ext)
                        stem)
                      decoder-tests-workdir)))
100 | ||
101 | \f | |
102 | ;;; Check ASCII optimizing decoder | |
103 | ||
104 | ;; Generate a test file whose contents specified by CONTENT-TYPE and | |
105 | ;; whose encoding specified by CODING-SYSTEM. | |
(defun decoder-tests-ao-gen-file (content-type coding-system)
  "Generate the test file for CONTENT-TYPE encoded by CODING-SYSTEM.
The file name comes from `decoder-tests-filename' and the text from
`decoder-tests-file-contents'.  Return the value of
`decoder-tests-gen-file', i.e. the name of the generated file."
  (decoder-tests-gen-file
   (decoder-tests-filename content-type coding-system)
   (decoder-tests-file-contents content-type)
   coding-system))
111 | ||
bc9a5003 KH |
112 | ;; Test the decoding of a file whose contents and encoding are |
113 | ;; specified by CONTENT-TYPE and WRITE-CODING. The test passes if the | |
114 | ;; file is read by READ-CODING and detected as DETECTED-CODING and the | |
115 | ;; contents is correctly decoded. | |
116 | ;; Optional 5th arg TRANSLATOR is a function to translate the original | |
117 | ;; file contents to match with the expected result of decoding. For | |
118 | ;; instance, when a file of dos eol-type is read by unix eol-type, | |
119 | ;; `decoder-tests-lf-to-crlf' must be specified. | |
120 | ||
(defun decoder-tests (content-type write-coding read-coding detected-coding
                                   &optional translator)
  "Check the decoding of the test file for CONTENT-TYPE and WRITE-CODING.
The file is read with `coding-system-for-read' bound to READ-CODING.
Return nil if `buffer-file-coding-system' then equals DETECTED-CODING
\(per `coding-system-equal') and the buffer text equals the expected
contents.  Otherwise return a list of the detected coding system, the
buffer text as a list of characters, and the expected text as a list
of characters.

Optional TRANSLATOR is a function applied to the original contents to
produce the expected text, e.g. `decoder-tests-lf-to-crlf' when a file
of dos eol-type is read with unix eol-type.

As a side effect this calls `prefer-coding-system' with `utf-8-auto'."
  (prefer-coding-system 'utf-8-auto)
  (let ((source-file (decoder-tests-filename content-type write-coding))
        (expected (decoder-tests-file-contents content-type)))
    (with-temp-buffer
      (let ((coding-system-for-read read-coding)
            ;; Make sure the ASCII-optimized decoding path is enabled.
            (disable-ascii-optimization nil))
        (when translator
          (setq expected (funcall translator expected)))
        (insert-file-contents source-file)
        (unless (and (coding-system-equal buffer-file-coding-system
                                          detected-coding)
                     (string= (buffer-string) expected))
          ;; Failure: hand back everything needed to diagnose it.
          (list buffer-file-coding-system
                (string-to-list (buffer-string))
                (string-to-list expected)))))))
138 | ||
(ert-deftest ert-test-decoder-ascii ()
  "Check decoding of ASCII files written with each eol-type.
Each entry of the table below is an argument list for `decoder-tests'."
  (unwind-protect
      (progn
        (mapc (lambda (eol-type)
                (decoder-tests-ao-gen-file 'ascii eol-type))
              '(unix dos mac))
        (dolist (args '((ascii unix undecided unix)
                        (ascii dos undecided dos)
                        (ascii dos dos dos)
                        (ascii mac undecided mac)
                        (ascii mac mac mac)
                        (ascii dos utf-8 utf-8-dos)
                        (ascii dos unix unix decoder-tests-lf-to-crlf)
                        (ascii mac dos dos decoder-tests-lf-to-cr)
                        (ascii dos mac mac decoder-tests-lf-to-lflf)))
          (should-not (apply #'decoder-tests args))))
    ;; Always clean up the generated files, even after a failure.
    (decoder-tests-remove-files)))
157 | ||
(ert-deftest ert-test-decoder-latin ()
  "Check decoding of UTF-8 files, with and without signature (BOM).
Files are generated for every combination of `utf-8' and
`utf-8-with-signature' with the unix, dos and mac eol-types."
  (unwind-protect
      (progn
        (dolist (coding '("utf-8" "utf-8-with-signature"))
          (dolist (eol-type '("unix" "dos" "mac"))
            (decoder-tests-ao-gen-file
             'latin (intern (concat coding "-" eol-type)))))
        ;; Plain UTF-8: automatic detection and explicit coding.
        (should-not (decoder-tests 'latin 'utf-8-unix 'undecided 'utf-8-unix))
        (should-not (decoder-tests 'latin 'utf-8-unix 'utf-8-unix 'utf-8-unix))
        (should-not (decoder-tests 'latin 'utf-8-dos 'undecided 'utf-8-dos))
        (should-not (decoder-tests 'latin 'utf-8-dos 'utf-8-dos 'utf-8-dos))
        (should-not (decoder-tests 'latin 'utf-8-mac 'undecided 'utf-8-mac))
        (should-not (decoder-tests 'latin 'utf-8-mac 'utf-8-mac 'utf-8-mac))
        ;; Reading with a mismatching eol-type: the expected text is the
        ;; original contents run through the matching translator.
        (should-not (decoder-tests 'latin 'utf-8-dos 'unix 'utf-8-unix
                                   'decoder-tests-lf-to-crlf))
        (should-not (decoder-tests 'latin 'utf-8-mac 'dos 'utf-8-dos
                                   'decoder-tests-lf-to-cr))
        (should-not (decoder-tests 'latin 'utf-8-dos 'mac 'utf-8-mac
                                   'decoder-tests-lf-to-lflf))
        ;; UTF-8 with signature.
        (should-not (decoder-tests 'latin 'utf-8-with-signature-unix 'undecided
                                   'utf-8-with-signature-unix))
        (should-not (decoder-tests 'latin 'utf-8-with-signature-unix 'utf-8-auto
                                   'utf-8-with-signature-unix))
        (should-not (decoder-tests 'latin 'utf-8-with-signature-dos 'undecided
                                   'utf-8-with-signature-dos))
        ;; Reading a file with signature as `utf-8': the expected text
        ;; is the contents with the BOM character prepended.  (This
        ;; assertion used to appear twice verbatim; the redundant copy
        ;; was dropped.)
        (should-not (decoder-tests 'latin 'utf-8-with-signature-unix 'utf-8
                                   'utf-8-unix 'decoder-tests-add-bom)))
    ;; Always clean up the generated files, even after a failure.
    (decoder-tests-remove-files)))
188 | ||
(ert-deftest ert-test-decoder-binary ()
  "Check decoding of binary files written as raw-text with each eol-type.
Each entry of the table below is an argument list for `decoder-tests'."
  (unwind-protect
      (progn
        (mapc (lambda (coding)
                (decoder-tests-ao-gen-file 'binary coding))
              '(raw-text-unix raw-text-dos raw-text-mac))
        (dolist (args '((binary raw-text-unix undecided raw-text-unix)
                        (binary raw-text-dos undecided raw-text-dos)
                        (binary raw-text-mac undecided raw-text-mac)
                        (binary raw-text-dos unix raw-text-unix
                                decoder-tests-lf-to-crlf)
                        (binary raw-text-mac dos raw-text-dos
                                decoder-tests-lf-to-cr)
                        (binary raw-text-dos mac raw-text-mac
                                decoder-tests-lf-to-lflf)))
          (should-not (apply #'decoder-tests args))))
    ;; Always clean up the generated files, even after a failure.
    (decoder-tests-remove-files)))
208 | ||
3e3da660 KH |
209 | \f |
210 | ;;; Check the coding system `prefer-utf-8'. | |
211 | ||
212 | ;; Read FILE. Check if the encoding was detected as DETECT. If | |
213 | ;; PREFER is non-nil, prefer that coding system before reading. | |
214 | ||
(defun decoder-tests-prefer-utf-8-read (file detect prefer)
  "Read FILE and check that its coding system was detected as DETECT.
If PREFER is non-nil, FILE is read with that coding system put first
in the coding priority (via `with-coding-priority').
Return nil if `buffer-file-coding-system' is `eq' to DETECT, otherwise
a string describing the coding system that was detected instead."
  (with-temp-buffer
    (with-coding-priority (and prefer (list prefer))
      (insert-file-contents file))
    (unless (eq buffer-file-coding-system detect)
      (format "Invalid detection: %s" buffer-file-coding-system))))
222 | ||
223 | ;; Read FILE, modify it, and write it. Check if the coding system | |
224 | ;; used for writing was CODING. If CODING-TAG is non-nil, insert | |
225 | ;; coding tag with it before writing. If STR is non-nil, insert it | |
226 | ;; before writing. | |
227 | ||
(defun decoder-tests-prefer-utf-8-write (file coding-tag coding
                                              &optional str)
  "Read FILE, modify it, write it, and check the coding system used.
A first line is inserted at the beginning of the text: a coding tag
naming CODING-TAG if that is non-nil, otherwise just \";;\".  If STR is
non-nil, it is inserted right after that line.  The buffer is then
written to the file named by (decoder-tests-filename \\='test \\='test \"el\").

Return nil if `buffer-file-coding-system' after writing equals CODING
\(per `coding-system-equal'), otherwise a string naming the coding
system that was found instead."
  (with-temp-buffer
    (insert-file-contents file)
    (goto-char (point-min))
    (insert (if coding-tag
                (format ";; -*- coding: %s; -*-\n" coding-tag)
              ";;\n"))
    (when str
      (insert str))
    (write-file (decoder-tests-filename 'test 'test "el"))
    (unless (coding-system-equal buffer-file-coding-system coding)
      ;; Report the value that was actually compared above, so the
      ;; failure message can never disagree with the check itself.
      (format "Incorrect encoding: %s" buffer-file-coding-system))))
242 | ||
(ert-deftest ert-test-decoder-prefer-utf-8 ()
  "Check detection on reading and the coding system chosen on writing.
An ASCII file and a UTF-8 file are generated; the tables below hold
the argument lists for `decoder-tests-prefer-utf-8-read' and
`decoder-tests-prefer-utf-8-write'."
  (unwind-protect
      (let ((ascii-file (decoder-tests-gen-file
                         "ascii.el"
                         (decoder-tests-file-contents 'ascii)
                         'unix))
            (latin-file (decoder-tests-gen-file
                         "utf-8.el"
                         (decoder-tests-file-contents 'latin)
                         'utf-8-unix)))
        ;; Detection when reading: (FILE DETECT PREFER).
        (dolist (args (list (list ascii-file 'prefer-utf-8-unix nil)
                            (list latin-file 'utf-8-unix nil)
                            (list latin-file 'utf-8-unix 'iso-8859-1)
                            (list latin-file 'utf-8-unix 'sjis)))
          (should-not (apply #'decoder-tests-prefer-utf-8-read args)))
        ;; Coding system used when writing: (FILE CODING-TAG CODING [STR]).
        (dolist (args (list (list ascii-file nil 'prefer-utf-8-unix)
                            (list ascii-file 'iso-8859-1 'iso-8859-1-unix)
                            (list ascii-file nil 'utf-8-unix "À")))
          (should-not (apply #'decoder-tests-prefer-utf-8-write args))))
    ;; Always clean up the generated files, even after a failure.
    (decoder-tests-remove-files)))
bc9a5003 KH |
266 | |
267 | \f | |
268 | ;;; The following is for benchmark testing of the new optimized | |
269 | ;;; decoder, not for regression testing. | |
270 | ||
(defun generate-ascii-file ()
  "Insert 100000 lines of 80 `a' characters each at point."
  (let ((line (concat (make-string 80 ?a) "\n")))
    (dotimes (_ 100000)
      (insert line))))
275 | ||
(defun generate-rarely-nonascii-file ()
  "Insert 100000 lines of 80 characters with a single non-ASCII one.
Every line consists of `a' characters, except that the line with
index 50000 (counting from 0) starts with one `À'."
  (dotimes (line 100000)
    (cond ((= line 50000)
           (insert ?À)
           (insert-char ?a 79))
          (t
           (insert-char ?a 80)))
    (insert "\n")))
283 | ||
(defun generate-mostly-nonascii-file ()
  "Insert 60000 lines of 80 characters, half of them non-ASCII.
The text is 30000 lines of `a', then 20000 lines of `À', then
10000 lines of `あ'."
  (dolist (section '((30000 . ?a) (20000 . ?À) (10000 . ?あ)))
    (dotimes (_ (car section))
      (insert-char (cdr section) 80)
      (insert "\n"))))
294 | ||
295 | ||
(defvar test-file-list
  (let (groups)
    ;; Each spec is (GENERATOR FILE-NAME-STEM WRITE-CODING-PREFIX).
    (dolist (spec '((generate-ascii-file "ascii" "")
                    (generate-rarely-nonascii-file "utf-8-r" "utf-8-")
                    (generate-mostly-nonascii-file "utf-8-m" "utf-8-")))
      (let ((stem (nth 1 spec))
            entries)
        (dolist (eol '("unix" "dos"))
          (let ((write-coding (intern (concat (nth 2 spec) eol))))
            ;; Coding tag that names the eol-type explicitly.
            (push (list (format "~/%s-tag-utf-8-%s.%s" stem eol eol)
                        (format ";; -*- coding: utf-8-%s; -*-" eol)
                        write-coding)
                  entries)
            ;; Coding tag without an eol-type.
            (push (list (format "~/%s-tag-utf-8.%s" stem eol)
                        ";; -*- coding: utf-8; -*-"
                        write-coding)
                  entries)
            ;; No coding tag at all.
            (push (list (format "~/%s-tag-none.%s" stem eol)
                        ""
                        write-coding)
                  entries)))
        (push (cons (car spec) (nreverse entries)) groups)))
    (nreverse groups))
  "Benchmark file specifications.
Each element has the form (GENERATOR FILE-SPEC...), where GENERATOR
is a function that inserts the file body into the current buffer and
each FILE-SPEC is a list (FILE-NAME FIRST-LINE CODING-SYSTEM): the
file to write, the text of the line put in front of the body, and
the coding system used for writing.")
318 | ||
(defun generate-benchmark-test-file ()
  "Write all benchmark files described by `test-file-list'.
For each group, the generator function fills a temporary buffer once;
then, for each file of the group, the file's first line is put in
front of that text, the buffer is written to the file with the file's
coding system, and the first line is removed again."
  (interactive)
  (with-temp-buffer
    (message "Generating data...")
    (dolist (group test-file-list)
      (erase-buffer)
      (funcall (car group))
      (dolist (entry (cdr group))
        (let ((path (car entry))
              (first-line (nth 1 entry))
              (coding (nth 2 entry)))
          (message "Writing %s..." path)
          (goto-char (point-min))
          (insert first-line "\n")
          (let ((coding-system-for-write coding))
            (write-region (point-min) (point-max) path))
          ;; Point is still right after the inserted first line, so
          ;; this removes exactly that line and restores the body.
          (delete-region (point-min) (point)))))))
333 | ||
(defun benchmark-decoder ()
  "Benchmark reading the files of `test-file-list' and insert the results.
Every file is read 10 times with `benchmark-run', first with
`disable-ascii-optimization' bound to t and then with it bound to
nil.  A heading for each pass and one \"FILE: RESULT\" line per file
are inserted into the current buffer."
  (let ((gc-cons-threshold 4000000))
    ;; Each pass is (HEADING . VALUE-OF-disable-ascii-optimization).
    (dolist (pass '(("Without optimization:\n" . t)
                    ("With optimization:\n" . nil)))
      (insert (car pass))
      (dolist (group test-file-list)
        (dolist (entry (cdr group))
          (let* ((disable-ascii-optimization (cdr pass))
                 (result (benchmark-run 10
                           (with-temp-buffer
                             (insert-file-contents (car entry))))))
            (insert (format "%s: %s\n" (car entry) result))))))))