More setlocale robustness in regexp tests
[bpt/guile.git] / test-suite / tests / regexp.test
1 ;;;; regexp.test --- test Guile's regular expression functions -*- scheme -*-
2 ;;;; Jim Blandy <jimb@red-bean.com> --- September 1999
3 ;;;;
4 ;;;; Copyright (C) 1999, 2004, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
5 ;;;;
6 ;;;; This library is free software; you can redistribute it and/or
7 ;;;; modify it under the terms of the GNU Lesser General Public
8 ;;;; License as published by the Free Software Foundation; either
9 ;;;; version 3 of the License, or (at your option) any later version.
10 ;;;;
11 ;;;; This library is distributed in the hope that it will be useful,
12 ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;;;; Lesser General Public License for more details.
15 ;;;;
16 ;;;; You should have received a copy of the GNU Lesser General Public
17 ;;;; License along with this library; if not, write to the Free Software
18 ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
20 (define-module (test-suite test-regexp)
21 #:use-module (test-suite lib)
22 #:use-module (srfi srfi-1)
23 #:use-module (ice-9 regex))
24
25 ;; Set the locale to LOC, if possible. Failing that, set the locale
26 ;; to C. If that fails, force the port encoding to ASCII.
27 (define (mysetlocale loc)
28 (or
29 (and (defined? 'setlocale)
30 (false-if-exception (setlocale LC_ALL loc)))
31 (and (defined? 'setlocale)
32 (false-if-exception (setlocale LC_ALL "C")))
33 (begin
34 (false-if-exception (set-port-encoding! (current-input-port)
35 "ASCII"))
36 (false-if-exception (set-port-encoding! (current-output-port)
37 "ASCII"))
38 #f)))
39
40 ;; Set the locale to a Latin-1 friendly locale. Failing that, force
41 ;; the port encoding to Latin-1. Returns the encoding used.
42 (define (set-latin-1)
43 (set-port-conversion-strategy! (current-output-port) 'escape)
44 (or
45 (any
46 (lambda (loc)
47 (if (defined? 'setlocale)
48 (let ((ret (false-if-exception (setlocale LC_ALL loc))))
49 (if ret
50 loc
51 #f))
52 #f))
53 (append
54 (map (lambda (name)
55 (string-append name ".ISO-8859-1"))
56 '("fr_FR" "es_MX" "es_ES" "en_GB" "en_US" "de_DE" "pt_PT"))
57 (map (lambda (name)
58 (string-append name ".iso88591"))
59 '("fr_FR" "es_MX" "es_ES" "en_GB" "en_US" "de_DE" "pt_PT"))))
60 (begin
61 (false-if-exception (set-port-encoding! (current-input-port)
62 "ISO-8859-1"))
63 (false-if-exception (set-port-encoding! (current-output-port)
64 "ISO-8859-1"))
65 #f)))
66
67 (mysetlocale "C")
68
69 \f
70 ;;; Run a regexp-substitute or regexp-substitute/global test, once
71 ;;; providing a real port and once providing #f, requesting direct
72 ;;; string output.
73 (define (vary-port func expected . args)
74 (pass-if "port is string port"
75 (equal? expected
76 (call-with-output-string
77 (lambda (port)
78 (apply func port args)))))
79 (pass-if "port is #f"
80 (equal? expected
81 (apply func #f args))))
82
83 (define (object->string obj)
84 (call-with-output-string
85 (lambda (port)
86 (write obj port))))
87
88 ;;;
89 ;;; make-regexp
90 ;;;
91
92 (with-test-prefix "make-regexp"
93
94 (pass-if-exception "no args" exception:wrong-num-args
95 (make-regexp))
96
97 (pass-if-exception "bad pat arg" exception:wrong-type-arg
98 (make-regexp 'blah))
99
100 ;; in guile prior to 1.6.5 make-regex didn't validate its flags args
101 (pass-if-exception "bad arg 2" exception:wrong-type-arg
102 (make-regexp "xyz" 'abc))
103
104 (pass-if-exception "bad arg 3" exception:wrong-type-arg
105 (make-regexp "xyz" regexp/icase 'abc)))
106
107 ;;;
108 ;;; match:string
109 ;;;
110
111 (with-test-prefix "match:string"
112
113 (pass-if "foo"
114 (string=? "foo" (match:string (string-match ".*" "foo"))))
115
116 (pass-if "foo offset 1"
117 (string=? "foo" (match:string (string-match ".*" "foo" 1)))))
118
119 ;;;
120 ;;; regexp-exec
121 ;;;
122
123 (with-test-prefix "regexp-exec"
124
125 (pass-if-exception "non-integer offset" exception:wrong-type-arg
126 (let ((re (make-regexp "ab+")))
127 (regexp-exec re "aaaabbbb" 1.5 'bogus-flags-arg)))
128
129 (pass-if-exception "non-string input" exception:wrong-type-arg
130 (let ((re (make-regexp "ab+")))
131 (regexp-exec re 'not-a-string)))
132
133 (pass-if-exception "non-string input, with offset" exception:wrong-type-arg
134 (let ((re (make-regexp "ab+")))
135 (regexp-exec re 'not-a-string 5)))
136
137 ;; in guile 1.8.1 and earlier, a #\nul character in the input string was
138 ;; only detected in a critical section, and the resulting error throw
139 ;; abort()ed the program
140 (pass-if-exception "nul in input" exception:string-contains-nul
141 (let ((re (make-regexp "ab+")))
142 (regexp-exec re (string #\a #\b (integer->char 0)))))
143
144 ;; in guile 1.8.1 and earlier, a bogus flags argument was only detected
145 ;; inside a critical section, and the resulting error throw abort()ed the
146 ;; program
147 (pass-if-exception "non-integer flags" exception:wrong-type-arg
148 (let ((re (make-regexp "ab+")))
149 (regexp-exec re "aaaabbbb" 0 'bogus-flags-arg))))
150
151 ;;;
152 ;;; fold-matches
153 ;;;
154
155 (with-test-prefix "fold-matches"
156
157 (pass-if "without flags"
158 (equal? '("hello")
159 (fold-matches "^[a-z]+$" "hello" '()
160 (lambda (match result)
161 (cons (match:substring match)
162 result)))))
163
164 (pass-if "with flags"
165 ;; Prior to 1.8.6, passing an additional flag would not work.
166 (null?
167 (fold-matches "^[a-z]+$" "hello" '()
168 (lambda (match result)
169 (cons (match:substring match)
170 result))
171 (logior regexp/notbol regexp/noteol)))))
172
173
174 ;;;
175 ;;; regexp-quote
176 ;;;
177
178 (with-test-prefix "regexp-quote"
179
180 (pass-if-exception "no args" exception:wrong-num-args
181 (regexp-quote))
182
183 (pass-if-exception "bad string arg" exception:wrong-type-arg
184 (regexp-quote 'blah))
185
186 (let ((lst `((regexp/basic ,regexp/basic)
187 (regexp/extended ,regexp/extended)))
188 ;; string of all characters, except #\nul which doesn't work because
189 ;; it's the usual end-of-string for the underlying C regexec()
190 (allchars (list->string (map integer->char
191 (cdr (iota char-code-limit))))))
192 (for-each
193 (lambda (elem)
194 (let ((name (car elem))
195 (flag (cadr elem)))
196
197 (with-test-prefix name
198
199 ;; try on each individual character, except #\nul
200 (do ((i 1 (1+ i)))
201 ((>= i char-code-limit))
202 (let* ((c (integer->char i))
203 (s (string c))
204 (q (regexp-quote s)))
205 (pass-if (list "char" i (format #f "~s ~s ~s" c s q))
206 (set-latin-1) ; set locale for regexp processing
207 ; on binary data
208 (let ((m (regexp-exec (make-regexp q flag) s)))
209 (mysetlocale "") ; restore locale
210 (and (= 0 (match:start m))
211 (= 1 (match:end m)))))))
212
213 ;; try on pattern "aX" where X is each character, except #\nul
214 ;; this exposes things like "?" which are special only when they
215 ;; follow a pattern to repeat or whatever ("a" in this case)
216 (do ((i 1 (1+ i)))
217 ((>= i char-code-limit))
218 (let* ((c (integer->char i))
219 (s (string #\a c))
220 (q (regexp-quote s)))
221 (pass-if (list "string \"aX\"" i (format #f "~s ~s ~s" c s q))
222 (set-latin-1)
223 (let* ((m (regexp-exec (make-regexp q flag) s)))
224 (mysetlocale "")
225 (and (= 0 (match:start m))
226 (= 2 (match:end m)))))))
227
228 (pass-if "string of all chars"
229 (setbinary)
230 (let ((m (regexp-exec (make-regexp (regexp-quote allchars)
231 flag) allchars)))
232 (and (= 0 (match:start m))
233 (= (string-length allchars) (match:end m))))))))
234 lst)))
235
236 ;;;
237 ;;; regexp-substitute
238 ;;;
239 (mysetlocale "C")
240
241 (with-test-prefix "regexp-substitute"
242 (let ((match
243 (string-match "patleft(sub1)patmid(sub2)patright"
244 "contleftpatleftsub1patmidsub2patrightcontright")))
245 (define (try expected . args)
246 (with-test-prefix (object->string args)
247 (apply vary-port regexp-substitute expected match args)))
248
249 (try "")
250 (try "string1" "string1")
251 (try "string1string2" "string1" "string2")
252 (try "patleftsub1patmidsub2patright" 0)
253 (try "hi-patleftsub1patmidsub2patright-bye" "hi-" 0 "-bye")
254 (try "sub1" 1)
255 (try "hi-sub1-bye" "hi-" 1 "-bye")
256 (try "hi-sub2-bye" "hi-" 2 "-bye")
257 (try "contleft" 'pre)
258 (try "contright" 'post)
259 (try "contrightcontleft" 'post 'pre)
260 (try "contrightcontleftcontrightcontleft" 'post 'pre 'post 'pre)
261 (try "contrightsub2sub1contleft" 'post 2 1 'pre)
262 (try "foosub1sub1sub1sub1bar" "foo" 1 1 1 1 "bar")))
263
264 (with-test-prefix "regexp-substitute/global"
265
266 (define (try expected . args)
267 (with-test-prefix (object->string args)
268 (apply vary-port regexp-substitute/global expected args)))
269
270 (try "hi" "a(x*)b" "ab" "hi")
271 (try "" "a(x*)b" "ab" 1)
272 (try "xx" "a(x*)b" "axxb" 1)
273 (try "xx" "a(x*)b" "_axxb_" 1)
274 (try "pre" "a(x*)b" "preaxxbpost" 'pre)
275 (try "post" "a(x*)b" "preaxxbpost" 'post)
276 (try "string" "x" "string" 'pre "y" 'post)
277 (try "4" "a(x*)b" "_axxb_" (lambda (m)
278 (number->string (match:end m 1))))
279
280 (try "_aybycyd_" "x+" "_axbxxcxxxd_" 'pre "y" 'post)
281
282 ;; This should not go into an infinite loop, just because the regexp
283 ;; can match the empty string. This test also kind of beats on our
284 ;; definition of where a null string can match.
285 (try "y_yaybycydy_y" "x*" "_axbxxcxxxd_" 'pre "y" 'post)
286
287 ;; These kind of bother me. The extension from regexp-substitute to
288 ;; regexp-substitute/global is only natural if your item list
289 ;; includes both pre and post. If those are required, why bother
290 ;; to include them at all?
291 (try "4:7:12:_" "a(x*)b" "_axxbaxbaxxxb_"
292 (lambda (m) (number->string (match:end m 1))) ":"
293 'post)
294 (try "4:10:19:_:19:10:4" "a(x*)b" "_axxbaxxxxbaxxxxxxxb_"
295 (lambda (m) (number->string (match:end m 1))) ":"
296 'post
297 ":" (lambda (m) (number->string (match:end m 1))))
298
299 ;; Jan Nieuwenhuizen's bug, 2 Sep 1999
300 (try "" "_" (make-string 500 #\_)
301 'post))