-;;;; regexp.test --- test Guile's regular expression functions -*- scheme -*-
+;;;; regexp.test --- test Guile's regexps -*- coding: utf-8; mode: scheme -*-
;;;; Jim Blandy <jimb@red-bean.com> --- September 1999
;;;;
-;;;; Copyright (C) 1999, 2004, 2006, 2007, 2008 Free Software Foundation, Inc.
-;;;;
+;;;; Copyright (C) 1999, 2004, 2006, 2007, 2008, 2009, 2010,
+;;;; 2012, 2013 Free Software Foundation, Inc.
+;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
;;;; License as published by the Free Software Foundation; either
(define-module (test-suite test-regexp)
#:use-module (test-suite lib)
+ #:use-module (srfi srfi-1)
#:use-module (ice-9 regex))
-(setlocale LC_ALL "C")
+(if (defined? 'setlocale)
+ (setlocale LC_ALL "C"))
+
+;; Don't fail if we can't display a test name to stdout/stderr.
+(set-port-conversion-strategy! (current-output-port) 'escape)
+(set-port-conversion-strategy! (current-error-port) 'escape)
\f
;;; Run a regexp-substitute or regexp-substitute/global test, once
(lambda (match result)
(cons (match:substring match)
result))
- (logior regexp/notbol regexp/noteol)))))
+ (logior regexp/notbol regexp/noteol))))
+
+ (pass-if "regexp/notbol is set correctly"
+ (equal? '("foo")
+ (fold-matches "^foo" "foofoofoofoo" '()
+ (lambda (match result)
+ (cons (match:substring match)
+ result))))))
;;;
;;; regexp-quote
;;;
-(define (with-latin1-locale thunk)
- ;; Try out several ISO-8859-1 locales and run THUNK under the one that
- ;; works (if any).
- (define %locales
- (append
- (map (lambda (name)
- (string-append name ".ISO-8859-1"))
- '("fr_FR" "es_ES" "en_GB" "en_US" "de_DE" "pt_PT"))
- (map (lambda (name)
- (string-append name ".iso88591"))
- '("fr_FR" "es_ES" "en_GB" "en_US" "de_DE" "pt_PT"))))
-
-
- (let loop ((locales %locales))
- (if (null? locales)
- (throw 'unresolved)
- (catch 'unresolved
- (lambda ()
- (with-locale (car locales) thunk))
- (lambda (key . args)
- (loop (cdr locales)))))))
-
-
+(define-syntax with-ascii-or-latin1-locale
+ (syntax-rules ()
+ ((_ chr body ...)
+ (if (> chr 127)
+ (with-latin1-locale body ...)
+ (begin body ...)))))
+
+;; Since `regexp-quote' uses string ports, and since it is used below
+;; with non-ASCII characters, these ports must be Unicode-capable.
+(define-syntax with-unicode
+ (syntax-rules ()
+ ((_ exp)
+ (with-fluids ((%default-port-encoding "UTF-8"))
+ exp))))
(with-test-prefix "regexp-quote"
(let ((lst `((regexp/basic ,regexp/basic)
(regexp/extended ,regexp/extended)))
- ;; string of all characters, except #\nul which doesn't work because
- ;; it's the usual end-of-string for the underlying C regexec()
- (allchars (list->string (map integer->char
- (cdr (iota char-code-limit))))))
+ ;; String of all latin-1 characters, except #\nul which doesn't
+ ;; work because it's the usual end-of-string for the underlying
+ ;; C regexec().
+ (allchars (list->string (map integer->char (cdr (iota 256))))))
(for-each
(lambda (elem)
(let ((name (car elem))
(with-test-prefix name
- ;; try on each individual character, except #\nul
+ ;; Try on each individual latin-1 character, except #\nul.
(do ((i 1 (1+ i)))
- ((>= i char-code-limit))
+ ((>= i 256))
(let* ((c (integer->char i))
(s (string c)))
(pass-if (list "char" i (format #f "~s ~s" c s))
- (with-latin1-locale
- (let* ((q (regexp-quote s))
+ (with-ascii-or-latin1-locale i
+ (let* ((q (with-unicode (regexp-quote s)))
(m (regexp-exec (make-regexp q flag) s)))
(and (= 0 (match:start m))
(= 1 (match:end m))))))))
- ;; try on pattern "aX" where X is each character, except #\nul
- ;; this exposes things like "?" which are special only when they
- ;; follow a pattern to repeat or whatever ("a" in this case)
+ ;; Try on pattern "aX" where X is each latin-1 character,
+ ;; except #\nul. This exposes things like "?" which are
+ ;; special only when they follow a pattern to repeat or
+ ;; whatever ("a" in this case).
(do ((i 1 (1+ i)))
- ((>= i char-code-limit))
+ ((>= i 256))
(let* ((c (integer->char i))
(s (string #\a c))
- (q (regexp-quote s)))
- (pass-if (list "string \"aX\"" i (format #f "~s ~s" c s))
- (with-latin1-locale
- (let* ((q (regexp-quote s))
- (m (regexp-exec (make-regexp q flag) s)))
+ (q (with-unicode (regexp-quote s))))
+ (pass-if (list "string \"aX\"" i (format #f "~s ~s ~s" c s q))
+ (with-ascii-or-latin1-locale i
+ (let* ((m (regexp-exec (make-regexp q flag) s)))
(and (= 0 (match:start m))
(= 2 (match:end m))))))))
(pass-if "string of all chars"
(with-latin1-locale
- (let ((m (regexp-exec (make-regexp (regexp-quote allchars)
- flag) allchars)))
- (and (= 0 (match:start m))
- (= (string-length allchars) (match:end m)))))))))
+ (let ((m (regexp-exec (make-regexp (with-unicode
+ (regexp-quote allchars))
+ flag) allchars)))
+ (and (= 0 (match:start m))
+ (= (string-length allchars) (match:end m)))))))))
lst)))
;;;
;; Jan Nieuwenhuizen's bug, 2 Sep 1999
(try "" "_" (make-string 500 #\_)
'post))
+
+(with-test-prefix "nonascii locales"
+ (pass-if "match structures refer to char offsets"
+ (with-locale "en_US.utf8"
+ ;; bug #31650
+ (equal? (match:substring (string-match ".*" "calçot") 0)
+ "calçot")))
+
+ (pass-if "match structures refer to char offsets, non-ASCII pattern"
+ (with-locale "en_US.utf8"
+ ;; bug #31650
+ (equal? (match:substring (string-match "λ: The Ultimate (.*)"
+ "λ: The Ultimate GOTO")
+ 1)
+ "GOTO"))))