prefer compilers earlier in list
[bpt/guile.git] / module / ice-9 / regex.scm
CommitLineData
bd6fed8e 1;;;; Copyright (C) 1997, 1999, 2001, 2004, 2005, 2006, 2008, 2010 Free Software Foundation, Inc.
87fefc1c 2;;;;
73be1d9e
MV
3;;;; This library is free software; you can redistribute it and/or
4;;;; modify it under the terms of the GNU Lesser General Public
5;;;; License as published by the Free Software Foundation; either
53befeb7 6;;;; version 3 of the License, or (at your option) any later version.
73be1d9e
MV
7;;;;
8;;;; This library is distributed in the hope that it will be useful,
400d7382 9;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11;;;; Lesser General Public License for more details.
12;;;;
13;;;; You should have received a copy of the GNU Lesser General Public
14;;;; License along with this library; if not, write to the Free Software
92205699 15;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
87fefc1c
TTN
16;;;;
17
18;;; Commentary:
19
20;; These procedures are exported:
21;; (match:count match)
22;; (match:string match)
23;; (match:prefix match)
24;; (match:suffix match)
25;; (regexp-match? match)
26;; (regexp-quote string)
27;; (match:start match . submatch-num)
28;; (match:end match . submatch-num)
29;; (match:substring match . submatch-num)
30;; (string-match pattern str . start)
31;; (regexp-substitute port match . items)
32;; (fold-matches regexp string init proc . flags)
33;; (list-matches regexp string . flags)
34;; (regexp-substitute/global port regexp string . items)
35
36;;; Code:
400d7382
JB
37\f
38;;;; POSIX regex support functions.
39
1a179b03 40(define-module (ice-9 regex)
bd6fed8e
AW
41 #:export (match:count match:string match:prefix match:suffix
42 regexp-match? regexp-quote match:start match:end match:substring
43 string-match regexp-substitute fold-matches list-matches
44 regexp-substitute/global))
05817d9e 45
2b28ce5b
KR
46;; References:
47;;
48;; POSIX spec:
49;; http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
50
400d7382
JB
51;;; FIXME:
52;;; It is not clear what should happen if a `match' function
53;;; is passed a `match number' which is out of bounds for the
54;;; regexp match: return #f, or throw an error? These routines
55;;; throw an out-of-range error.
56
57;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
58;;;; These procedures are not defined in SCSH, but I found them useful.
59
;; Return the number of slots in MATCH that follow the leading string
;; slot, i.e. the length of the match vector minus one.
(define (match:count match)
  (1- (vector-length match)))
62
;; Return the object stored in slot 0 of MATCH.  `regexp-match?' in
;; this file requires that slot to be a string.
(define (match:string match)
  (vector-ref match 0))
65
;; Return the portion of the matched string that lies before the start
;; of submatch 0 (the overall match).
(define (match:prefix match)
  (let ((whole (match:string match))
        (match-start (match:start match 0)))
    (substring whole 0 match-start)))
400d7382 68
;; Return the portion of the matched string that lies after the end of
;; submatch 0 (the overall match).
(define (match:suffix match)
  (let ((whole (match:string match))
        (match-end (match:end match 0)))
    (substring whole match-end)))
400d7382
JB
71
72;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
73;;;; SCSH compatibility routines.
74
;; Return #t if MATCH has the shape of a match structure as used in
;; this file: a non-empty vector whose slot 0 is a string and whose
;; remaining slots are all pairs with an integer car and an integer cdr.
;; Return #f for any other object.
(define (regexp-match? match)
  (and (vector? match)
       ;; Guard the slot-0 access below.  Without it an empty vector
       ;; makes `vector-ref' signal an out-of-range error, whereas a
       ;; predicate should simply answer #f.
       (positive? (vector-length match))
       (string? (vector-ref match 0))
       (let loop ((i 1))
         (if (>= i (vector-length match))
             #t
             (let ((slot (vector-ref match i)))
               (and (pair? slot)
                    (integer? (car slot))
                    (integer? (cdr slot))
                    (loop (+ 1 i))))))))
400d7382 85
2b28ce5b
KR
86;; * . \ ^ $ and [ are special in both regexp/basic and regexp/extended and
87;; can be backslash escaped.
88;;
89;; ( ) + ? { } and | are special in regexp/extended so must be quoted. But
90;; that can't be done with a backslash since in regexp/basic where they're
91;; not special, adding a backslash makes them become special. Character
92;; class forms [(] etc are used instead.
93;;
94;; ) is not special when not preceded by a (, and * and ? are not special at
95;; the start of a string, but we quote all of these always, so the result
96;; can be concatenated or merged into some larger regexp.
97;;
98;; ] is not special outside a [ ] character class, so doesn't need to be
99;; quoted.
100;;
;; Return a copy of STRING in which every regexp-special character is
;; quoted: * . \ ^ $ [ get a leading backslash, while ( ) + ? { } | are
;; wrapped in a one-character bracket expression such as [(].
(define (regexp-quote string)
  ;; Write the quoted form of the single character CH to PORT.
  (define (emit-quoted ch port)
    (cond ((memv ch '(#\* #\. #\\ #\^ #\$ #\[))
           (write-char #\\ port)
           (write-char ch port))
          ((memv ch '(#\( #\) #\+ #\? #\{ #\} #\|))
           (write-char #\[ port)
           (write-char ch port)
           (write-char #\] port))
          (else
           (write-char ch port))))
  (call-with-output-string
   (lambda (port)
     (string-for-each (lambda (ch) (emit-quoted ch port))
                      string))))
400d7382 116
ff10e93c
AW
;; Return the start offset recorded in MATCH for submatch N (default 0,
;; the overall match), or #f if the recorded offset is -1.  The offsets
;; for submatch N live in slot N+1 of the match vector, so an N that is
;; out of bounds makes `vector-ref' signal an error.
(define* (match:start match #:optional (n 0))
  (let* ((span (vector-ref match (+ n 1)))
         (from (car span)))
    (and (not (= from -1))
         from)))
120
ff10e93c
AW
;; Return the end offset recorded in MATCH for submatch N (default 0,
;; the overall match), or #f if the recorded offset is -1.  The offsets
;; for submatch N live in slot N+1 of the match vector, so an N that is
;; out of bounds makes `vector-ref' signal an error.
(define* (match:end match #:optional (n 0))
  (let ((to (cdr (vector-ref match (+ n 1)))))
    (cond ((= to -1) #f)
          (else to))))
124
ff10e93c
AW
;; Return the text of submatch N (default 0, the overall match) of
;; MATCH, or #f when either `match:start' or `match:end' reports #f for
;; that submatch.
(define* (match:substring match #:optional (n 0))
  (let* ((from (match:start match n))
         (to (match:end match n)))
    (if (and from to)
        (substring (match:string match) from to)
        #f)))
400d7382 129
;; Compile PATTERN with `make-regexp' and run it against STR with
;; `regexp-exec'.  The first element of ARGS, when present, is passed as
;; the start offset; otherwise the search starts at 0.  Any further
;; elements of ARGS are ignored.
(define (string-match pattern str . args)
  (define offset
    (if (null? args)
        0
        (car args)))
  (regexp-exec (make-regexp pattern) str offset))
134
;; Write the text described by ITEMS to PORT, in order.  Each item is
;; one of:
;;   a string   -- written as is;
;;   an integer -- the result of `match:substring' on MATCH for that
;;                 submatch number;
;;   'pre       -- the result of `match:prefix' on MATCH;
;;   'post      -- the result of `match:suffix' on MATCH.
;; Any other item signals an error.  When PORT is #f the output is
;; collected in a string, which is returned.
(define (regexp-substitute port match . items)
  ;; Map one substitution item to the object that should be displayed.
  (define (item->text item)
    (cond ((string? item) item)
          ((integer? item) (match:substring match item))
          ((eq? item 'pre) (match:prefix match))
          ((eq? item 'post) (match:suffix match))
          (else (error 'wrong-type-arg item))))
  ;; Display every item on OUT, left to right.
  (define (emit-all out)
    (for-each (lambda (item)
                (display (item->text item) out))
              items))
  (if port
      (emit-all port)
      (call-with-output-string emit-all)))
400d7382 150
50ff2ecb
JB
151;;; If we call fold-matches, below, with a regexp that can match the
152;;; empty string, it's not obvious what "all the matches" means. How
153;;; many empty strings are there in the string "a"? Our answer:
154;;;
bd6fed8e 155;;; This function applies PROC to every non-overlapping, maximal
50ff2ecb
JB
156;;; match of REGEXP in STRING.
157;;;
158;;; "non-overlapping": There are two non-overlapping matches of "" in
159;;; "a" --- one before the `a', and one after. There are three
160;;; non-overlapping matches of "q|x*" in "aqb": the empty strings
161;;; before `a' and after `b', and `q'. The two empty strings before
162;;; and after `q' don't count, because they overlap with the match of
163;;; "q".
164;;;
165;;; "maximal": There are three distinct maximal matches of "x*" in
166;;; "axxxb": one before the `a', one covering `xxx', and one after the
167;;; `b'. Around or within `xxx', only the match covering all three
168;;; x's counts, because the rest are not maximal.
169
ff10e93c
AW
;; Fold PROC over the successive matches of REGEXP in STRING.  REGEXP
;; may be a compiled regexp or a pattern, which is then compiled with
;; `make-regexp'.  PROC is called as (PROC match accumulator), starting
;; with INIT as the accumulator, and the final accumulator is returned.
;; FLAGS is passed on to `regexp-exec'; `regexp/notbol' is OR-ed in for
;; every search that does not start at offset 0.
(define* (fold-matches regexp string init proc #:optional (flags 0))
  (let* ((rx (if (regexp? regexp) regexp (make-regexp regexp)))
         (limit (string-length string)))
    (let scan ((pos 0)
               (acc init)
               (adjacent? #f))      ; True if pos abuts a previous match.
      (if (> pos limit)
          ;; Searched past the end of the string: nothing more to find.
          acc
          (let* ((bol (if (zero? pos) 0 regexp/notbol))
                 (m (regexp-exec rx string pos (logior flags bol))))
            (cond
             ((not m) acc)
             ((and (= (match:start m) (match:end m)) adjacent?)
              ;; An empty match here would overlap the match that ended
              ;; immediately before.  Retry one position to the right.
              (scan (+ pos 1) acc #f))
             (else
              (scan (match:end m) (proc m acc) #t))))))))
50ff2ecb 187
ff10e93c
AW
;; Return a list of the match structures that `fold-matches' visits for
;; REGEXP in STRING, in the order they were visited.  FLAGS is passed
;; through to `fold-matches'.
(define* (list-matches regexp string #:optional (flags 0))
  ;; Consing accumulates the matches newest-first, so reverse at the end.
  (let ((newest-first (fold-matches regexp string '() cons flags)))
    (reverse! newest-first)))
50ff2ecb 190
;; For the matches of REGEXP in STRING, as returned by `list-matches',
;; write the text described by ITEMS to PORT.  Each item is one of:
;;   a string    -- written as is;
;;   an integer  -- the result of `match:substring' on the current
;;                  match for that submatch number;
;;   a procedure -- called with the current match; its result is
;;                  written;
;;   'pre        -- the text between the end of the previous match (or
;;                  the start of STRING) and the start of this match;
;;   'post       -- continue with the next match; once no matches
;;                  remain, the rest of STRING is written.
;; Any other item signals an error.  When PORT is #f the output is
;; collected in a string, which is returned.
(define (regexp-substitute/global port regexp string . items)

  (define (substitute-into out)
    ;; Walk the set of non-overlapping, maximal matches.
    (let walk ((remaining (list-matches regexp string))
               (resume 0))
      (cond
       ((null? remaining)
        (display (substring string resume) out))
       (else
        (let ((m (car remaining)))

          ;; Handle a single item for the current match M.
          (define (emit item)
            (cond
             ((string? item) (display item out))
             ((integer? item) (display (match:substring m item) out))
             ((procedure? item) (display (item m) out))
             ((eq? item 'pre)
              (display (substring string resume (match:start m)) out))
             ((eq? item 'post)
              (walk (cdr remaining) (match:end m)))
             (else (error 'wrong-type-arg item))))

          ;; Process all of the items for this match.  This is an
          ;; explicit loop rather than for-each so that a 'post at the
          ;; end of the item list is handled by a tail call.
          (let each ((todo items))
            (when (pair? todo)
              (if (null? (cdr todo))
                  (emit (car todo))       ; This is a tail call.
                  (begin
                    (emit (car todo))     ; This is not.
                    (each (cdr todo)))))))))))

  ;; If `port' is #f, send output to a string.
  (if port
      (substitute-into port)
      (call-with-output-string substitute-into)))