Rewritten to take advantage of shy-groups and
[bpt/emacs.git] / lisp / emacs-lisp / regexp-opt.el
CommitLineData
56329bc5
RS
1;;; regexp-opt.el --- generate efficient regexps to match strings.
2
c0056275 3;; Copyright (C) 1994,95,96,97,98,99,2000 Free Software Foundation, Inc.
56329bc5 4
5762abec 5;; Author: Simon Marshall <simon@gnu.org>
fcc31755 6;; Maintainer: FSF
370893a1 7;; Keywords: strings, regexps, extensions
56329bc5
RS
8
9;; This file is part of GNU Emacs.
10
11;; GNU Emacs is free software; you can redistribute it and/or modify
12;; it under the terms of the GNU General Public License as published by
13;; the Free Software Foundation; either version 2, or (at your option)
14;; any later version.
15
16;; GNU Emacs is distributed in the hope that it will be useful,
17;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19;; GNU General Public License for more details.
20
21;; You should have received a copy of the GNU General Public License
22;; along with GNU Emacs; see the file COPYING. If not, write to the
23;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24;; Boston, MA 02111-1307, USA.
25
26;;; Commentary:
27
b02b54a8 28;; The "opt" in "regexp-opt" stands for "optim\\(al\\|i[sz]e\\)".
56329bc5 29;;
25544ce1
SM
30;; This package generates a regexp from a given list of strings (which matches
31;; one of those strings) so that the regexp generated by:
56329bc5 32;;
25544ce1
SM
33;; (regexp-opt strings)
34;;
35;; is equivalent to, but more efficient than, the regexp generated by:
36;;
37;; (mapconcat 'regexp-quote strings "\\|")
56329bc5
RS
38;;
39;; For example:
40;;
41;; (let ((strings '("cond" "if" "when" "unless" "while"
42;; "let" "let*" "progn" "prog1" "prog2"
43;; "save-restriction" "save-excursion" "save-window-excursion"
44;; "save-current-buffer" "save-match-data"
45;; "catch" "throw" "unwind-protect" "condition-case")))
46;; (concat "(" (regexp-opt strings t) "\\>"))
47;; => "(\\(c\\(atch\\|ond\\(ition-case\\)?\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(current-buffer\\|excursion\\|match-data\\|restriction\\|window-excursion\\)\\|throw\\|un\\(less\\|wind-protect\\)\\|wh\\(en\\|ile\\)\\)\\>"
48;;
25544ce1
SM
49;; Searching using the above example `regexp-opt' regexp takes approximately
50;; two-thirds of the time taken using the equivalent `mapconcat' regexp.
51
56329bc5
RS
52;; Since this package was written to produce efficient regexps, not regexps
53;; efficiently, it is probably not a good idea to in-line too many calls in
54;; your code, unless you use the following trick with `eval-when-compile':
55;;
56;; (defvar definition-regexp
57;; (eval-when-compile
58;; (concat "^("
59;; (regexp-opt '("defun" "defsubst" "defmacro" "defalias"
60;; "defvar" "defconst") t)
61;; "\\>")))
62;;
63;; The `byte-compile' code will be as if you had defined the variable thus:
64;;
65;; (defvar definition-regexp
66;; "^(\\(def\\(alias\\|const\\|macro\\|subst\\|un\\|var\\)\\)\\>")
67;;
25544ce1
SM
68;; Note that if you use this trick for all instances of `regexp-opt' and
69;; `regexp-opt-depth' in your code, regexp-opt.el would only have to be loaded
70;; at compile time. But note also that using this trick means that should
71;; regexp-opt.el be changed, perhaps to fix a bug or to add a feature to
72;; improve the efficiency of `regexp-opt' regexps, you would have to recompile
73;; your code for such changes to have effect in your code.
74
75;; Originally written for font-lock.el, from an idea from Stig's hl319.el, with
b02b54a8
GM
76;; thanks for ideas also to Michael Ernst, Bob Glickstein, Dan Nicolaescu and
77;; Stefan Monnier.
78;; No doubt `regexp-opt' doesn't always produce optimal regexps, so code, ideas
79;; or any other information to improve things are welcome.
c0056275
SM
80;;
81;; One possible improvement would be to compile '("aa" "ab" "ba" "bb")
82;; into "[ab][ab]" rather than "a[ab]\\|b[ab]". I'm not sure it's worth
83;; it but if someone knows how to do it without going through too many
84;; contortions, I'm all ears.
56329bc5 85\f
c0056275 86;;; Code:
56329bc5
RS
87
88;;;###autoload
89(defun regexp-opt (strings &optional paren)
90 "Return a regexp to match a string in STRINGS.
582305b0
RS
91Each string should be unique in STRINGS and should not contain any regexps,
92quoted or not. If optional PAREN is non-nil, ensure that the returned regexp
93is enclosed by at least one regexp grouping construct.
56329bc5
RS
94The returned regexp is typically more efficient than the equivalent regexp:
95
25544ce1 96 (let ((open-paren (if PAREN \"\\\\(\" \"\")) (close-paren (if PAREN \"\\\\)\" \"\")))
c0056275 97 (concat open-paren (mapconcat 'regexp-quote STRINGS \"\\\\|\") close-paren))"
56329bc5
RS
98 (save-match-data
99 ;; Recurse on the sorted list.
100 (let ((max-lisp-eval-depth (* 1024 1024))
101 (completion-ignore-case nil))
c0056275 102 (setq paren (cond ((stringp paren) paren) (paren "\\(")))
56329bc5
RS
103 (regexp-opt-group (sort (copy-sequence strings) 'string-lessp) paren))))
104
105;;;###autoload
106(defun regexp-opt-depth (regexp)
107 "Return the depth of REGEXP.
108This means the number of regexp grouping constructs (parenthesised expressions)
109in REGEXP."
110 (save-match-data
111 ;; Hack to signal an error if REGEXP does not have balanced parentheses.
112 (string-match regexp "")
113 ;; Count the number of open parentheses in REGEXP.
114 (let ((count 0) start)
ac5cb26d
SM
115 (while (string-match "\\(\\`\\|[^\\]\\)\\\\\\(\\\\\\\\\\)*([^?]"
116 regexp start)
56329bc5
RS
117 (setq count (1+ count) start (match-end 0)))
118 count)))
119\f
120;;; Workhorse functions.
121
122(eval-when-compile
123 (require 'cl))
124
125(unless (fboundp 'make-bool-vector)
126 (defalias 'make-bool-vector 'make-vector))
127
128(defun regexp-opt-group (strings &optional paren lax)
c0056275
SM
129 "Return a regexp to match a string in STRINGS.
130If PAREN non-nil, output regexp parentheses around returned regexp.
131If LAX non-nil, don't output parentheses if it doesn't require them.
132Merges keywords to avoid backtracking in Emacs' regexp matcher.
133
134The basic idea is to find the shortest common prefix or suffix, remove it
135and recurse. If there is no prefix, we divide the list into two so that
136\(at least) one half will have at least a one-character common prefix.
137
138Also we delay the addition of grouping parenthesis as long as possible
139until we're sure we need them, and try to remove one-character sequences
140so we can use character sets rather than grouping parenthesis."
141 (let* ((open-group (cond ((stringp paren) paren) (paren "\\(?:") (t "")))
56329bc5
RS
142 (close-group (if paren "\\)" ""))
143 (open-charset (if lax "" open-group))
c0056275 144 (close-charset (if lax "" close-group)))
56329bc5 145 (cond
b02b54a8
GM
146 ;;
147 ;; If there are no strings, just return the empty string.
148 ((= (length strings) 0)
149 "")
150 ;;
56329bc5
RS
151 ;; If there is only one string, just return it.
152 ((= (length strings) 1)
153 (if (= (length (car strings)) 1)
154 (concat open-charset (regexp-quote (car strings)) close-charset)
155 (concat open-group (regexp-quote (car strings)) close-group)))
156 ;;
157 ;; If there is an empty string, remove it and recurse on the rest.
158 ((= (length (car strings)) 0)
159 (concat open-charset
160 (regexp-opt-group (cdr strings) t t) "?"
161 close-charset))
162 ;;
c0056275
SM
163 ;; If there are several one-char strings, use charsets
164 ((and (= (length (car strings)) 1)
165 (let ((strs (cdr strings)))
166 (while (and strs (/= (length (car strs)) 1))
167 (pop strs))
168 strs))
169 (let (letters rest)
170 ;; Collect one-char strings
171 (dolist (s strings)
172 (if (= (length s) 1) (push s letters) (push s rest)))
173
174 (if rest
175 ;; several one-char strings: take them and recurse
176 ;; on the rest (first so as to match the longest).
177 (concat open-group
178 (regexp-opt-group (nreverse rest))
179 "\\|" (regexp-opt-charset letters)
180 close-group)
181 ;; all are one-char strings: just return a character set.
182 (concat open-charset
183 (regexp-opt-charset letters)
184 close-charset))))
56329bc5
RS
185 ;;
186 ;; We have a list of different length strings.
187 (t
c0056275
SM
188 (let ((prefix (try-completion "" (mapcar 'list strings))))
189 (if (> (length prefix) 0)
190 ;; common prefix: take it and recurse on the suffixes.
191 (let* ((n (length prefix))
192 (suffixes (mapcar (lambda (s) (substring s n)) strings)))
193 (concat open-charset
194 (regexp-quote prefix)
195 (regexp-opt-group suffixes t t)
196 close-charset))
197
198 (let* ((sgnirts (mapcar (lambda (s)
199 (concat (nreverse (string-to-list s))))
200 strings))
201 (xiffus (try-completion "" (mapcar 'list sgnirts))))
202 (if (> (length xiffus) 0)
203 ;; common suffix: take it and recurse on the prefixes.
204 (let* ((n (- (length xiffus)))
205 (prefixes (mapcar (lambda (s) (substring s 0 n)) strings)))
206 (concat open-charset
207 (regexp-opt-group prefixes t t)
208 (regexp-quote
209 (concat (nreverse (string-to-list xiffus))))
210 close-charset))
211
212 ;; Otherwise, divide the list into those that start with a
213 ;; particular letter and those that do not, and recurse on them.
214 (let* ((char (char-to-string (string-to-char (car strings))))
215 (half1 (all-completions char (mapcar 'list strings)))
216 (half2 (nthcdr (length half1) strings)))
217 (concat open-group
218 (regexp-opt-group half1)
219 "\\|" (regexp-opt-group half2)
220 close-group))))))))))
221
56329bc5
RS
222
223(defun regexp-opt-charset (chars)
224 ;;
225 ;; Return a regexp to match a character in CHARS.
226 ;;
227 ;; The basic idea is to find character ranges. Also we take care in the
228 ;; position of character set meta characters in the character set regexp.
229 ;;
230 (let* ((charwidth 256) ; Yeah, right.
231 (charmap (make-bool-vector charwidth nil))
232 (charset "")
233 (bracket "") (dash "") (caret ""))
234 ;;
235 ;; Make a character map but extract character set meta characters.
25544ce1
SM
236 (dolist (char (mapcar 'string-to-char chars))
237 (case char
238 (?\]
239 (setq bracket "]"))
240 (?^
241 (setq caret "^"))
242 (?-
243 (setq dash "-"))
244 (otherwise
245 (aset charmap char t))))
56329bc5
RS
246 ;;
247 ;; Make a character set from the map using ranges where applicable.
9b51ba9e
SM
248 (dotimes (char charwidth)
249 (let ((start char))
250 (while (and (< char charwidth) (aref charmap char))
251 (incf char))
252 (cond ((> char (+ start 3))
253 (setq charset (format "%s%c-%c" charset start (1- char))))
254 ((> char start)
255 (setq charset (format "%s%c" charset (setq char start)))))))
56329bc5
RS
256 ;;
257 ;; Make sure a caret is not first and a dash is first or last.
258 (if (and (string-equal charset "") (string-equal bracket ""))
259 (concat "[" dash caret "]")
260 (concat "[" bracket charset caret dash "]"))))
261
262(provide 'regexp-opt)
263
264;;; regexp-opt.el ends here