Sync to HEAD
[bpt/emacs.git] / lisp / emacs-lisp / sregex.el
1 ;;; sregex.el --- symbolic regular expressions
2
3 ;; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
4
5 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
6 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
7 ;; Keywords: extensions
8
9 ;; This file is part of GNU Emacs.
10
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
14 ;; any later version.
15
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
20
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 ;; Boston, MA 02111-1307, USA.
25
26 ;;; Commentary:
27
28 ;; This package allows you to write regular expressions using a
29 ;; totally new, Lisp-like syntax.
30
31 ;; A "symbolic regular expression" (sregex for short) is a Lisp form
32 ;; that, when evaluated, produces the string form of the specified
33 ;; regular expression. Here's a simple example:
34
35 ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert"
36
37 ;; As you can see, an sregex is specified by placing one or more
38 ;; special clauses in a call to `sregexq'. The clause in this case is
39 ;; the `or' of two strings (not to be confused with the Lisp function
40 ;; `or'). The list of allowable clauses appears below.
41
42 ;; With sregex, it is never necessary to "escape" magic characters
43 ;; that are meant to be taken literally; that happens automatically.
44 ;; For example:
45
46 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
47
48 ;; It is also unnecessary to "group" parts of the expression together
49 ;; to overcome operator precedence; that also happens automatically.
50 ;; For example:
51
52 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?"
53
54 ;; It *is* possible to group parts of the expression in order to refer
55 ;; to them with numbered backreferences:
56
57 ;; (sregexq (group (or "Go" "Run"))
58 ;; ", Spot, "
59 ;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1"
60
61 ;; `sregexq' is a macro. Each time it is used, it constructs a simple
62 ;; Lisp expression that then invokes a moderately complex engine to
63 ;; interpret the sregex and render the string form. Because of this,
64 ;; I don't recommend sprinkling calls to `sregexq' throughout your
65 ;; code, the way one normally does with string regexes (which are
66 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes
67 ;; you need wherever possible instead of repeatedly constructing the
68 ;; same ones over and over. Example:
69
70 ;; (let ((field-regex (sregexq (opt "resent-")
71 ;; (or "to" "cc" "bcc"))))
72 ;; ...
73 ;; (while ...
74 ;; ...
75 ;; (re-search-forward field-regex ...)
76 ;; ...))
77
78 ;; The arguments to `sregexq' are automatically quoted, but the
79 ;; flipside of this is that it is not straightforward to include
80 ;; computed (i.e., non-constant) values in `sregexq' expressions. So
81 ;; `sregex' is a function that is like `sregexq' but which does not
82 ;; automatically quote its values. Literal sregex clauses must be
83 ;; explicitly quoted like so:
84
85 ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert"
86
87 ;; but computed clauses can be included easily, allowing for the reuse
88 ;; of common clauses:
89
90 ;; (let ((dotstar '(0+ any))
91 ;; (whitespace '(1+ (syntax ?-)))
92 ;; (digits '(1+ (char (?0 . ?9)))))
93 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
94
95 ;; To use this package in a Lisp program, simply (require 'sregex).
96
97 ;; Here are the clauses allowed in an `sregex' or `sregexq'
98 ;; expression:
99
100 ;; - a string
101 ;; This stands for the literal string. If it contains
102 ;; metacharacters, they will be escaped in the resulting regex
103 ;; (using `regexp-quote').
104
105 ;; - the symbol `any'
106 ;; This stands for ".", a regex matching any character except
107 ;; newline.
108
109 ;; - the symbol `bol'
110 ;; Stands for "^", matching the empty string at the beginning of a line
111
112 ;; - the symbol `eol'
113 ;; Stands for "$", matching the empty string at the end of a line
114
115 ;; - (group CLAUSE ...)
116 ;; Groups the given CLAUSEs using "\\(" and "\\)".
117
118 ;; - (sequence CLAUSE ...)
119
120 ;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)".
121 ;; Clauses grouped by `sequence' do not count for purposes of
122 ;; numbering backreferences. Use `sequence' in situations like
123 ;; this:
124
125 ;; (sregexq (or "dog" "cat"
126 ;; (sequence (opt "sea ") "monkey")))
127 ;; => "dog\\|cat\\|\\(?:sea \\)?monkey"
128
129 ;; where a single `or' alternate needs to contain multiple
130 ;; subclauses.
131
132 ;; - (backref N)
133 ;; Matches the same string previously matched by the Nth "group" in
134 ;; the same sregex. N is a positive integer.
135
136 ;; - (or CLAUSE ...)
137 ;; Matches any one of the CLAUSEs by separating them with "\\|".
138
139 ;; - (0+ CLAUSE ...)
140 ;; Concatenates the given CLAUSEs and matches zero or more
141 ;; occurrences by appending "*".
142
143 ;; - (1+ CLAUSE ...)
144 ;; Concatenates the given CLAUSEs and matches one or more
145 ;; occurrences by appending "+".
146
147 ;; - (opt CLAUSE ...)
148 ;; Concatenates the given CLAUSEs and matches zero or one occurrence
149 ;; by appending "?".
150
151 ;; - (repeat MIN MAX CLAUSE ...)
152 ;; Concatenates the given CLAUSEs and constructs a regex matching at
153 ;; least MIN occurrences and at most MAX occurrences. MIN must be a
154 ;; non-negative integer. MAX must be a non-negative integer greater
155 ;; than or equal to MIN; or MAX can be nil to mean "infinity."
156
157 ;; - (char CHAR-CLAUSE ...)
158 ;; Creates a "character class" matching one character from the given
159 ;; set. See below for how to construct a CHAR-CLAUSE.
160
161 ;; - (not-char CHAR-CLAUSE ...)
162 ;; Creates a "character class" matching any one character not in the
163 ;; given set. See below for how to construct a CHAR-CLAUSE.
164
165 ;; - the symbol `bot'
166 ;; Stands for "\\`", matching the empty string at the beginning of
167 ;; text (beginning of a string or of a buffer).
168
169 ;; - the symbol `eot'
170 ;; Stands for "\\'", matching the empty string at the end of text.
171
172 ;; - the symbol `point'
173 ;; Stands for "\\=", matching the empty string at point.
174
175 ;; - the symbol `word-boundary'
176 ;; Stands for "\\b", matching the empty string at the beginning or
177 ;; end of a word.
178
179 ;; - the symbol `not-word-boundary'
180 ;; Stands for "\\B", matching the empty string not at the beginning
181 ;; or end of a word.
182
183 ;; - the symbol `bow'
184 ;; Stands for "\\<", matching the empty string at the beginning of a
185 ;; word.
186
187 ;; - the symbol `eow'
188 ;; Stands for "\\>", matching the empty string at the end of a word.
189
190 ;; - the symbol `wordchar'
191 ;; Stands for the regex "\\w", matching a word-constituent character
192 ;; (as determined by the current syntax table)
193
194 ;; - the symbol `not-wordchar'
195 ;; Stands for the regex "\\W", matching a non-word-constituent
196 ;; character.
197
198 ;; - (syntax CODE)
199 ;; Stands for the regex "\\sCODE", where CODE is a syntax table code
200 ;; (a single character). Matches any character with the requested
201 ;; syntax.
202
203 ;; - (not-syntax CODE)
204 ;; Stands for the regex "\\SCODE", where CODE is a syntax table code
205 ;; (a single character). Matches any character without the
206 ;; requested syntax.
207
208 ;; - (regex REGEX)
209 ;; This is a "trapdoor" for including ordinary regular expression
210 ;; strings in the result. Some regular expressions are clearer when
211 ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
212 ;; instance. However, see the note under "Bugs," below.
213
214 ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
215 ;; has one of the following forms:
216
217 ;; - a character
218 ;; Adds that character to the set.
219
220 ;; - a string
221 ;; Adds all the characters in the string to the set.
222
223 ;; - A pair (MIN . MAX)
224 ;; Where MIN and MAX are characters, adds the range of characters
225 ;; from MIN through MAX to the set.
226
227 ;;; To do:
228
229 ;; An earlier version of this package could optionally translate the
230 ;; symbolic regex into other languages' syntaxes, e.g. Perl. For
231 ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
232 ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore
233 ;; such a facility.
234
235 ;; - handle multibyte chars in sregex--char-aux
236 ;; - add support for character classes ([:blank:], ...)
237 ;; - add support for non-greedy operators *? and +?
238 ;; - bug: (sregexq (opt (opt ?a))) returns "a??" which is a non-greedy "a?"
239
240 ;;; Bugs:
241
242 ;;; Code:
243
244 (eval-when-compile (require 'cl))
245
246 ;; Compatibility code for when we didn't have shy-groups
247 (defvar sregex--current-sregex nil)
248 (defun sregex-info () nil)
249 (defmacro sregex-save-match-data (&rest forms) (cons 'save-match-data forms))
250 (defun sregex-replace-match (r &optional f l str subexp x)
251 (replace-match r f l str subexp))
252 (defun sregex-match-string (c &optional i x) (match-string c i))
253 (defun sregex-match-string-no-properties (count &optional in-string sregex)
254 (match-string-no-properties count in-string))
255 (defun sregex-match-beginning (count &optional sregex) (match-beginning count))
256 (defun sregex-match-end (count &optional sregex) (match-end count))
257 (defun sregex-match-data (&optional sregex) (match-data))
258 (defun sregex-backref-num (n &optional sregex) n)
259
260
261 (defun sregex (&rest exps)
262 "Symbolic regular expression interpreter.
263 This is exactly like `sregexq' (q.v.) except that it evaluates all its
264 arguments, so literal sregex clauses must be quoted. For example:
265
266 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
267
268 An argument-evaluating sregex interpreter lets you reuse sregex
269 subexpressions:
270
271 (let ((dotstar '(0+ any))
272 (whitespace '(1+ (syntax ?-)))
273 (digits '(1+ (char (?0 . ?9)))))
274 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\""
275 (sregex--sequence exps nil))
276
277 (defmacro sregexq (&rest exps)
278 "Symbolic regular expression interpreter.
279 This macro allows you to specify a regular expression (regexp) in
280 symbolic form, and converts it into the string form required by Emacs's
281 regex functions such as `re-search-forward' and `looking-at'. Here is
282 a simple example:
283
284 (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
285
286 As you can see, an sregex is specified by placing one or more special
287 clauses in a call to `sregexq'. The clause in this case is the `or'
288 of two strings (not to be confused with the Lisp function `or'). The
289 list of allowable clauses appears below.
290
291 With `sregex', it is never necessary to \"escape\" magic characters
292 that are meant to be taken literally; that happens automatically.
293 For example:
294
295 (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\"
296
297 It is also unnecessary to \"group\" parts of the expression together
298 to overcome operator precedence; that also happens automatically.
299 For example:
300
301 (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\"
302
303 It *is* possible to group parts of the expression in order to refer
304 to them with numbered backreferences:
305
306 (sregexq (group (or \"Go\" \"Run\"))
307 \", Spot, \"
308 (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"
309
310 If `sregexq' needs to introduce its own grouping parentheses, it will
311 automatically renumber your backreferences:
312
313 (sregexq (opt \"resent-\")
314 (group (or \"to\" \"cc\" \"bcc\"))
315 \": \"
316 (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\"
317
318 `sregexq' is a macro. Each time it is used, it constructs a simple
319 Lisp expression that then invokes a moderately complex engine to
320 interpret the sregex and render the string form. Because of this, I
321 don't recommend sprinkling calls to `sregexq' throughout your code,
322 the way one normally does with string regexes (which are cheap to
323 evaluate). Instead, it's wiser to precompute the regexes you need
324 wherever possible instead of repeatedly constructing the same ones
325 over and over. Example:
326
327 (let ((field-regex (sregexq (opt \"resent-\")
328 (or \"to\" \"cc\" \"bcc\"))))
329 ...
330 (while ...
331 ...
332 (re-search-forward field-regex ...)
333 ...))
334
335 The arguments to `sregexq' are automatically quoted, but the
336 flipside of this is that it is not straightforward to include
337 computed (i.e., non-constant) values in `sregexq' expressions. So
338 `sregex' is a function that is like `sregexq' but which does not
339 automatically quote its values. Literal sregex clauses must be
340 explicitly quoted like so:
341
342 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
343
344 but computed clauses can be included easily, allowing for the reuse
345 of common clauses:
346
347 (let ((dotstar '(0+ any))
348 (whitespace '(1+ (syntax ?-)))
349 (digits '(1+ (char (?0 . ?9)))))
350 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"
351
352 Here are the clauses allowed in an `sregex' or `sregexq' expression:
353
354 - a string
355 This stands for the literal string. If it contains
356 metacharacters, they will be escaped in the resulting regex
357 (using `regexp-quote').
358
359 - the symbol `any'
360 This stands for \".\", a regex matching any character except
361 newline.
362
363 - the symbol `bol'
364 Stands for \"^\", matching the empty string at the beginning of a line
365
366 - the symbol `eol'
367 Stands for \"$\", matching the empty string at the end of a line
368
369 - (group CLAUSE ...)
370 Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".
371
372 - (sequence CLAUSE ...)
373
374 Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\".
375 Clauses grouped by `sequence' do not count for purposes of
376 numbering backreferences. Use `sequence' in situations like
377 this:
378
379 (sregexq (or \"dog\" \"cat\"
380 (sequence (opt \"sea \") \"monkey\")))
381 => \"dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\"
382
383 where a single `or' alternate needs to contain multiple
384 subclauses.
385
386 - (backref N)
387 Matches the same string previously matched by the Nth \"group\" in
388 the same sregex. N is a positive integer.
389
390 - (or CLAUSE ...)
391 Matches any one of the CLAUSEs by separating them with \"\\\\|\".
392
393 - (0+ CLAUSE ...)
394 Concatenates the given CLAUSEs and matches zero or more
395 occurrences by appending \"*\".
396
397 - (1+ CLAUSE ...)
398 Concatenates the given CLAUSEs and matches one or more
399 occurrences by appending \"+\".
400
401 - (opt CLAUSE ...)
402 Concatenates the given CLAUSEs and matches zero or one occurrence
403 by appending \"?\".
404
405 - (repeat MIN MAX CLAUSE ...)
406 Concatenates the given CLAUSEs and constructs a regex matching at
407 least MIN occurrences and at most MAX occurrences. MIN must be a
408 non-negative integer. MAX must be a non-negative integer greater
409 than or equal to MIN; or MAX can be nil to mean \"infinity.\"
410
411 - (char CHAR-CLAUSE ...)
412 Creates a \"character class\" matching one character from the given
413 set. See below for how to construct a CHAR-CLAUSE.
414
415 - (not-char CHAR-CLAUSE ...)
416 Creates a \"character class\" matching any one character not in the
417 given set. See below for how to construct a CHAR-CLAUSE.
418
419 - the symbol `bot'
420 Stands for \"\\\\`\", matching the empty string at the beginning of
421 text (beginning of a string or of a buffer).
422
423 - the symbol `eot'
424 Stands for \"\\\\'\", matching the empty string at the end of text.
425
426 - the symbol `point'
427 Stands for \"\\\\=\", matching the empty string at point.
428
429 - the symbol `word-boundary'
430 Stands for \"\\\\b\", matching the empty string at the beginning or
431 end of a word.
432
433 - the symbol `not-word-boundary'
434 Stands for \"\\\\B\", matching the empty string not at the beginning
435 or end of a word.
436
437 - the symbol `bow'
438 Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
439 word.
440
441 - the symbol `eow'
442 Stands for \"\\\\\\=>\", matching the empty string at the end of a word.
443
444 - the symbol `wordchar'
445 Stands for the regex \"\\\\w\", matching a word-constituent character
446 (as determined by the current syntax table)
447
448 - the symbol `not-wordchar'
449 Stands for the regex \"\\\\W\", matching a non-word-constituent
450 character.
451
452 - (syntax CODE)
453 Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
454 (a single character). Matches any character with the requested
455 syntax.
456
457 - (not-syntax CODE)
458 Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
459 (a single character). Matches any character without the
460 requested syntax.
461
462 - (regex REGEX)
463 This is a \"trapdoor\" for including ordinary regular expression
464 strings in the result. Some regular expressions are clearer when
465 written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
466 instance.
467
468 Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
469 has one of the following forms:
470
471 - a character
472 Adds that character to the set.
473
474 - a string
475 Adds all the characters in the string to the set.
476
477 - A pair (MIN . MAX)
478 Where MIN and MAX are characters, adds the range of characters
479 from MIN through MAX to the set."
480 `(apply 'sregex ',exps))
481
482 (defun sregex--engine (exp combine)
483 (cond
484 ((stringp exp)
485 (if (and combine
486 (eq combine 'suffix)
487 (/= (length exp) 1))
488 (concat "\\(?:" (regexp-quote exp) "\\)")
489 (regexp-quote exp)))
490 ((symbolp exp)
491 (ecase exp
492 (any ".")
493 (bol "^")
494 (eol "$")
495 (wordchar "\\w")
496 (not-wordchar "\\W")
497 (bot "\\`")
498 (eot "\\'")
499 (point "\\=")
500 (word-boundary "\\b")
501 (not-word-boundary "\\B")
502 (bow "\\<")
503 (eow "\\>")))
504 ((consp exp)
505 (funcall (intern (concat "sregex--"
506 (symbol-name (car exp))))
507 (cdr exp)
508 combine))
509 (t (error "Invalid expression: %s" exp))))
510
511 (defun sregex--sequence (exps combine)
512 (if (= (length exps) 1) (sregex--engine (car exps) combine)
513 (let ((re (mapconcat
514 (lambda (e) (sregex--engine e 'concat))
515 exps "")))
516 (if (eq combine 'suffix)
517 (concat "\\(?:" re "\\)")
518 re))))
519
520 (defun sregex--or (exps combine)
521 (if (= (length exps) 1) (sregex--engine (car exps) combine)
522 (let ((re (mapconcat
523 (lambda (e) (sregex--engine e 'or))
524 exps "\\|")))
525 (if (not (eq combine 'or))
526 (concat "\\(?:" re "\\)")
527 re))))
528
529 (defun sregex--group (exps combine) (concat "\\(" (sregex--sequence exps nil) "\\)"))
530
531 (defun sregex--backref (exps combine) (concat "\\" (int-to-string (car exps))))
532 (defun sregex--opt (exps combine) (concat (sregex--sequence exps 'suffix) "?"))
533 (defun sregex--0+ (exps combine) (concat (sregex--sequence exps 'suffix) "*"))
534 (defun sregex--1+ (exps combine) (concat (sregex--sequence exps 'suffix) "+"))
535
536 (defun sregex--char (exps combine) (sregex--char-aux nil exps))
537 (defun sregex--not-char (exps combine) (sregex--char-aux t exps))
538
539 (defun sregex--syntax (exps combine) (format "\\s%c" (car exps)))
540 (defun sregex--not-syntax (exps combine) (format "\\S%c" (car exps)))
541
542 (defun sregex--regex (exps combine)
543 (if combine (concat "\\(?:" (car exps) "\\)") (car exps)))
544
545 (defun sregex--repeat (exps combine)
546 (let* ((min (or (pop exps) 0))
547 (minstr (number-to-string min))
548 (max (pop exps)))
549 (concat (sregex--sequence exps 'suffix)
550 (concat "\\{" minstr ","
551 (when max (number-to-string max)) "\\}"))))
552
553 (defun sregex--char-range (start end)
554 (let ((startc (char-to-string start))
555 (endc (char-to-string end)))
556 (cond
557 ((> end (+ start 2)) (concat startc "-" endc))
558 ((> end (+ start 1)) (concat startc (char-to-string (1+ start)) endc))
559 ((> end start) (concat startc endc))
560 (t startc))))
561
562 (defun sregex--char-aux (complement args)
563 ;; regex-opt does the same, we should join effort.
564 (let ((chars (make-bool-vector 256 nil))) ; Yeah, right!
565 (dolist (arg args)
566 (cond ((integerp arg) (aset chars arg t))
567 ((stringp arg) (mapcar (lambda (c) (aset chars c t)) arg))
568 ((consp arg)
569 (let ((start (car arg))
570 (end (cdr arg)))
571 (when (> start end)
572 (let ((tmp start)) (setq start end) (setq end tmp)))
573 ;; now start <= end
574 (let ((i start))
575 (while (<= i end)
576 (aset chars i t)
577 (setq i (1+ i))))))))
578 ;; now chars is a map of the characters in the class
579 (let ((caret (aref chars ?^))
580 (dash (aref chars ?-))
581 (class (if (aref chars ?\]) "]" "")))
582 (aset chars ?^ nil)
583 (aset chars ?- nil)
584 (aset chars ?\] nil)
585
586 (let (start end)
587 (dotimes (i 256)
588 (if (aref chars i)
589 (progn
590 (unless start (setq start i))
591 (setq end i)
592 (aset chars i nil))
593 (when start
594 (setq class (concat class (sregex--char-range start end)))
595 (setq start nil))))
596 (if start
597 (setq class (concat class (sregex--char-range start end)))))
598
599 (if (> (length class) 0)
600 (setq class (concat class (if caret "^") (if dash "-")))
601 (setq class (concat class (if dash "-") (if caret "^"))))
602 (if (and (not complement) (= (length class) 1))
603 (regexp-quote class)
604 (concat "[" (if complement "^") class "]")))))
605
606 (provide 'sregex)
607
608 ;;; arch-tag: 460c1f5a-eb6e-42ec-a451-ffac78bdf492
609 ;;; sregex.el ends here