Commit | Line | Data |
---|---|---|
e65f17fe DN |
1 | ;;; sregex.el --- symbolic regular expressions |
2 | ||
d59c3137 | 3 | ;; Copyright (C) 1997, 1998, 2000, 2001, 2002, 2003, 2004, |
5df4f04c | 4 | ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. |
e65f17fe DN |
5 | |
6 | ;; Author: Bob Glickstein <bobg+sregex@zanshin.com> | |
7 | ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com> | |
c2413c67 | 8 | ;; Keywords: extensions |
723ee192 | 9 | ;; Obsolete-since: 24.1 |
e65f17fe DN |
10 | |
11 | ;; This file is part of GNU Emacs. | |
12 | ||
d6cba7ae | 13 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
e65f17fe | 14 | ;; it under the terms of the GNU General Public License as published by |
d6cba7ae GM |
15 | ;; the Free Software Foundation, either version 3 of the License, or |
16 | ;; (at your option) any later version. | |
e65f17fe DN |
17 | |
18 | ;; GNU Emacs is distributed in the hope that it will be useful, | |
19 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 | ;; GNU General Public License for more details. | |
22 | ||
23 | ;; You should have received a copy of the GNU General Public License | |
d6cba7ae | 24 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
e65f17fe DN |
25 | |
26 | ;;; Commentary: | |
27 | ||
28 | ;; This package allows you to write regular expressions using a | |
29 | ;; totally new, Lisp-like syntax. | |
30 | ||
31 | ;; A "symbolic regular expression" (sregex for short) is a Lisp form | |
32 | ;; that, when evaluated, produces the string form of the specified | |
33 | ;; regular expression. Here's a simple example: | |
34 | ||
35 | ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert" | |
36 | ||
37 | ;; As you can see, an sregex is specified by placing one or more | |
38 | ;; special clauses in a call to `sregexq'. The clause in this case is | |
39 | ;; the `or' of two strings (not to be confused with the Lisp function | |
40 | ;; `or'). The list of allowable clauses appears below. | |
41 | ||
42 | ;; With sregex, it is never necessary to "escape" magic characters | |
43 | ;; that are meant to be taken literally; that happens automatically. | |
44 | ;; For example: | |
45 | ||
46 | ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H" | |
47 | ||
48 | ;; It is also unnecessary to "group" parts of the expression together | |
49 | ;; to overcome operator precedence; that also happens automatically. | |
50 | ;; For example: | |
51 | ||
40aeecad | 52 | ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?" |
e65f17fe DN |
53 | |
54 | ;; It *is* possible to group parts of the expression in order to refer | |
55 | ;; to them with numbered backreferences: | |
56 | ||
57 | ;; (sregexq (group (or "Go" "Run")) | |
58 | ;; ", Spot, " | |
59 | ;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1" | |
60 | ||
e65f17fe DN |
61 | ;; `sregexq' is a macro. Each time it is used, it constructs a simple |
62 | ;; Lisp expression that then invokes a moderately complex engine to | |
63 | ;; interpret the sregex and render the string form. Because of this, | |
64 | ;; I don't recommend sprinkling calls to `sregexq' throughout your | |
65 | ;; code, the way one normally does with string regexes (which are | |
66 | ;; cheap to evaluate). Instead, it's wiser to precompute the regexes | |
67 | ;; you need wherever possible instead of repeatedly constructing the | |
68 | ;; same ones over and over. Example: | |
69 | ||
70 | ;; (let ((field-regex (sregexq (opt "resent-") | |
71 | ;; (or "to" "cc" "bcc")))) | |
72 | ;; ... | |
73 | ;; (while ... | |
74 | ;; ... | |
75 | ;; (re-search-forward field-regex ...) | |
76 | ;; ...)) | |
77 | ||
78 | ;; The arguments to `sregexq' are automatically quoted, but the | |
79 | ;; flipside of this is that it is not straightforward to include | |
80 | ;; computed (i.e., non-constant) values in `sregexq' expressions. So | |
81 | ;; `sregex' is a function that is like `sregexq' but which does not | |
82 | ;; automatically quote its values. Literal sregex clauses must be | |
83 | ;; explicitly quoted like so: | |
84 | ||
85 | ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert" | |
86 | ||
87 | ;; but computed clauses can be included easily, allowing for the reuse | |
88 | ;; of common clauses: | |
89 | ||
90 | ;; (let ((dotstar '(0+ any)) | |
91 | ;; (whitespace '(1+ (syntax ?-))) | |
92 | ;; (digits '(1+ (char (?0 . ?9))))) | |
93 | ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+" | |
94 | ||
e65f17fe DN |
95 | ;; To use this package in a Lisp program, simply (require 'sregex). |
96 | ||
97 | ;; Here are the clauses allowed in an `sregex' or `sregexq' | |
98 | ;; expression: | |
99 | ||
100 | ;; - a string | |
101 | ;; This stands for the literal string. If it contains | |
102 | ;; metacharacters, they will be escaped in the resulting regex | |
103 | ;; (using `regexp-quote'). | |
104 | ||
105 | ;; - the symbol `any' | |
106 | ;; This stands for ".", a regex matching any character except | |
107 | ;; newline. | |
108 | ||
109 | ;; - the symbol `bol' | |
110 | ;; Stands for "^", matching the empty string at the beginning of a line | |
111 | ||
112 | ;; - the symbol `eol' | |
113 | ;; Stands for "$", matching the empty string at the end of a line | |
114 | ||
115 | ;; - (group CLAUSE ...) | |
116 | ;; Groups the given CLAUSEs using "\\(" and "\\)". | |
117 | ||
118 | ;; - (sequence CLAUSE ...) | |
119 | ||
40aeecad SM |
120 | ;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)". |
121 | ;; Clauses grouped by `sequence' do not count for purposes of | |
e65f17fe DN |
122 | ;; numbering backreferences. Use `sequence' in situations like |
123 | ;; this: | |
124 | ||
125 | ;; (sregexq (or "dog" "cat" | |
126 | ;; (sequence (opt "sea ") "monkey"))) | |
40aeecad | 127 | ;; => "dog\\|cat\\|\\(?:sea \\)?monkey" |
e65f17fe DN |
128 | |
129 | ;; where a single `or' alternate needs to contain multiple | |
130 | ;; subclauses. | |
131 | ||
132 | ;; - (backref N) | |
133 | ;; Matches the same string previously matched by the Nth "group" in | |
40aeecad | 134 | ;; the same sregex. N is a positive integer. |
e65f17fe DN |
135 | |
136 | ;; - (or CLAUSE ...) | |
137 | ;; Matches any one of the CLAUSEs by separating them with "\\|". | |
138 | ||
139 | ;; - (0+ CLAUSE ...) | |
140 | ;; Concatenates the given CLAUSEs and matches zero or more | |
141 | ;; occurrences by appending "*". | |
142 | ||
143 | ;; - (1+ CLAUSE ...) | |
144 | ;; Concatenates the given CLAUSEs and matches one or more | |
145 | ;; occurrences by appending "+". | |
146 | ||
147 | ;; - (opt CLAUSE ...) | |
148 | ;; Concatenates the given CLAUSEs and matches zero or one occurrence | |
149 | ;; by appending "?". | |
150 | ||
151 | ;; - (repeat MIN MAX CLAUSE ...) | |
152 | ;; Concatenates the given CLAUSEs and constructs a regex matching at | |
153 | ;; least MIN occurrences and at most MAX occurrences. MIN must be a | |
154 | ;; non-negative integer. MAX must be a non-negative integer greater | |
155 | ;; than or equal to MIN; or MAX can be nil to mean "infinity." | |
156 | ||
157 | ;; - (char CHAR-CLAUSE ...) | |
158 | ;; Creates a "character class" matching one character from the given | |
159 | ;; set. See below for how to construct a CHAR-CLAUSE. | |
160 | ||
161 | ;; - (not-char CHAR-CLAUSE ...) | |
162 | ;; Creates a "character class" matching any one character not in the | |
163 | ;; given set. See below for how to construct a CHAR-CLAUSE. | |
164 | ||
165 | ;; - the symbol `bot' | |
166 | ;; Stands for "\\`", matching the empty string at the beginning of | |
167 | ;; text (beginning of a string or of a buffer). | |
168 | ||
169 | ;; - the symbol `eot' | |
170 | ;; Stands for "\\'", matching the empty string at the end of text. | |
171 | ||
172 | ;; - the symbol `point' | |
173 | ;; Stands for "\\=", matching the empty string at point. | |
174 | ||
175 | ;; - the symbol `word-boundary' | |
176 | ;; Stands for "\\b", matching the empty string at the beginning or | |
177 | ;; end of a word. | |
178 | ||
179 | ;; - the symbol `not-word-boundary' | |
180 | ;; Stands for "\\B", matching the empty string not at the beginning | |
181 | ;; or end of a word. | |
182 | ||
183 | ;; - the symbol `bow' | |
184 | ;; Stands for "\\<", matching the empty string at the beginning of a | |
185 | ;; word. | |
186 | ||
187 | ;; - the symbol `eow' | |
188 | ;; Stands for "\\>", matching the empty string at the end of a word. | |
189 | ||
190 | ;; - the symbol `wordchar' | |
191 | ;; Stands for the regex "\\w", matching a word-constituent character | |
192 | ;; (as determined by the current syntax table) | |
193 | ||
194 | ;; - the symbol `not-wordchar' | |
195 | ;; Stands for the regex "\\W", matching a non-word-constituent | |
196 | ;; character. | |
197 | ||
198 | ;; - (syntax CODE) | |
199 | ;; Stands for the regex "\\sCODE", where CODE is a syntax table code | |
200 | ;; (a single character). Matches any character with the requested | |
201 | ;; syntax. | |
202 | ||
203 | ;; - (not-syntax CODE) | |
204 | ;; Stands for the regex "\\SCODE", where CODE is a syntax table code | |
205 | ;; (a single character). Matches any character without the | |
206 | ;; requested syntax. | |
207 | ||
208 | ;; - (regex REGEX) | |
209 | ;; This is a "trapdoor" for including ordinary regular expression | |
210 | ;; strings in the result. Some regular expressions are clearer when | |
211 | ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for | |
212 | ;; instance. However, see the note under "Bugs," below. | |
213 | ||
214 | ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) | |
215 | ;; has one of the following forms: | |
216 | ||
217 | ;; - a character | |
218 | ;; Adds that character to the set. | |
219 | ||
220 | ;; - a string | |
221 | ;; Adds all the characters in the string to the set. | |
222 | ||
223 | ;; - A pair (MIN . MAX) | |
224 | ;; Where MIN and MAX are characters, adds the range of characters | |
225 | ;; from MIN through MAX to the set. | |
226 | ||
227 | ;;; To do: | |
228 | ||
e65f17fe DN |
229 | ;; An earlier version of this package could optionally translate the |
230 | ;; symbolic regex into other languages' syntaxes, e.g. Perl. For | |
231 | ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would | |
232 | ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore | |
233 | ;; such a facility. | |
234 | ||
40aeecad SM |
235 | ;; - handle multibyte chars in sregex--char-aux |
236 | ;; - add support for character classes ([:blank:], ...) | |
237 | ;; - add support for non-greedy operators *? and +? | |
238 | ;; - bug: (sregexq (opt (opt ?a))) returns "a??" which is a non-greedy "a?" | |
e65f17fe | 239 | |
40aeecad | 240 | ;;; Bugs: |
e65f17fe DN |
241 | |
242 | ;;; Code: | |
243 | ||
40aeecad | 244 | (eval-when-compile (require 'cl)) |
e65f17fe | 245 | |
40aeecad SM |
246 | ;; Compatibility code for when we didn't have shy-groups |
247 | (defvar sregex--current-sregex nil) | |
248 | (defun sregex-info () nil) | |
249 | (defmacro sregex-save-match-data (&rest forms) (cons 'save-match-data forms)) | |
250 | (defun sregex-replace-match (r &optional f l str subexp x) | |
251 | (replace-match r f l str subexp)) | |
252 | (defun sregex-match-string (c &optional i x) (match-string c i)) | |
e65f17fe | 253 | (defun sregex-match-string-no-properties (count &optional in-string sregex) |
40aeecad SM |
254 | (match-string-no-properties count in-string)) |
255 | (defun sregex-match-beginning (count &optional sregex) (match-beginning count)) | |
256 | (defun sregex-match-end (count &optional sregex) (match-end count)) | |
257 | (defun sregex-match-data (&optional sregex) (match-data)) | |
258 | (defun sregex-backref-num (n &optional sregex) n) | |
259 | ||
e65f17fe DN |
260 | |
261 | (defun sregex (&rest exps) | |
262 | "Symbolic regular expression interpreter. | |
263 | This is exactly like `sregexq' (q.v.) except that it evaluates all its | |
264 | arguments, so literal sregex clauses must be quoted. For example: | |
265 | ||
266 | (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" | |
267 | ||
268 | An argument-evaluating sregex interpreter lets you reuse sregex | |
269 | subexpressions: | |
270 | ||
271 | (let ((dotstar '(0+ any)) | |
272 | (whitespace '(1+ (syntax ?-))) | |
273 | (digits '(1+ (char (?0 . ?9))))) | |
274 | (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"" | |
40aeecad | 275 | (sregex--sequence exps nil)) |
e65f17fe DN |
276 | |
277 | (defmacro sregexq (&rest exps) | |
278 | "Symbolic regular expression interpreter. | |
279 | This macro allows you to specify a regular expression (regexp) in | |
280 | symbolic form, and converts it into the string form required by Emacs's | |
281 | regex functions such as `re-search-forward' and `looking-at'. Here is | |
282 | a simple example: | |
283 | ||
284 | (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" | |
285 | ||
286 | As you can see, an sregex is specified by placing one or more special | |
287 | clauses in a call to `sregexq'. The clause in this case is the `or' | |
288 | of two strings (not to be confused with the Lisp function `or'). The | |
289 | list of allowable clauses appears below. | |
290 | ||
291 | With `sregex', it is never necessary to \"escape\" magic characters | |
292 | that are meant to be taken literally; that happens automatically. | |
293 | For example: | |
294 | ||
295 | (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\" | |
296 | ||
297 | It is also unnecessary to \"group\" parts of the expression together | |
298 | to overcome operator precedence; that also happens automatically. | |
299 | For example: | |
300 | ||
301 | (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\" | |
302 | ||
303 | It *is* possible to group parts of the expression in order to refer | |
304 | to them with numbered backreferences: | |
305 | ||
306 | (sregexq (group (or \"Go\" \"Run\")) | |
307 | \", Spot, \" | |
308 | (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\" | |
309 | ||
310 | If `sregexq' needs to introduce its own grouping parentheses, it will | |
311 | automatically renumber your backreferences: | |
312 | ||
313 | (sregexq (opt \"resent-\") | |
314 | (group (or \"to\" \"cc\" \"bcc\")) | |
315 | \": \" | |
316 | (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\" | |
317 | ||
318 | `sregexq' is a macro. Each time it is used, it constructs a simple | |
319 | Lisp expression that then invokes a moderately complex engine to | |
320 | interpret the sregex and render the string form. Because of this, I | |
321 | don't recommend sprinkling calls to `sregexq' throughout your code, | |
322 | the way one normally does with string regexes (which are cheap to | |
323 | evaluate). Instead, it's wiser to precompute the regexes you need | |
324 | wherever possible instead of repeatedly constructing the same ones | |
325 | over and over. Example: | |
326 | ||
327 | (let ((field-regex (sregexq (opt \"resent-\") | |
328 | (or \"to\" \"cc\" \"bcc\")))) | |
329 | ... | |
330 | (while ... | |
331 | ... | |
332 | (re-search-forward field-regex ...) | |
333 | ...)) | |
334 | ||
335 | The arguments to `sregexq' are automatically quoted, but the | |
336 | flipside of this is that it is not straightforward to include | |
337 | computed (i.e., non-constant) values in `sregexq' expressions. So | |
338 | `sregex' is a function that is like `sregexq' but which does not | |
339 | automatically quote its values. Literal sregex clauses must be | |
340 | explicitly quoted like so: | |
341 | ||
342 | (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\" | |
343 | ||
344 | but computed clauses can be included easily, allowing for the reuse | |
345 | of common clauses: | |
346 | ||
347 | (let ((dotstar '(0+ any)) | |
348 | (whitespace '(1+ (syntax ?-))) | |
349 | (digits '(1+ (char (?0 . ?9))))) | |
350 | (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\" | |
351 | ||
352 | Here are the clauses allowed in an `sregex' or `sregexq' expression: | |
353 | ||
354 | - a string | |
355 | This stands for the literal string. If it contains | |
356 | metacharacters, they will be escaped in the resulting regex | |
357 | (using `regexp-quote'). | |
358 | ||
359 | - the symbol `any' | |
360 | This stands for \".\", a regex matching any character except | |
361 | newline. | |
362 | ||
363 | - the symbol `bol' | |
364 | Stands for \"^\", matching the empty string at the beginning of a line | |
365 | ||
366 | - the symbol `eol' | |
367 | Stands for \"$\", matching the empty string at the end of a line | |
368 | ||
369 | - (group CLAUSE ...) | |
370 | Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\". | |
371 | ||
372 | - (sequence CLAUSE ...) | |
373 | ||
374 | Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\". | |
40aeecad | 375 | Clauses grouped by `sequence' do not count for purposes of |
e65f17fe DN |
376 | numbering backreferences. Use `sequence' in situations like |
377 | this: | |
378 | ||
379 | (sregexq (or \"dog\" \"cat\" | |
380 | (sequence (opt \"sea \") \"monkey\"))) | |
40aeecad | 381 | => \"dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\" |
e65f17fe DN |
382 | |
383 | where a single `or' alternate needs to contain multiple | |
384 | subclauses. | |
385 | ||
386 | - (backref N) | |
387 | Matches the same string previously matched by the Nth \"group\" in | |
40aeecad | 388 | the same sregex. N is a positive integer. |
e65f17fe DN |
389 | |
390 | - (or CLAUSE ...) | |
391 | Matches any one of the CLAUSEs by separating them with \"\\\\|\". | |
392 | ||
393 | - (0+ CLAUSE ...) | |
394 | Concatenates the given CLAUSEs and matches zero or more | |
395 | occurrences by appending \"*\". | |
396 | ||
397 | - (1+ CLAUSE ...) | |
398 | Concatenates the given CLAUSEs and matches one or more | |
399 | occurrences by appending \"+\". | |
400 | ||
401 | - (opt CLAUSE ...) | |
402 | Concatenates the given CLAUSEs and matches zero or one occurrence | |
403 | by appending \"?\". | |
404 | ||
405 | - (repeat MIN MAX CLAUSE ...) | |
406 | Concatenates the given CLAUSEs and constructs a regex matching at | |
407 | least MIN occurrences and at most MAX occurrences. MIN must be a | |
408 | non-negative integer. MAX must be a non-negative integer greater | |
409 | than or equal to MIN; or MAX can be nil to mean \"infinity.\" | |
410 | ||
411 | - (char CHAR-CLAUSE ...) | |
412 | Creates a \"character class\" matching one character from the given | |
413 | set. See below for how to construct a CHAR-CLAUSE. | |
414 | ||
415 | - (not-char CHAR-CLAUSE ...) | |
416 | Creates a \"character class\" matching any one character not in the | |
417 | given set. See below for how to construct a CHAR-CLAUSE. | |
418 | ||
419 | - the symbol `bot' | |
420 | Stands for \"\\\\`\", matching the empty string at the beginning of | |
421 | text (beginning of a string or of a buffer). | |
422 | ||
423 | - the symbol `eot' | |
424 | Stands for \"\\\\'\", matching the empty string at the end of text. | |
425 | ||
426 | - the symbol `point' | |
faf9e188 | 427 | Stands for \"\\\\=\\=\", matching the empty string at point. |
e65f17fe DN |
428 | |
429 | - the symbol `word-boundary' | |
430 | Stands for \"\\\\b\", matching the empty string at the beginning or | |
431 | end of a word. | |
432 | ||
433 | - the symbol `not-word-boundary' | |
434 | Stands for \"\\\\B\", matching the empty string not at the beginning | |
435 | or end of a word. | |
436 | ||
437 | - the symbol `bow' | |
67e2f487 | 438 | Stands for \"\\\\=\\<\", matching the empty string at the beginning of a |
e65f17fe DN |
439 | word. |
440 | ||
441 | - the symbol `eow' | |
67e2f487 | 442 | Stands for \"\\\\=\\>\", matching the empty string at the end of a word. |
e65f17fe DN |
443 | |
444 | - the symbol `wordchar' | |
445 | Stands for the regex \"\\\\w\", matching a word-constituent character | |
446 | (as determined by the current syntax table) | |
447 | ||
448 | - the symbol `not-wordchar' | |
449 | Stands for the regex \"\\\\W\", matching a non-word-constituent | |
450 | character. | |
451 | ||
452 | - (syntax CODE) | |
453 | Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code | |
454 | (a single character). Matches any character with the requested | |
455 | syntax. | |
456 | ||
457 | - (not-syntax CODE) | |
458 | Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code | |
459 | (a single character). Matches any character without the | |
460 | requested syntax. | |
461 | ||
462 | - (regex REGEX) | |
463 | This is a \"trapdoor\" for including ordinary regular expression | |
464 | strings in the result. Some regular expressions are clearer when | |
465 | written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for | |
40aeecad | 466 | instance. |
e65f17fe DN |
467 | |
468 | Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...) | |
469 | has one of the following forms: | |
470 | ||
471 | - a character | |
472 | Adds that character to the set. | |
473 | ||
474 | - a string | |
475 | Adds all the characters in the string to the set. | |
476 | ||
477 | - A pair (MIN . MAX) | |
478 | Where MIN and MAX are characters, adds the range of characters | |
479 | from MIN through MAX to the set." | |
480 | `(apply 'sregex ',exps)) | |
481 | ||
482 | (defun sregex--engine (exp combine) | |
40aeecad SM |
483 | (cond |
484 | ((stringp exp) | |
485 | (if (and combine | |
486 | (eq combine 'suffix) | |
487 | (/= (length exp) 1)) | |
488 | (concat "\\(?:" (regexp-quote exp) "\\)") | |
489 | (regexp-quote exp))) | |
490 | ((symbolp exp) | |
491 | (ecase exp | |
492 | (any ".") | |
493 | (bol "^") | |
494 | (eol "$") | |
495 | (wordchar "\\w") | |
496 | (not-wordchar "\\W") | |
497 | (bot "\\`") | |
498 | (eot "\\'") | |
499 | (point "\\=") | |
500 | (word-boundary "\\b") | |
501 | (not-word-boundary "\\B") | |
502 | (bow "\\<") | |
503 | (eow "\\>"))) | |
504 | ((consp exp) | |
505 | (funcall (intern (concat "sregex--" | |
506 | (symbol-name (car exp)))) | |
507 | (cdr exp) | |
508 | combine)) | |
509 | (t (error "Invalid expression: %s" exp)))) | |
e65f17fe DN |
510 | |
511 | (defun sregex--sequence (exps combine) | |
40aeecad SM |
512 | (if (= (length exps) 1) (sregex--engine (car exps) combine) |
513 | (let ((re (mapconcat | |
514 | (lambda (e) (sregex--engine e 'concat)) | |
515 | exps ""))) | |
e65f17fe | 516 | (if (eq combine 'suffix) |
40aeecad SM |
517 | (concat "\\(?:" re "\\)") |
518 | re)))) | |
e65f17fe DN |
519 | |
520 | (defun sregex--or (exps combine) | |
40aeecad SM |
521 | (if (= (length exps) 1) (sregex--engine (car exps) combine) |
522 | (let ((re (mapconcat | |
523 | (lambda (e) (sregex--engine e 'or)) | |
524 | exps "\\|"))) | |
525 | (if (not (eq combine 'or)) | |
526 | (concat "\\(?:" re "\\)") | |
527 | re)))) | |
528 | ||
529 | (defun sregex--group (exps combine) (concat "\\(" (sregex--sequence exps nil) "\\)")) | |
530 | ||
531 | (defun sregex--backref (exps combine) (concat "\\" (int-to-string (car exps)))) | |
532 | (defun sregex--opt (exps combine) (concat (sregex--sequence exps 'suffix) "?")) | |
533 | (defun sregex--0+ (exps combine) (concat (sregex--sequence exps 'suffix) "*")) | |
534 | (defun sregex--1+ (exps combine) (concat (sregex--sequence exps 'suffix) "+")) | |
535 | ||
536 | (defun sregex--char (exps combine) (sregex--char-aux nil exps)) | |
537 | (defun sregex--not-char (exps combine) (sregex--char-aux t exps)) | |
538 | ||
539 | (defun sregex--syntax (exps combine) (format "\\s%c" (car exps))) | |
540 | (defun sregex--not-syntax (exps combine) (format "\\S%c" (car exps))) | |
541 | ||
542 | (defun sregex--regex (exps combine) | |
543 | (if combine (concat "\\(?:" (car exps) "\\)") (car exps))) | |
544 | ||
545 | (defun sregex--repeat (exps combine) | |
546 | (let* ((min (or (pop exps) 0)) | |
547 | (minstr (number-to-string min)) | |
548 | (max (pop exps))) | |
549 | (concat (sregex--sequence exps 'suffix) | |
550 | (concat "\\{" minstr "," | |
551 | (when max (number-to-string max)) "\\}")))) | |
552 | ||
553 | (defun sregex--char-range (start end) | |
554 | (let ((startc (char-to-string start)) | |
555 | (endc (char-to-string end))) | |
556 | (cond | |
557 | ((> end (+ start 2)) (concat startc "-" endc)) | |
558 | ((> end (+ start 1)) (concat startc (char-to-string (1+ start)) endc)) | |
559 | ((> end start) (concat startc endc)) | |
560 | (t startc)))) | |
e65f17fe DN |
561 | |
562 | (defun sregex--char-aux (complement args) | |
40aeecad SM |
563 | ;; regex-opt does the same, we should join effort. |
564 | (let ((chars (make-bool-vector 256 nil))) ; Yeah, right! | |
565 | (dolist (arg args) | |
566 | (cond ((integerp arg) (aset chars arg t)) | |
415741a7 | 567 | ((stringp arg) (mapc (lambda (c) (aset chars c t)) arg)) |
40aeecad SM |
568 | ((consp arg) |
569 | (let ((start (car arg)) | |
570 | (end (cdr arg))) | |
571 | (when (> start end) | |
572 | (let ((tmp start)) (setq start end) (setq end tmp))) | |
573 | ;; now start <= end | |
574 | (let ((i start)) | |
575 | (while (<= i end) | |
576 | (aset chars i t) | |
577 | (setq i (1+ i)))))))) | |
e65f17fe | 578 | ;; now chars is a map of the characters in the class |
40aeecad SM |
579 | (let ((caret (aref chars ?^)) |
580 | (dash (aref chars ?-)) | |
581 | (class (if (aref chars ?\]) "]" ""))) | |
e65f17fe | 582 | (aset chars ?^ nil) |
40aeecad SM |
583 | (aset chars ?- nil) |
584 | (aset chars ?\] nil) | |
585 | ||
586 | (let (start end) | |
587 | (dotimes (i 256) | |
588 | (if (aref chars i) | |
589 | (progn | |
590 | (unless start (setq start i)) | |
591 | (setq end i) | |
592 | (aset chars i nil)) | |
593 | (when start | |
594 | (setq class (concat class (sregex--char-range start end))) | |
595 | (setq start nil)))) | |
596 | (if start | |
597 | (setq class (concat class (sregex--char-range start end))))) | |
598 | ||
599 | (if (> (length class) 0) | |
600 | (setq class (concat class (if caret "^") (if dash "-"))) | |
601 | (setq class (concat class (if dash "-") (if caret "^")))) | |
602 | (if (and (not complement) (= (length class) 1)) | |
603 | (regexp-quote class) | |
604 | (concat "[" (if complement "^") class "]"))))) | |
e65f17fe DN |
605 | |
606 | (provide 'sregex) | |
607 | ||
cbee283d | 608 | ;; arch-tag: 460c1f5a-eb6e-42ec-a451-ffac78bdf492 |
e65f17fe | 609 | ;;; sregex.el ends here |