lisp/emacs-lisp/rx.el

   1 ;;; rx.el --- sexp notation for regular expressions
   2
   3 ;; Copyright (C) 2001, 2003, 2004, 2005  Free Software Foundation, Inc.
   4
   5 ;; Author: Gerd Moellmann <gerd@gnu.org>
   6 ;; Maintainer: FSF
   7 ;; Keywords: strings, regexps, extensions
   8
   9 ;; This file is part of GNU Emacs.
  10
  11 ;; GNU Emacs is free software; you can redistribute it and/or modify
  12 ;; it under the terms of the GNU General Public License as published by
  13 ;; the Free Software Foundation; either version 2, or (at your option)
  14 ;; any later version.
  15
  16 ;; GNU Emacs is distributed in the hope that it will be useful,
  17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 ;; GNU General Public License for more details.
  20
  21 ;; You should have received a copy of the GNU General Public License
  22 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 ;; Boston, MA 02111-1307, USA.
  25
  26 ;;; Commentary:
  27
  28 ;; This is another implementation of sexp-form regular expressions.
  29 ;; It was unfortunately written without being aware of the Sregex
  30 ;; package coming with Emacs, but as things stand, Rx completely
  31 ;; covers all regexp features, which Sregex doesn't, doesn't suffer
  32 ;; from the bugs mentioned in the commentary section of Sregex, and
  33 ;; uses a nicer syntax (IMHO, of course :-).
  34
  35 ;; This significantly extended version of the original is almost
  36 ;; compatible with Sregex.  The only incompatibility I (fx) know of is
  37 ;; that the `repeat' form can't have multiple regexp args.
  38
  39 ;; Now alternative forms are provided for a degree of compatibility
  40 ;; with Shivers' attempted definitive SRE notation
  41 ;; <URL:http://www.ai.mit.edu/~shivers/sre.txt>.  SRE forms not
  42 ;; catered for include: dsm, uncase, w/case, w/nocase, ,@<exp>,
  43 ;; ,<exp>, (word ...), word+, posix-string, and character class forms.
  44 ;; Some forms are inconsistent with SRE, either for historical reasons
  45 ;; or because of the implementation -- simple translation into Emacs
  46 ;; regexp strings.  These include: any, word.  Also, case-sensitivity
  47 ;; and greediness are controlled by variables external to the regexp,
  48 ;; and you need to feed the forms to the `posix-' functions to get
  49 ;; SRE's POSIX semantics.  There are probably more difficulties.
  50
  51 ;; Rx translates a sexp notation for regular expressions into the
  52 ;; usual string notation.  The translation can be done at compile-time
  53 ;; by using the `rx' macro.  It can be done at run-time by calling
  54 ;; function `rx-to-string'.  See the documentation of `rx' for a
  55 ;; complete description of the sexp notation.
  56 ;;
  57 ;; Some examples of string regexps and their sexp counterparts:
  58 ;;
  59 ;; "^[a-z]*"
  60 ;; (rx (and line-start (0+ (in "a-z"))))
  61 ;;
  62 ;; "\n[^ \t]"
  63 ;; (rx (and "\n" (not blank))), or
  64 ;; (rx (and "\n" (not (any " \t"))))
  65 ;;
  66 ;; "\\*\\*\\* EOOH \\*\\*\\*\n"
  67 ;; (rx "*** EOOH ***\n")
  68 ;;
  69 ;; "\\<\\(catch\\|finally\\)\\>[^_]"
  70 ;; (rx (and word-start (submatch (or "catch" "finally")) word-end
  71 ;;          (not (any ?_))))
  72 ;;
  73 ;; "[ \t\n]*:\\([^:]+\\|$\\)"
  74 ;; (rx (and (zero-or-more (in " \t\n")) ":"
  75 ;;          (submatch (or (one-or-more (not (any ?:))) line-end))))
  76 ;;
  77 ;; "^content-transfer-encoding:\\(\n?[\t ]\\)*quoted-printable\\(\n?[\t ]\\)*"
  78 ;; (rx (and line-start
  79 ;;          "content-transfer-encoding:"
  80 ;;          (* (? ?\n) blank)
  81 ;;          "quoted-printable"
  82 ;;          (* (? ?\n) blank)))
  83 ;;
  84 ;; (concat "^\\(?:" something-else "\\)")
  85 ;; (rx (and line-start (eval something-else))), statically or
  86 ;; (rx-to-string `(and line-start ,something-else)), dynamically.
  87 ;;
  88 ;; (regexp-opt '(STRING1 STRING2 ...))
  89 ;; (rx (or STRING1 STRING2 ...)), or in other words, `or' automatically
  90 ;; calls `regexp-opt' as needed.
  91 ;;
  92 ;; "^;;\\s-*\n\\|^\n"
  93 ;; (rx (or (and line-start ";;" (0+ space) ?\n)
  94 ;;         (and line-start ?\n)))
  95 ;;
  96 ;; "\\$[I]d: [^ ]+ \\([^ ]+\\) "
  97 ;; (rx (and "$Id: "
  98 ;;          (1+ (not (in " ")))
  99 ;;          " "
 100 ;;          (submatch (1+ (not (in " "))))
 101 ;;          " "))
 102 ;;
 103 ;; "\\\\\\\\\\[\\w+"
 104 ;; (rx (and ?\\ ?\\ ?\[ (1+ word)))
 105 ;;
 106 ;; etc.
 107
 108 ;;; History:
 109 ;;
 110
 111 ;;; Code:
 112
 (defconst rx-constituents
   '(;; Sequencing and alternation.
     (and . (rx-and 1 nil))
     (seq . and)                         ; SRE
     (: . and)                           ; SRE
     (sequence . and)                    ; sregex
     (or . (rx-or 1 nil))
     (| . or)                            ; SRE

     ;; Single characters and character sets.
     (not-newline . ".")
     (nonl . not-newline)                ; SRE
     (anything . ".\\|\n")
     (any . (rx-any 1 nil rx-check-any)) ; inconsistent with SRE
     (in . any)
     (char . any)                        ; sregex
     (not-char . (rx-not-char 1 nil rx-check-any)) ; sregex
     (not . (rx-not 1 1 rx-check-not))

     ;; Counted repetition.  `repeat' is only partially consistent with
     ;; sregex, whose `repeat' is like our `**'.  (`repeat' with an
     ;; optional max arg and multiple sexp forms is ambiguous.)
     (repeat . (rx-repeat 2 3))
     (= . (rx-= 2 nil))                  ; SRE
     (>= . (rx->= 2 nil))                ; SRE
     (** . (rx-** 2 nil))                ; SRE

     ;; Groups.
     (submatch . (rx-submatch 1 nil))    ; SRE
     (group . submatch)

     ;; Kleene-style operators and their many spellings.
     (zero-or-more . (rx-kleene 1 nil))
     (one-or-more . (rx-kleene 1 nil))
     (zero-or-one . (rx-kleene 1 nil))
     (\? . zero-or-one)                  ; SRE
     (\?? . zero-or-one)
     (* . zero-or-more)                  ; SRE
     (*? . zero-or-more)
     (0+ . zero-or-more)
     (+ . one-or-more)                   ; SRE
     (+? . one-or-more)
     (1+ . one-or-more)
     (optional . zero-or-one)
     (opt . zero-or-one)                 ; sregex
     (minimal-match . (rx-greedy 1 1))
     (maximal-match . (rx-greedy 1 1))
     (backref . (rx-backref 1 1 rx-check-backref))

     ;; Zero-width assertions.
     (line-start . "^")
     (bol . line-start)                  ; SRE
     (line-end . "$")
     (eol . line-end)                    ; SRE
     (string-start . "\\`")
     (bos . string-start)                ; SRE
     (bot . string-start)                ; sregex
     (string-end . "\\'")
     (eos . string-end)                  ; SRE
     (eot . string-end)                  ; sregex
     (buffer-start . "\\`")
     (buffer-end . "\\'")
     (point . "\\=")
     (word-start . "\\<")
     (bow . word-start)                  ; SRE
     (word-end . "\\>")
     (eow . word-end)                    ; SRE
     (word-boundary . "\\b")
     (not-word-boundary . "\\B")         ; sregex
     (symbol-start . "\\_<")
     (symbol-end . "\\_>")

     ;; Syntax, category, and escape hatches.
     (syntax . (rx-syntax 1 1))
     (not-syntax . (rx-not-syntax 1 1))  ; sregex
     (category . (rx-category 1 1 rx-check-category))
     (eval . (rx-eval 1 1))
     (regexp . (rx-regexp 1 1 stringp))

     ;; Named character classes.
     (digit . "[[:digit:]]")
     (numeric . digit)                   ; SRE
     (num . digit)                       ; SRE
     (control . "[[:cntrl:]]")           ; SRE
     (cntrl . control)                   ; SRE
     (hex-digit . "[[:xdigit:]]")        ; SRE
     (hex . hex-digit)                   ; SRE
     (xdigit . hex-digit)                ; SRE
     (blank . "[[:blank:]]")             ; SRE
     (graphic . "[[:graph:]]")           ; SRE
     (graph . graphic)                   ; SRE
     (printing . "[[:print:]]")          ; SRE
     (print . printing)                  ; SRE
     (alphanumeric . "[[:alnum:]]")      ; SRE
     (alnum . alphanumeric)              ; SRE
     (letter . "[[:alpha:]]")
     (alphabetic . letter)               ; SRE
     (alpha . letter)                    ; SRE
     (ascii . "[[:ascii:]]")             ; SRE
     (nonascii . "[[:nonascii:]]")
     (lower . "[[:lower:]]")             ; SRE
     (lower-case . lower)                ; SRE
     (punctuation . "[[:punct:]]")       ; SRE
     (punct . punctuation)               ; SRE
     (space . "[[:space:]]")             ; SRE
     (whitespace . space)                ; SRE
     (white . space)                     ; SRE
     (upper . "[[:upper:]]")             ; SRE
     (upper-case . upper)                ; SRE
     (word . "[[:word:]]")               ; inconsistent with SRE
     (wordchar . word)                   ; sregex
     (not-wordchar . "[^[:word:]]")      ; sregex (use \\W?)
     )
   "Alist describing every constituent of sexp-form regexps.
Elements look like (SYMBOL . DEFN), where SYMBOL is a name that may
appear in a sexp regular expression and DEFN says how to translate it:

  - a string DEFN is the regexp text SYMBOL stands for;
  - a symbol DEFN makes SYMBOL an alias, resolved recursively;
  - otherwise DEFN is a list (FUNCTION MIN-ARGS MAX-ARGS PREDICATE).

In the list case FUNCTION generates the regexp for a form headed by
SYMBOL.  MIN-ARGS and MAX-ARGS bound the number of arguments such a
form may carry; a nil MAX-ARGS means there is no upper bound.
PREDICATE is optional; when given, each argument must satisfy it.")
 223
 224
 (defconst rx-syntax
   '((whitespace . ?-)
     (punctuation . ?.)
     (word . ?w)
     (symbol . ?_)
     (open-parenthesis . ?\()
     (close-parenthesis . ?\))
     (expression-prefix . ?\')
     (string-quote . ?\")
     (paired-delimiter . ?$)
     (escape . ?\\)
     (character-quote . ?/)
     (comment-start . ?<)
     (comment-end . ?>)
     (string-delimiter . ?|)
     (comment-delimiter . ?!))
   "Alist from Rx syntax-class names to syntax designator characters.
Elements are (SYMBOL . CHAR).  SYMBOL is a name accepted in
`(syntax SYMBOL)'; CHAR is the character that follows \\s or \\S
in the string form of the regular expression.")
 246
 247
 (defconst rx-categories
   '((consonant                  . ?0)
     (base-vowel                 . ?1)
     (upper-diacritical-mark     . ?2)
     (lower-diacritical-mark     . ?3)
     (tone-mark                  . ?4)
     (symbol                     . ?5)
     (digit                      . ?6)
     (vowel-modifying-diacritical-mark . ?7)
     (vowel-sign                 . ?8)
     (semivowel-lower            . ?9)
     (not-at-end-of-line         . ?<)
     (not-at-beginning-of-line   . ?>)
     (alpha-numeric-two-byte     . ?A)
     (chinese-two-byte           . ?C)
     ;; Historical misspelling of `chinese-two-byte'; kept so that
     ;; existing `(category chinse-two-byte)' forms keep working.
     (chinse-two-byte            . ?C)
     (greek-two-byte             . ?G)
     (japanese-hiragana-two-byte . ?H)
     (indian-two-byte            . ?I)
     (japanese-katakana-two-byte . ?K)
     (korean-hangul-two-byte     . ?N)
     (cyrillic-two-byte          . ?Y)
     (combining-diacritic        . ?^)
     (ascii                      . ?a)
     (arabic                     . ?b)
     (chinese                    . ?c)
     (ethiopic                   . ?e)
     (greek                      . ?g)
     (korean                     . ?h)
     (indian                     . ?i)
     (japanese                   . ?j)
     (japanese-katakana          . ?k)
     (latin                      . ?l)
     (lao                        . ?o)
     (tibetan                    . ?q)
     (japanese-roman             . ?r)
     (thai                       . ?t)
     (vietnamese                 . ?v)
     (hebrew                     . ?w)
     (cyrillic                   . ?y)
     (can-break                  . ?|))
   "Alist mapping symbols to category characters.
Each entry has the form (SYMBOL . CHAR), where SYMBOL is a valid
symbol in `(category SYMBOL)', and CHAR is the category character
corresponding to SYMBOL, as it would be used with `\\c' or `\\C' in
regular expression strings.

`chinse-two-byte' is a misspelled alias of `chinese-two-byte' that is
retained for backward compatibility.")
 293
 294
 (defvar rx-greedy-flag t
   "Non-nil means the long-named repetition forms are greedy.
This affects `zero-or-one', `zero-or-more', and `one-or-more':
when the value is nil they produce non-greedy regular expressions.
Dynamically bound.")
 298
 299
 (defun rx-info (op)
   "Look up the parsing/code generation info for OP.
The space character (ASCII 32) is treated as the symbol `?', and the
character `?' as the symbol `??'; this is what the Lisp reader yields
when those operators are written unescaped at the head of a form.
Aliases in `rx-constituents' are followed until a non-symbol
definition is reached.  Return that definition, or nil if OP (or
something it aliases) has no entry."
   (let ((defn (cond ((eq op ? ) '\?)
                     ((eq op ??) '\??)
                     (t op))))
     ;; A symbol is either the original operator or an alias target;
     ;; keep resolving until we hit a string, a list, or nil.
     (while (and defn (symbolp defn))
       (setq defn (cdr (assq defn rx-constituents))))
     defn))
 310
 311
 (defun rx-check (form)
   "Validate FORM against the parsing info of its head.
Signal an error if FORM is not a list, if it has fewer or more
arguments than the info returned by `rx-info' permits, or if that
info names a predicate which some argument fails to satisfy."
   (if (not (listp form))
       (error "rx `%s' needs argument(s)" form))
   (let* ((head (car form))
          (info (rx-info head))
          (arg-count (1- (length form)))
          (lower (nth 1 info))
          (upper (nth 2 info))
          (pred (nth 3 info)))
     ;; A nil bound means "unconstrained" on that side.
     (if (and lower (< arg-count lower))
         (error "rx form `%s' requires at least %d args"
                head lower))
     (if (and upper (> arg-count upper))
         (error "rx form `%s' accepts at most %d args"
                head upper))
     (if pred
         (dolist (arg (cdr form))
           (or (funcall pred arg)
               (error "rx form `%s' requires args satisfying `%s'"
                      head pred))))))
 334
 335
 (defun rx-and (form)
   "Translate FORM, which is `(and FORM1 ...)', into a regexp string.
The sub-forms are translated without individual grouping, joined
end to end, and the whole sequence is wrapped in one shy group."
   (rx-check form)
   (let ((pieces (mapconcat (lambda (sub) (rx-to-string sub 'no-group))
                            (cdr form)
                            "")))
     (concat "\\(?:" pieces "\\)")))
 345
 346
 (defun rx-or (form)
   "Translate FORM, which is `(or FORM1 ...)', into a regexp string.
When every alternative is a literal string the alternation is built
with `regexp-opt'; otherwise each alternative is translated with
`rx-to-string' and the results are joined with `\\|'.  Either way
the result is enclosed in a shy group."
   (rx-check form)
   (let* ((alternatives (cdr form))
          (only-strings (not (memq nil (mapcar #'stringp alternatives)))))
     (concat "\\(?:"
             (if only-strings
                 (regexp-opt alternatives)
               (mapconcat #'rx-to-string alternatives "\\|"))
             "\\)")))
 359
 360
 (defvar rx-bracket)                    ; dynamically bound in `rx-any'

 (defun rx-check-any (arg)
   "Validate ARG for Rx `any' and return it as bracket-expression text.
ARG may be a character, a non-empty string, a character class symbol,
or a cons (FROM . TO) of two characters (sregex style).  Anything
else signals an error.

If a string ARG contains `]', those characters are removed and
`rx-bracket' is set to \"]\" so the caller can emit it first."
   (let ((spec (if (integerp arg) (string arg) arg)))
     (cond
      ;; Characters (converted above) and strings.
      ((stringp spec)
       (if (zerop (length spec))
           (error "String arg for Rx `any' must not be empty"))
       ;; Quote ^ at start; we do not know whether this is the first arg.
       ;; NOTE(review): a backslash is literal inside a character
       ;; alternative, so this also makes the set match `\\' -- confirm
       ;; whether that is acceptable.
       (if (eq ?^ (aref spec 0))
           (setq spec (concat "\\" spec)))
       ;; Strip ] and tell `rx-any' to put one at the very start.
       (when (string-match "]" spec)
         (setq spec (replace-regexp-in-string "]" "" spec)
               rx-bracket "]"))
       spec)
      ;; Character class names such as `digit'.
      ((symbolp spec)
       (let ((translation (condition-case nil
                              (rx-to-string spec 'no-group)
                            (error nil))))
         (or translation
             (error "Invalid char class `%s' in Rx `any'" spec))
         ;; Drop the outer brackets: "[[:digit:]]" -> "[:digit:]".
         (substring translation 1 -1)))
      ;; sregex compatibility: (FROM . TO) character range.
      ((and (integerp (car-safe spec))
            (integerp (cdr-safe spec)))
       (string (car spec) ?- (cdr spec)))
      (t
       (error "rx `any' requires string, character, char pair or char class args")))))
 390
 (defun rx-any (form)
   "Translate FORM, which is `(any ARG ...)', into a bracket expression.
Each ARG is converted by `rx-check-any'.  A `]' requested through
`rx-bracket' is emitted immediately after the opening bracket, and a
lone \"-\" argument is moved ahead of the other arguments."
   (rx-check form)
   (let* ((rx-bracket nil)
          ;; `rx-check-any' may set `rx-bracket' as a side effect, so
          ;; this must be evaluated after the binding above.
          (pieces (mapcar #'rx-check-any (cdr form))))
     ;; A literal ?- goes first so it cannot accidentally form a range.
     (when (member "-" pieces)
       (setq pieces (cons "-" (delete "-" pieces))))
     (concat "[" rx-bracket (apply #'concat pieces) "]")))
 402
 403
 (defun rx-check-not (arg)
   "Check arg ARG for Rx `not'.
ARG is acceptable if it is a symbol that translates to a character
class such as \"[[:digit:]]\", the symbol `word-boundary', or a list
whose head is one of `not', `any', `in', `syntax' or `category'.
Return t if ARG is acceptable; otherwise signal an error."
   (unless (or (and (symbolp arg)
                    ;; The class name is one or more characters long; a
                    ;; bare `[-a-z]' would only ever match a single one
                    ;; and so reject every real class name.
                    (string-match "\\`\\[\\[:[-a-z]+:]]\\'"
                                  (condition-case nil
                                      (rx-to-string arg 'no-group)
                                    ;; Unknown symbol: yield a string
                                    ;; that cannot match the pattern.
                                    (error ""))))
               (eq arg 'word-boundary)
               (and (consp arg)
                    (memq (car arg) '(not any in syntax category))))
     (error "rx `not' syntax error: %s" arg))
   t)
 416
 417
 (defun rx-not (form)
   "Parse and produce code from FORM.  FORM is `(not ...)'.
The argument is translated first and the resulting regexp string is
then negated:

  - \"[^...]\" loses its negation; a one-character set collapses to
    that character, quoted so that it stays literal;
  - \"[...]\" becomes \"[^...]\";
  - \\s, \\c and \\b become \\S, \\C and \\B, and vice versa;
  - anything else is wrapped as \"[^...]\"."
   (rx-check form)
   (let ((result (rx-to-string (cadr form) 'no-group))
         ;; The \\s/\\S tests below must distinguish case.
         case-fold-search)
     (cond ((string-match "\\`\\[^" result)
            (if (= (length result) 4)
                ;; \"[^X]\" negated is just X, but X may be special
                ;; outside brackets (e.g. `.' or `^'), so quote it.
                (regexp-quote (substring result 2 3))
              (concat "[" (substring result 2))))
           ((eq ?\[ (aref result 0))
            (concat "[^" (substring result 1)))
           ((string-match "\\`\\\\[scb]" result)
            (concat (capitalize (substring result 0 2)) (substring result 2)))
           ;; Double negation of a syntax, category or word-boundary
           ;; test: undo the upper-casing done by the previous clause.
           ((string-match "\\`\\\\[SCB]" result)
            (concat (downcase (substring result 0 2)) (substring result 2)))
           (t
            (concat "[^" result "]")))))
 433
 434
 (defun rx-not-char (form)
   "Translate FORM, which is `(not-char ...)', into a regexp string.
The arguments are repackaged as `(not (in ...))' and passed to
`rx-not'."
   (rx-check form)
   (let ((char-set (cons 'in (cdr form))))
     (rx-not (list 'not char-set))))
 439
 440
 (defun rx-not-syntax (form)
   "Translate FORM, which is `(not-syntax SYNTAX)', into a regexp string.
The argument is repackaged as `(not (syntax SYNTAX))' and passed to
`rx-not'."
   (rx-check form)
   (let ((syntax-form (cons 'syntax (cdr form))))
     (rx-not (list 'not syntax-form))))
 445
 446
 (defun rx-trans-forms (form &optional skip)
   "Collapse the trailing sub-forms of FORM into a single `and' form.
SKIP, which defaults to 0, is the number of leading arguments after
the head that are left alone.  So (HEAD REST ...) becomes
\(HEAD (and REST ...)), and with SKIP equal to 1
`(= N REST ...)' becomes `(= N (and REST ...))'.

If exactly one form follows the skipped arguments, FORM itself is
returned.  Otherwise a new list is returned and FORM is not modified."
   (let* ((offset (or skip 0))
          (rest (nthcdr (1+ offset) form)))
     (if (= 1 (length rest))
         form
       ;; Work on a copy so the caller's list keeps its shape.
       (let ((copy (copy-sequence form)))
         (setcdr (nthcdr offset copy) (list (cons 'and rest)))
         copy))))
 459
 460
 (defun rx-= (form)
   "Translate FORM, which is `(= N ...)', into a regexp string.
N must be a positive integer.  The forms after N are treated as a
sequence and the result matches exactly N repetitions of it."
   (rx-check form)
   (let* ((normalized (rx-trans-forms form 1))
          (count (nth 1 normalized)))
     (if (not (and (integerp count) (> count 0)))
         (error "rx `=' requires positive integer first arg"))
     (format "%s\\{%d\\}" (rx-to-string (nth 2 normalized)) count)))
 469
 470
 (defun rx->= (form)
   "Translate FORM, which is `(>= N ...)', into a regexp string.
N must be a positive integer.  The forms after N are treated as a
sequence and the result matches N or more repetitions of it."
   (rx-check form)
   (let* ((normalized (rx-trans-forms form 1))
          (count (nth 1 normalized)))
     (if (not (and (integerp count) (> count 0)))
         (error "rx `>=' requires positive integer first arg"))
     (format "%s\\{%d,\\}" (rx-to-string (nth 2 normalized)) count)))
 479
 480
 (defun rx-** (form)
   "Translate FORM, which is `(** N M ...)', into a regexp string.
The forms after N and M are collapsed into one sequence and the
result is translated as the equivalent `repeat' form."
   (rx-check form)
   (let ((args (cdr (rx-trans-forms form 2))))
     (rx-to-string (cons 'repeat args))))
 486
 487
 (defun rx-repeat (form)
   "Translate a `repeat' FORM into a regexp string.
FORM is `(repeat N FORM1)', meaning exactly N repetitions with N a
positive integer, or `(repeat N M FORM1)', meaning between N and M
repetitions with N and M non-negative integers and N not above M."
   (rx-check form)
   (let ((n (nth 1 form)))
     (if (= (length form) 3)
         ;; Exact count.
         (progn
           (if (not (and (integerp n) (> n 0)))
               (error "rx `repeat' requires positive integer first arg"))
           (format "%s\\{%d\\}" (rx-to-string (nth 2 form)) n))
       ;; Range.
       (let ((m (nth 2 form)))
         (if (not (and (integerp m)
                       (>= m 0)
                       (integerp n)
                       (>= n 0)
                       (>= m n)))
             (error "rx `repeat' range error"))
         (format "%s\\{%d,%d\\}" (rx-to-string (nth 3 form)) n m)))))
 506
 507
 (defun rx-submatch (form)
   "Translate FORM, which is `(submatch ...)', into a regexp string.
The sub-forms are translated without individual grouping, joined
end to end, and enclosed in a capturing group."
   (let ((inner (mapconcat (lambda (sub) (rx-to-string sub 'no-group))
                           (cdr form)
                           "")))
     (concat "\\(" inner "\\)")))
 514
 (defun rx-backref (form)
   "Translate FORM, which is `(backref N)', into a regexp string.
The result is a backslash followed by N."
   (rx-check form)
   (let ((group-number (cadr form)))
     (format "\\%d" group-number)))
 519
 (defun rx-check-backref (arg)
   "Validate ARG as the argument of Rx `backref'.
Return t if ARG is an integer from 1 to 9 inclusive; otherwise
signal an error."
   (if (and (integerp arg)
            (not (< arg 1))
            (not (> arg 9)))
       t
     (error "rx `backref' requires numeric 1<=arg<=9: %s" arg)))
 524
 (defun rx-kleene (form)
   "Translate a repetition FORM into a regexp string.
FORM is `(OP FORM1 ...)' where OP is `zero-or-one', `zero-or-more',
`one-or-more' or one of their aliases.
The operators `*', `+' and `?' always give a greedy regexp.
The operators `*?', `+?' and `??' always give a non-greedy regexp.
Every other spelling is greedy exactly when `rx-greedy-flag' is
non-nil."
   (rx-check form)
   (let* ((normalized (rx-trans-forms form))
          (operator (car normalized))
          ;; In the lists below `? ' reads as the space character and
          ;; `??' as the character `?'; that is how the reader sees
          ;; an unescaped `?' or `??' at the head of a form.
          (laziness (cond ((memq operator '(* + ? )) "")
                          ((memq operator '(*? +? ??)) "?")
                          (rx-greedy-flag "")
                          (t "?")))
          (quantifier (cond ((memq operator '(* *? 0+ zero-or-more)) "*")
                            ((memq operator '(+ +? 1+ one-or-more)) "+")
                            (t "?")))
          (body (rx-to-string (cadr normalized) 'no-group)))
     ;; The quantifier must apply to the whole body, so group it
     ;; unless it is already a single unit.
     (concat (if (rx-atomic-p body)
                 body
               (concat "\\(?:" body "\\)"))
             quantifier
             laziness)))
 546
 (defun rx-atomic-p-bracket-end (r start)
   "Return the index just past the bracket expression at START in R.
R must have `[' at index START.  Return nil if the bracket
expression is not terminated.  May change the match data."
   (let ((len (length r))
         (i (1+ start))
         (end nil))
     ;; A leading `^' negates the set, and a `]' directly after the
     ;; optional `^' is a literal member rather than the terminator.
     (if (and (< i len) (eq (aref r i) ?^))
         (setq i (1+ i)))
     (if (and (< i len) (eq (aref r i) ?\]))
         (setq i (1+ i)))
     (while (and (not end) (< i len))
       (cond ((eq (aref r i) ?\])
              (setq end (1+ i)))
             ;; Step over a class such as `[:digit:]' as one unit, so
             ;; its `]' is not mistaken for the terminator.
             ((and (eq (aref r i) ?\[)
                   (< (1+ i) len)
                   (eq (aref r (1+ i)) ?:)
                   (string-match ":]" r (+ i 2)))
              (setq i (match-end 0)))
             (t
              (setq i (1+ i)))))
     end))

 (defun rx-atomic-p (r)
   "Return non-nil if regexp string R is atomic.
An atomic regexp R is one such that a suffix operator
appended to R will apply to all of R.  For example, \"a\"
\"[abc]\" and \"\\(ab\\|ab*c\\)\" are atomic and \"ab\",
\"[ab]c\", and \"ab\\|ab*c\" are not atomic.

This function may return false negatives, but it is meant not to
return false positives: a group or bracket expression counts only
if the construct opened at the start of R is closed by the very
last characters of R.  So \"\\(a\\)\\(b\\)\" and \"[ab]c[de]\" are
not reported as atomic.  The following categories are detected:

  a group or shy group:  \\(...\\)   (six or more characters)
  a character class:     [...]
  a single character:    a

False negatives are returned for other atomic regexps, such as
a single escaped character like \"\\.\".

The match data is preserved."
   (save-match-data
     (let ((len (length r)))
       (cond
        ((= len 1) t)
        ;; Group: track nesting depth of \\( and \\) and see where the
        ;; group opened at index 0 closes.
        ((and (>= len 6)
              (eq (aref r 0) ?\\)
              (eq (aref r 1) ?\())
         (let ((i 0)
               (depth 0)
               (end nil))
           (while (and (not end) (< i len))
             (let ((c (aref r i)))
               (cond ((and (eq c ?\\) (< (1+ i) len))
                      (let ((next (aref r (1+ i))))
                        (cond ((eq next ?\()
                               (setq depth (1+ depth)))
                              ((eq next ?\))
                               (setq depth (1- depth))
                               (if (= depth 0)
                                   (setq end (+ i 2))))))
                      ;; Consume the backslash and the escaped char.
                      (setq i (+ i 2)))
                     ;; Backslashes and parens inside brackets are
                     ;; literal; skip the whole bracket expression.
                     ((eq c ?\[)
                      (setq i (or (rx-atomic-p-bracket-end r i) len)))
                     (t
                      (setq i (1+ i))))))
           (and end (= end len))))
        ;; Bracket expression: it must end exactly at the end of R.
        ((and (>= len 2)
              (eq (aref r 0) ?\[))
         (let ((end (rx-atomic-p-bracket-end r 0)))
           (and end (= end len))))
        (t nil)))))
 579
 580
 (defun rx-syntax (form)
   "Parse and produce code from FORM, which is `(syntax SYMBOL)'.
SYMBOL is normally one of the names in the variable `rx-syntax'.
For sregex compatibility it may also be a syntax character itself,
or a symbol whose one-character name is such a character.  Signal
an error if SYMBOL does not designate a known syntax class."
   (rx-check form)
   (let* ((sym (cadr form))
          (syntax (assq sym rx-syntax)))
     (unless syntax
       ;; Try sregex compatibility.
       (cond ((integerp sym)
              ;; A character such as ?w; look it up by value.
              (setq syntax (rassq sym rx-syntax)))
             ((symbolp sym)
              ;; A symbol such as `w' or `\\.'.
              (let ((name (symbol-name sym)))
                (if (= 1 (length name))
                    (setq syntax (rassq (aref name 0) rx-syntax))))))
       (unless syntax
         (error "Unknown rx syntax `%s'" sym)))
     (format "\\s%c" (cdr syntax))))
 594
 595
 (defun rx-check-category (form)
   "Validate FORM as the argument of `(category FORM)'.
Return t if FORM is an integer or a symbol listed in
`rx-categories'; otherwise signal an error."
   (if (or (integerp form)
           (cdr (assq form rx-categories)))
       t
     (error "Unknown category `%s'" form)))
 602
 603
 (defun rx-category (form)
   "Translate FORM, which is `(category SYMBOL)', into a regexp string.
An integer argument is used directly as the category character;
a symbol is looked up in `rx-categories'."
   (rx-check form)
   (let* ((designator (cadr form))
          (category-char (if (integerp designator)
                             designator
                           (cdr (assq designator rx-categories)))))
     (format "\\c%c" category-char)))
 611
 612
 (defun rx-eval (form)
   "Translate FORM, which is `(eval EXPR)', into a regexp string.
EXPR is evaluated with `eval' and its value is then translated as
a sexp-form regexp."
   (rx-check form)
   ;; EXPR is evaluated as given; no checks are made on it here.
   (let ((value (eval (cadr form))))
     (rx-to-string value)))
 617
 618
 (defun rx-greedy (form)
   "Translate a `minimal-match' or `maximal-match' FORM.
FORM is `(minimal-match FORM1)' or `(maximal-match FORM1)'.  FORM1
is translated with `rx-greedy-flag' bound to non-nil for
`maximal-match' and to nil otherwise, which selects between the
greedy and non-greedy versions of the `*', `+' and `?' operators."
   (rx-check form)
   (let* ((mode (car form))
          (rx-greedy-flag (eq mode 'maximal-match)))
     (rx-to-string (nth 1 form))))
 627
 628
 (defun rx-regexp (form)
   "Translate FORM, which is `(regexp STRING)', into a regexp string.
STRING is inserted unchanged, enclosed in a shy group."
   (rx-check form)
   (let ((raw (cadr form)))
     (concat "\\(?:" raw "\\)")))
 633
 634
 635 ;;;###autoload
 (defun rx-to-string (form &optional no-group)
   "Translate the sexp-form regular expression FORM into a string.
Strings and characters are quoted so that they match literally.
Symbols and lists are translated as described by `rx-info'.
For a list FORM the result is wrapped in a shy group unless
NO-GROUP is non-nil or the result already starts with a group."
   (cond
    ((stringp form)
     (regexp-quote form))
    ((integerp form)
     (regexp-quote (char-to-string form)))
    ((symbolp form)
     (let ((info (rx-info form)))
       (cond ((null info)
              (error "Unknown rx form `%s'" form))
             ((stringp info)
              info)
             (t
              (funcall (car info) form)))))
    ((consp form)
     (let* ((head (car form))
            (info (rx-info head)))
       (if (not (consp info))
           (error "Unknown rx form `%s'" head))
       (let ((regexp (funcall (car info) form)))
         (if (and (not no-group)
                  (not (string-match "\\`\\\\[(]" regexp)))
             (concat "\\(?:" regexp "\\)")
           regexp))))
    (t
     (error "rx syntax error at `%s'" form))))
 662
 663
 664 ;;;###autoload
 665 (defmacro rx (&rest regexps)
 666   "Translate regular expressions REGEXPS in sexp form to a regexp string.
 667 REGEXPS is a non-empty sequence of forms of the sort listed below.
 668 See also `rx-to-string' for how to do such a translation at run-time.
 669
 670 The following are valid subforms of regular expressions in sexp
 671 notation.
 672
 673 STRING
 674      matches string STRING literally.
 675
 676 CHAR
 677      matches character CHAR literally.
 678
 679 `not-newline', `nonl'
 680      matches any character except a newline.
 681                         .
 682 `anything'
 683      matches any character
 684
 685 `(any SET ...)'
 686 `(in SET ...)'
 687 `(char SET ...)'
 688      matches any character in SET ....  SET may be a character or string.
 689      Ranges of characters can be specified as `A-Z' in strings.
 690      Ranges may also be specified as conses like `(?A . ?Z)'.
 691
 692      SET may also be the name of a character class: `digit',
 693      `control', `hex-digit', `blank', `graph', `print', `alnum',
 694      `alpha', `ascii', `nonascii', `lower', `punct', `space', `upper',
 695      `word', or one of their synonyms.
 696
 697 `(not (any SET ...))'
 698      matches any character not in SET ...
 699
 700 `line-start', `bol'
 701      matches the empty string, but only at the beginning of a line
 702      in the text being matched
 703
 704 `line-end', `eol'
 705      is similar to `line-start' but matches only at the end of a line
 706
 707 `string-start', `bos', `bot'
 708      matches the empty string, but only at the beginning of the
 709      string being matched against.
 710
 711 `string-end', `eos', `eot'
 712      matches the empty string, but only at the end of the
 713      string being matched against.
 714
 715 `buffer-start'
 716      matches the empty string, but only at the beginning of the
 717      buffer being matched against.  Actually equivalent to `string-start'.
 718
 719 `buffer-end'
 720      matches the empty string, but only at the end of the
 721      buffer being matched against.  Actually equivalent to `string-end'.
 722
 723 `point'
 724      matches the empty string, but only at point.
 725
 726 `word-start', `bow'
 727      matches the empty string, but only at the beginning or end of a
 728      word.
 729
 730 `word-end', `eow'
 731      matches the empty string, but only at the end of a word.
 732
 733 `word-boundary'
 734      matches the empty string, but only at the beginning or end of a
 735      word.
 736
 737 `(not word-boundary)'
 738 `not-word-boundary'
 739      matches the empty string, but not at the beginning or end of a
 740      word.
 741
 742 `digit', `numeric', `num'
 743      matches 0 through 9.
 744
 745 `control', `cntrl'
 746      matches ASCII control characters.
 747
 748 `hex-digit', `hex', `xdigit'
 749      matches 0 through 9, a through f and A through F.
 750
 751 `blank'
 752      matches space and tab only.
 753
 754 `graphic', `graph'
 755      matches graphic characters--everything except ASCII control chars,
 756      space, and DEL.
 757
 758 `printing', `print'
 759      matches printing characters--everything except ASCII control chars
 760      and DEL.
 761
 762 `alphanumeric', `alnum'
 763      matches letters and digits.  (But at present, for multibyte characters,
 764      it matches anything that has word syntax.)
 765
 766 `letter', `alphabetic', `alpha'
 767      matches letters.  (But at present, for multibyte characters,
 768      it matches anything that has word syntax.)
 769
 770 `ascii'
 771      matches ASCII (unibyte) characters.
 772
 773 `nonascii'
 774      matches non-ASCII (multibyte) characters.
 775
 776 `lower', `lower-case'
 777      matches anything lower-case.
 778
 779 `upper', `upper-case'
 780      matches anything upper-case.
 781
 782 `punctuation', `punct'
 783      matches punctuation.  (But at present, for multibyte characters,
 784      it matches anything that has non-word syntax.)
 785
 786 `space', `whitespace', `white'
 787      matches anything that has whitespace syntax.
 788
 789 `word', `wordchar'
 790      matches anything that has word syntax.
 791
 792 `not-wordchar'
 793      matches anything that has non-word syntax.
 794
 795 `(syntax SYNTAX)'
 796      matches a character with syntax SYNTAX.  SYNTAX must be one
 797      of the following symbols, or a symbol corresponding to the syntax
 798      character, e.g. `\\.' for `\\s.'.
 799
 800      `whitespace'               (\\s- in string notation)
 801      `punctuation'              (\\s.)
 802      `word'                     (\\sw)
 803      `symbol'                   (\\s_)
 804      `open-parenthesis'         (\\s()
 805      `close-parenthesis'        (\\s))
 806      `expression-prefix'        (\\s')
 807      `string-quote'             (\\s\")
 808      `paired-delimiter'         (\\s$)
 809      `escape'                   (\\s\\)
 810      `character-quote'          (\\s/)
 811      `comment-start'            (\\s<)
 812      `comment-end'              (\\s>)
 813      `string-delimiter'         (\\s|)
 814      `comment-delimiter'        (\\s!)
 815
 816 `(not (syntax SYNTAX))'
 817      matches a character that doesn't have syntax SYNTAX.
 818
 819 `(category CATEGORY)'
 820      matches a character with category CATEGORY.  CATEGORY must be either
 821      a category character C (as in `\\cC') or one of the following symbols.
 822
 823      `consonant'                        (\\c0 in string notation)
 824      `base-vowel'                       (\\c1)
 825      `upper-diacritical-mark'           (\\c2)
 826      `lower-diacritical-mark'           (\\c3)
 827      `tone-mark'                        (\\c4)
 828      `symbol'                           (\\c5)
 829      `digit'                            (\\c6)
 830      `vowel-modifying-diacritical-mark' (\\c7)
 831      `vowel-sign'                       (\\c8)
 832      `semivowel-lower'                  (\\c9)
 833      `not-at-end-of-line'               (\\c<)
 834      `not-at-beginning-of-line'         (\\c>)
 835      `alpha-numeric-two-byte'           (\\cA)
 836      `chinse-two-byte'                  (\\cC)
 837      `greek-two-byte'                   (\\cG)
 838      `japanese-hiragana-two-byte'       (\\cH)
 839      `indian-two-byte'                  (\\cI)
 840      `japanese-katakana-two-byte'       (\\cK)
 841      `korean-hangul-two-byte'           (\\cN)
 842      `cyrillic-two-byte'                (\\cY)
 843      `combining-diacritic'              (\\c^)
 844      `ascii'                            (\\ca)
 845      `arabic'                           (\\cb)
 846      `chinese'                          (\\cc)
 847      `ethiopic'                         (\\ce)
 848      `greek'                            (\\cg)
 849      `korean'                           (\\ch)
 850      `indian'                           (\\ci)
 851      `japanese'                         (\\cj)
 852      `japanese-katakana'                (\\ck)
 853      `latin'                            (\\cl)
 854      `lao'                              (\\co)
 855      `tibetan'                          (\\cq)
 856      `japanese-roman'                   (\\cr)
 857      `thai'                             (\\ct)
 858      `vietnamese'                       (\\cv)
 859      `hebrew'                           (\\cw)
 860      `cyrillic'                         (\\cy)
 861      `can-break'                        (\\c|)
 862
 863 `(not (category CATEGORY))'
 864      matches a character that doesn't have category CATEGORY.
 865
 866 `(and SEXP1 SEXP2 ...)'
 867 `(: SEXP1 SEXP2 ...)'
 868 `(seq SEXP1 SEXP2 ...)'
 869 `(sequence SEXP1 SEXP2 ...)'
 870      matches what SEXP1 matches, followed by what SEXP2 matches, etc.
 871
 872 `(submatch SEXP1 SEXP2 ...)'
 873 `(group SEXP1 SEXP2 ...)'
 874      like `and', but makes the match accessible with `match-end',
 875      `match-beginning', and `match-string'.
 879
 880 `(or SEXP1 SEXP2 ...)'
 881 `(| SEXP1 SEXP2 ...)'
 882      matches anything that matches SEXP1 or SEXP2, etc.  If all
 883      args are strings, use `regexp-opt' to optimize the resulting
 884      regular expression.
 885
 886 `(minimal-match SEXP)'
 887      produce a non-greedy regexp for SEXP.  Normally, regexps matching
 888      zero or more occurrences of something are \"greedy\" in that they
 889      match as much as they can, as long as the overall regexp can
 890      still match.  A non-greedy regexp matches as little as possible.
 891
 892 `(maximal-match SEXP)'
 893      produce a greedy regexp for SEXP.  This is the default.
 894
 895 Below, `SEXP ...' represents a sequence of regexp forms, treated as if
 896 enclosed in `(and ...)'.
 897
 898 `(zero-or-more SEXP ...)'
 899 `(0+ SEXP ...)'
 900      matches zero or more occurrences of what SEXP ... matches.
 901
 902 `(* SEXP ...)'
 903      like `zero-or-more', but always produces a greedy regexp, independent
 904      of `rx-greedy-flag'.
 905
 906 `(*? SEXP ...)'
 907      like `zero-or-more', but always produces a non-greedy regexp,
 908      independent of `rx-greedy-flag'.
 909
 910 `(one-or-more SEXP ...)'
 911 `(1+ SEXP ...)'
 912      matches one or more occurrences of SEXP ...
 913
 914 `(+ SEXP ...)'
 915      like `one-or-more', but always produces a greedy regexp.
 916
 917 `(+? SEXP ...)'
 918      like `one-or-more', but always produces a non-greedy regexp.
 919
 920 `(zero-or-one SEXP ...)'
 921 `(optional SEXP ...)'
 922 `(opt SEXP ...)'
 923      matches zero or one occurrences of SEXP ...
 924
 925 `(? SEXP ...)'
 926      like `zero-or-one', but always produces a greedy regexp.
 927
 928 `(?? SEXP ...)'
 929      like `zero-or-one', but always produces a non-greedy regexp.
 930
 931 `(repeat N SEXP)'
 932 `(= N SEXP ...)'
 933      matches N occurrences.
 934
 935 `(>= N SEXP ...)'
 936      matches N or more occurrences.
 937
 938 `(repeat N M SEXP)'
 939 `(** N M SEXP ...)'
 940      matches N to M occurrences.
 941
 942 `(backref N)'
 943      matches what was matched previously by submatch N.
 950
 951 `(eval FORM)'
 952      evaluate FORM and insert result.  If result is a string,
 953      `regexp-quote' it.
 954
 955 `(regexp REGEXP)'
 956      include REGEXP in string notation in the result."
 957   (cond ((null regexps)
 958          (error "No regexp"))
 959         ((cdr regexps)
 960          (rx-to-string `(and ,@regexps) t))
 961         (t
 962          (rx-to-string (car regexps) t))))
 963 \f
 964 ;; ;; sregex.el replacement
 965
 966 ;; ;;;###autoload (provide 'sregex)
 967 ;; ;;;###autoload (autoload 'sregex "rx")
 968 ;; (defalias 'sregex 'rx-to-string)
 969 ;; ;;;###autoload (autoload 'sregexq "rx" nil nil 'macro)
 970 ;; (defalias 'sregexq 'rx)
 971 \f
 972 (provide 'rx)                       ; announce the `rx' feature to `require'
 973
 974 ;; arch-tag: 12d01a63-0008-42bb-ab8c-1c7d63be370b
 975 ;;; rx.el ends here