Component closures

You are here: All Systems / cl-ppcre / closures

;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/closures.lisp,v 1.34 2007/01/15 23:57:41 edi Exp $

;;; Here we create the closures which together build the final
;;; scanner.

;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.

;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:

;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.

;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

(in-package #:cl-ppcre)

(declaim (inline *string*= *string*-equal))

(defun *string*= (string2 start1 end1 start2 end2)
  "Like STRING=, i.e. compares the special string *STRING* from START1
to END1 with STRING2 from START2 to END2. Note that there's no
boundary check - this has to be implemented by the caller."

  (declare #.*standard-optimize-settings*)
  (declare (type fixnum start1 end1 start2 end2))
  (loop for string1-idx of-type fixnum from start1 below end1
        for string2-idx of-type fixnum from start2 below end2
        always (char= (schar *string* string1-idx)
                      (schar string2 string2-idx)
)
)
)


(defun *string*-equal (string2 start1 end1 start2 end2)
  "Like STRING-EQUAL, i.e. compares the special string *STRING* from
START1 to END1 with STRING2 from START2 to END2. Note that there's no
boundary check - this has to be implemented by the caller."

  (declare #.*standard-optimize-settings*)
  (declare (type fixnum start1 end1 start2 end2))
  (loop for string1-idx of-type fixnum from start1 below end1
        for string2-idx of-type fixnum from start2 below end2
        always (char-equal (schar *string* string1-idx)
                           (schar string2 string2-idx)
)
)
)


(defgeneric create-matcher-aux (regex next-fn)
  (declare #.*standard-optimize-settings*)
  (:documentation "Creates a closure which takes one parameter,
START-POS, and tests whether REGEX can match *STRING* at START-POS
such that the call to NEXT-FN after the match would succeed."
)
)

                
(defmethod create-matcher-aux ((seq seq) next-fn)
  (declare #.*standard-optimize-settings*)
  ;; the closure for a SEQ is a chain of closures for the elements of
 ;; this sequence which call each other in turn; the last closure
 ;; calls NEXT-FN
 (loop for element in (reverse (elements seq))
        for curr-matcher = next-fn then next-matcher
        for next-matcher = (create-matcher-aux element curr-matcher)
        finally (return next-matcher)
)
)


(defmethod create-matcher-aux ((alternation alternation) next-fn)
  (declare #.*standard-optimize-settings*)
  ;; first create closures for all alternations of ALTERNATION
 (let ((all-matchers (mapcar #'(lambda (choice)
                                  (create-matcher-aux choice next-fn)
)

                              (choices alternation)
)
)
)

    ;; now create a closure which checks if one of the closures
   ;; created above can succeed
   (lambda (start-pos)
      (declare (type fixnum start-pos))
      (loop for matcher in all-matchers
            thereis (funcall (the function matcher) start-pos)
)
)
)
)


(defmethod create-matcher-aux ((register register) next-fn)
  (declare #.*standard-optimize-settings*)
  ;; the position of this REGISTER within the whole regex; we start to
 ;; count at 0
 (let ((num (num register)))
    (declare (type fixnum num))
    ;; STORE-END-OF-REG is a thin wrapper around NEXT-FN which will
   ;; update the corresponding values of *REGS-START* and *REGS-END*
   ;; after the inner matcher has succeeded
   (flet ((store-end-of-reg (start-pos)
               (declare (type fixnum start-pos)
                        (type function next-fn)
)

               (setf (svref *reg-starts* num) (svref *regs-maybe-start* num)
                     (svref *reg-ends* num) start-pos
)

           (funcall next-fn start-pos)
)
)

      ;; the inner matcher is a closure corresponding to the regex
     ;; wrapped by this REGISTER
     (let ((inner-matcher (create-matcher-aux (regex register)
                                               #'store-end-of-reg
)
)
)

        (declare (type function inner-matcher))
        ;; here comes the actual closure for REGISTER
       (lambda (start-pos)
          (declare (type fixnum start-pos))
          ;; remember the old values of *REGS-START* and friends in
         ;; case we cannot match
         (let ((old-*reg-starts* (svref *reg-starts* num))
                (old-*regs-maybe-start* (svref *regs-maybe-start* num))
                (old-*reg-ends* (svref *reg-ends* num))
)

            ;; we cannot use *REGS-START* here because Perl allows
           ;; regular expressions like /(a|\1x)*/
           (setf (svref *regs-maybe-start* num) start-pos)
            (let ((next-pos (funcall inner-matcher start-pos)))
              (unless next-pos
                ;; restore old values on failure
               (setf (svref *reg-starts* num) old-*reg-starts*
                      (svref *regs-maybe-start* num) old-*regs-maybe-start*
                      (svref *reg-ends* num) old-*reg-ends*
)
)

              next-pos
)
)
)
)
)
)
)


(defmethod create-matcher-aux ((lookahead lookahead) next-fn)
  (declare #.*standard-optimize-settings*)
  ;; create a closure which just checks for the inner regex and
 ;; doesn't care about NEXT-FN
 (let ((test-matcher (create-matcher-aux (regex lookahead) #'identity)))
    (declare (type function next-fn test-matcher))
    (if (positivep lookahead)
      ;; positive look-ahead: check success of inner regex, then call
     ;; NEXT-FN
     (lambda (start-pos)
        (and (funcall test-matcher start-pos)
             (funcall next-fn start-pos)
)
)

      ;; negative look-ahead: check failure of inner regex, then call
     ;; NEXT-FN
     (lambda (start-pos)
        (and (not (funcall test-matcher start-pos))
             (funcall next-fn start-pos)
)
)
)
)
)


(defmethod create-matcher-aux ((lookbehind lookbehind) next-fn)
  (declare #.*standard-optimize-settings*)
  (let ((len (len lookbehind))
        ;; create a closure which just checks for the inner regex and
       ;; doesn't care about NEXT-FN
       (test-matcher (create-matcher-aux (regex lookbehind) #'identity))
)

    (declare (type function next-fn test-matcher)
             (type fixnum len)
)

    (if (positivep lookbehind)
      ;; positive look-behind: check success of inner regex (if we're
     ;; far enough from the start of *STRING*), then call NEXT-FN
     (lambda (start-pos)
        (declare (type fixnum start-pos))
        (and (>= (- start-pos (or *real-start-pos* *start-pos*)) len)
             (funcall test-matcher (- start-pos len))
             (funcall next-fn start-pos)
)
)

      ;; negative look-behind: check failure of inner regex (if we're
     ;; far enough from the start of *STRING*), then call NEXT-FN
     (lambda (start-pos)
        (declare (type fixnum start-pos))
        (and (or (< (- start-pos (or *real-start-pos* *start-pos*)) len)
                 (not (funcall test-matcher (- start-pos len)))
)

             (funcall next-fn start-pos)
)
)
)
)
)


(defmacro insert-char-class-tester ((char-class chr-expr) &body body)
  "Utility macro to replace each occurence of '(CHAR-CLASS-TEST)
within BODY with the correct test (corresponding to CHAR-CLASS)
against CHR-EXPR."

  (with-unique-names (%char-class)
    ;; the actual substitution is done here: replace
   ;; '(CHAR-CLASS-TEST) with NEW
   (flet ((substitute-char-class-tester (new)
               (subst new '(char-class-test) body
                      :test #'equalp
)
)
)

      `(let* ((,%char-class ,char-class)
              (hash (hash ,%char-class))
              (count (if hash
                       (hash-table-count hash)
                       most-positive-fixnum
)
)

              ;; collect a list of "all" characters in the hash if
             ;; there aren't more than two
             (key-list (if (<= count 2)
                          (loop for chr being the hash-keys of hash
                                collect chr
)

                          nil
)
)

              downcasedp
)

        (declare (type fixnum count))
        ;; check if we can partition the hash into three ranges (or
       ;; less)
       (multiple-value-bind (min1 max1 min2 max2 min3 max3)
            (create-ranges-from-hash hash)
          ;; if that didn't work and CHAR-CLASS is case-insensitive we
         ;; try it again with every character downcased
         (when (and (not min1)
                     (case-insensitive-p ,%char-class)
)

            (multiple-value-setq (min1 max1 min2 max2 min3 max3)
              (create-ranges-from-hash hash :downcasep t)
)

            (setq downcasedp t)
)

          (cond ((= count 1)
                  ;; hash contains exactly one character so we just
                 ;; check for this single character; (note that this
                 ;; actually can't happen because this case is
                 ;; optimized away in CONVERT already...)
                 (let ((chr1 (first key-list)))
                    ,@(substitute-char-class-tester
                       `(char= ,chr-expr chr1)
)
)
)

                ((= count 2)
                  ;; hash contains exactly two characters
                 (let ((chr1 (first key-list))
                        (chr2 (second key-list))
)

                    ,@(substitute-char-class-tester
                       `(let ((chr ,chr-expr))
                         (or (char= chr chr1)
                             (char= chr chr2)
)
)
)
)
)

                ((word-char-class-p ,%char-class)
                  ;; special-case: hash is \w, \W, [\w], [\W] or
                 ;; something equivalent
                 ,@(substitute-char-class-tester
                     `(word-char-p ,chr-expr)
)
)

                ((= count *regex-char-code-limit*)
                  ;; according to the ANSI standard we might have all
                 ;; possible characters in the hash even if it
                 ;; doesn't contain CHAR-CODE-LIMIT characters but
                 ;; this doesn't seem to be the case for current
                 ;; implementations (also note that this optimization
                 ;; implies that you must not have characters with
                 ;; character codes beyond *REGEX-CHAR-CODE-LIMIT* in
                 ;; your regexes if you've changed this limit); we
                 ;; expect the compiler to optimize this T "test"
                 ;; away
                 ,@(substitute-char-class-tester t)
)

                ((and downcasedp min1 min2 min3)
                  ;; three different ranges, downcased
                 ,@(substitute-char-class-tester
                     `(let ((chr ,chr-expr))
                       (or (char-not-greaterp min1 chr max1)
                           (char-not-greaterp min2 chr max2)
                           (char-not-greaterp min3 chr max3)
)
)
)
)

                ((and downcasedp min1 min2)
                  ;; two ranges, downcased
                 ,@(substitute-char-class-tester
                     `(let ((chr ,chr-expr))
                       (or (char-not-greaterp min1 chr max1)
                           (char-not-greaterp min2 chr max2)
)
)
)
)

                ((and downcasedp min1)
                  ;; one downcased range
                 ,@(substitute-char-class-tester
                     `(char-not-greaterp min1 ,chr-expr max1)
)
)

                ((and min1 min2 min3)
                  ;; three ranges
                 ,@(substitute-char-class-tester
                     `(let ((chr ,chr-expr))
                       (or (char<= min1 chr max1)
                           (char<= min2 chr max2)
                           (char<= min3 chr max3)
)
)
)
)

                ((and min1 min2)
                  ;; two ranges
                 ,@(substitute-char-class-tester
                     `(let ((chr ,chr-expr))
                       (or (char<= min1 chr max1)
                           (char<= min2 chr max2)
)
)
)
)

                (min1
                  ;; one range
                 ,@(substitute-char-class-tester
                     `(char<= min1 ,chr-expr max1)
)
)

                (t
                  ;; the general case; note that most of the above
                 ;; "optimizations" are based on experiences and
                 ;; benchmarks with CMUCL - if you're really
                 ;; concerned with speed you might find out that the
                 ;; general case is almost always the best one for
                 ;; other implementations (because the speed of their
                 ;; hash-table access in relation to other operations
                 ;; might be better than in CMUCL)
                 ,@(substitute-char-class-tester
                     `(gethash ,chr-expr hash)
)
)
)
)
)
)
)
)


(defmethod create-matcher-aux ((char-class char-class) next-fn)
  (declare #.*standard-optimize-settings*)
  (declare (type function next-fn))
  ;; insert a test against the current character within *STRING*
 (insert-char-class-tester (char-class (schar *string* start-pos))
    (if (invertedp char-class)
      (lambda (start-pos)
        (declare (type fixnum start-pos))
        (and (< start-pos *end-pos*)
             (not (char-class-test))
             (funcall next-fn (1+ start-pos))
)
)

      (lambda (start-pos)
        (declare (type fixnum start-pos))
        (and (< start-pos *end-pos*)
             (char-class-test)
             (funcall next-fn (1+ start-pos))
)
)
)
)
)


(defmethod create-matcher-aux ((str str) next-fn)
  (declare #.*standard-optimize-settings*)
  (declare (type fixnum *end-string-pos*)
           (type function next-fn)
           ;; this special value is set by CREATE-SCANNER when the
          ;; closures are built
          (special end-string)
)

  (let* ((len (len str))
         (case-insensitive-p (case-insensitive-p str))
         (start-of-end-string-p (start-of-end-string-p str))
         (skip (skip str))
         (str (str str))
         (chr (schar str 0))
         (end-string (and end-string (str end-string)))
         (end-string-len (if end-string
                           (length end-string)
                           nil
)
)
)

    (declare (type fixnum len))
    (cond ((and start-of-end-string-p case-insensitive-p)
            ;; closure for the first STR which belongs to the constant
           ;; string at the end of the regular expression;
           ;; case-insensitive version
           (lambda (start-pos)
              (declare (type fixnum start-pos end-string-len))
              (let ((test-end-pos (+ start-pos end-string-len)))
                (declare (type fixnum test-end-pos))
                ;; either we're at *END-STRING-POS* (which means that
               ;; it has already been confirmed that end-string
               ;; starts here) or we really have to test
               (and (or (= start-pos *end-string-pos*)
                         (and (<= test-end-pos *end-pos*)
                              (*string*-equal end-string start-pos test-end-pos
                                              0 end-string-len
)
)
)

                     (funcall next-fn (+ start-pos len))
)
)
)
)

          (start-of-end-string-p
            ;; closure for the first STR which belongs to the constant
           ;; string at the end of the regular expression;
           ;; case-sensitive version
           (lambda (start-pos)
              (declare (type fixnum start-pos end-string-len))
              (let ((test-end-pos (+ start-pos end-string-len)))
                (declare (type fixnum test-end-pos))
                ;; either we're at *END-STRING-POS* (which means that
               ;; it has already been confirmed that end-string
               ;; starts here) or we really have to test
               (and (or (= start-pos *end-string-pos*)
                         (and (<= test-end-pos *end-pos*)
                              (*string*= end-string start-pos test-end-pos
                                         0 end-string-len
)
)
)

                     (funcall next-fn (+ start-pos len))
)
)
)
)

          (skip
            ;; a STR which can be skipped because some other function
           ;; has already confirmed that it matches
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (funcall next-fn (+ start-pos len))
)
)

          ((and (= len 1) case-insensitive-p)
            ;; STR represent exactly one character; case-insensitive
           ;; version
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (and (< start-pos *end-pos*)
                   (char-equal (schar *string* start-pos) chr)
                   (funcall next-fn (1+ start-pos))
)
)
)

          ((= len 1)
            ;; STR represent exactly one character; case-sensitive
           ;; version
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (and (< start-pos *end-pos*)
                   (char= (schar *string* start-pos) chr)
                   (funcall next-fn (1+ start-pos))
)
)
)

          (case-insensitive-p
            ;; general case, case-insensitive version
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (let ((next-pos (+ start-pos len)))
                (declare (type fixnum next-pos))
                (and (<= next-pos *end-pos*)
                     (*string*-equal str start-pos next-pos 0 len)
                     (funcall next-fn next-pos)
)
)
)
)

          (t
            ;; general case, case-sensitive version
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (let ((next-pos (+ start-pos len)))
                (declare (type fixnum next-pos))
                (and (<= next-pos *end-pos*)
                     (*string*= str start-pos next-pos 0 len)
                     (funcall next-fn next-pos)
)
)
)
)
)
)
)


(declaim (inline word-boundary-p))

(defun word-boundary-p (start-pos)
  "Check whether START-POS is a word-boundary within *STRING*."
  (declare #.*standard-optimize-settings*)
  (declare (type fixnum start-pos))
  (let ((1-start-pos (1- start-pos))
        (*start-pos* (or *real-start-pos* *start-pos*))
)

    ;; either the character before START-POS is a word-constituent and
   ;; the character at START-POS isn't...
   (or (and (or (= start-pos *end-pos*)
                 (and (< start-pos *end-pos*)
                      (not (word-char-p (schar *string* start-pos)))
)
)

             (and (< 1-start-pos *end-pos*)
                  (<= *start-pos* 1-start-pos)
                  (word-char-p (schar *string* 1-start-pos))
)
)

        ;; ...or vice versa
       (and (or (= start-pos *start-pos*)
                 (and (< 1-start-pos *end-pos*)
                      (<= *start-pos* 1-start-pos)
                      (not (word-char-p (schar *string* 1-start-pos)))
)
)

             (and (< start-pos *end-pos*)
                  (word-char-p (schar *string* start-pos))
)
)
)
)
)


(defmethod create-matcher-aux ((word-boundary word-boundary) next-fn)
  (declare #.*standard-optimize-settings*)
  (declare (type function next-fn))
  (if (negatedp word-boundary)
    (lambda (start-pos)
      (and (not (word-boundary-p start-pos))
           (funcall next-fn start-pos)
)
)

    (lambda (start-pos)
      (and (word-boundary-p start-pos)
           (funcall next-fn start-pos)
)
)
)
)


(defmethod create-matcher-aux ((everything everything) next-fn)
  (declare #.*standard-optimize-settings*)
  (declare (type function next-fn))
  (if (single-line-p everything)
    ;; closure for single-line-mode: we really match everything, so we
   ;; just advance the index into *STRING* by one and carry on
   (lambda (start-pos)
      (declare (type fixnum start-pos))
      (and (< start-pos *end-pos*)
           (funcall next-fn (1+ start-pos))
)
)

    ;; not single-line-mode, so we have to make sure we don't match
   ;; #\Newline
   (lambda (start-pos)
      (declare (type fixnum start-pos))
      (and (< start-pos *end-pos*)
           (char/= (schar *string* start-pos) #\Newline)
           (funcall next-fn (1+ start-pos))
)
)
)
)


(defmethod create-matcher-aux ((anchor anchor) next-fn)
  (declare #.*standard-optimize-settings*)
  (declare (type function next-fn))
  (let ((startp (startp anchor))
        (multi-line-p (multi-line-p anchor))
)

    (cond ((no-newline-p anchor)
            ;; this must be an end-anchor and it must be modeless, so
           ;; we just have to check whether START-POS equals
           ;; *END-POS*
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (and (= start-pos *end-pos*)
                   (funcall next-fn start-pos)
)
)
)

          ((and startp multi-line-p)
            ;; a start-anchor in multi-line-mode: check if we're at
           ;; *START-POS* or if the last character was #\Newline
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (let ((*start-pos* (or *real-start-pos* *start-pos*)))
                (and (or (= start-pos *start-pos*)
                         (and (<= start-pos *end-pos*)
                              (> start-pos *start-pos*)
                              (char= #\Newline
                                     (schar *string* (1- start-pos))
)
)
)

                     (funcall next-fn start-pos)
)
)
)
)

          (startp
            ;; a start-anchor which is not in multi-line-mode, so just
           ;; check whether we're at *START-POS*
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (and (= start-pos (or *real-start-pos* *start-pos*))
                   (funcall next-fn start-pos)
)
)
)

          (multi-line-p
            ;; an end-anchor in multi-line-mode: check if we're at
           ;; *END-POS* or if the character we're looking at is
           ;; #\Newline
           (lambda (start-pos)
              (declare (type fixnum start-pos))
              (and (or (= start-pos *end-pos*)
                       (and (< start-pos *end-pos*)
                            (char= #\Newline