Component scanner

You are here: All Systems / cl-ppcre / scanner

;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.28 2007/01/01 23:43:11 edi Exp $

;;; Here the scanner for the actual regex as well as utility scanners
;;; for the constant start and end strings are created.

;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.

;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:

;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.

;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

(in-package #:cl-ppcre)

(defmacro bmh-matcher-aux (&key case-insensitive-p)
  "Auxiliary macro used by CREATE-BMH-MATCHER."
  (let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
    `(lambda (start-pos)
      (declare (type fixnum start-pos))
      (if (or (minusp start-pos)
              (> (the fixnum (+ start-pos m)) *end-pos*)
)

        nil
        (loop named bmh-matcher
              for k of-type fixnum = (+ start-pos m -1)
                then (+ k (max 1 (aref skip (char-code (schar *string* k)))))
              while (< k *end-pos*)
              do (loop for j of-type fixnum downfrom (1- m)
                       for i of-type fixnum downfrom k
                       while (and (>= j 0)
                                  (,char-compare (schar *string* i)
                                                 (schar pattern j)
)
)

                       finally (if (minusp j)
                                 (return-from bmh-matcher (1+ i))
)
)
)
)
)
)
)


(defun create-bmh-matcher (pattern case-insensitive-p)
  (declare #.*standard-optimize-settings*)
  "Returns a Boyer-Moore-Horspool matcher which searches the (special)
simple-string *STRING* for the first occurence of the substring
PATTERN. The search starts at the position START-POS within *STRING*
and stops before *END-POS* is reached. Depending on the second
argument the search is case-insensitive or not. If the special
variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
instead. (BMH matchers are faster but need much more space.)"

  ;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
 ;; details
 (unless *use-bmh-matchers*
    (let ((test (if case-insensitive-p #'char-equal #'char=)))
      (return-from create-bmh-matcher
        (lambda (start-pos)
          (declare (type fixnum start-pos))
          (and (not (minusp start-pos))
               (search pattern
                       *string*
                       :start2 start-pos
                       :end2 *end-pos*
                       :test test
)
)
)
)
)
)

  (let* ((m (length pattern))
         (skip (make-array *regex-char-code-limit*
                          :element-type 'fixnum
                          :initial-element m
)
)
)

    (declare (type fixnum m))
    (loop for k of-type fixnum below m
          if case-insensitive-p
            do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
                     (aref skip (char-code (char-downcase (schar pattern k)))) (- m k 1)
)

          else
            do (setf (aref skip (char-code (schar pattern k))) (- m k 1))
)

    (if case-insensitive-p
      (bmh-matcher-aux :case-insensitive-p t)
      (bmh-matcher-aux)
)
)
)


(defmacro char-searcher-aux (&key case-insensitive-p)
  "Auxiliary macro used by CREATE-CHAR-SEARCHER."
  (let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
    `(lambda (start-pos)
      (declare (type fixnum start-pos))
      (and (not (minusp start-pos))
           (loop for i of-type fixnum from start-pos below *end-pos*
                 thereis (and (,char-compare (schar *string* i) chr) i)
)
)
)
)
)


(defun create-char-searcher (chr case-insensitive-p)
  (declare #.*standard-optimize-settings*)
  "Returns a function which searches the (special) simple-string
*STRING* for the first occurence of the character CHR. The search
starts at the position START-POS within *STRING* and stops before
*END-POS* is reached. Depending on the second argument the search is
case-insensitive or not."

  (if case-insensitive-p
    (char-searcher-aux :case-insensitive-p t)
    (char-searcher-aux)
)
)


(declaim (inline newline-skipper))

(defun newline-skipper (start-pos)
  (declare #.*standard-optimize-settings*)
  (declare (type fixnum start-pos))
  "Find the next occurence of a character in *STRING* which is behind
a #
\Newline."
  ;; we can start with (1- START-POS) without testing for (PLUSP
 ;; START-POS) because we know we'll never call NEWLINE-SKIPPER on
 ;; the first iteration
 (loop for i of-type fixnum from (1- start-pos) below *end-pos*
        thereis (and (char= (schar *string* i)
                            #\Newline
)

                     (1+ i)
)
)
)


(defmacro insert-advance-fn (advance-fn)
  "Creates the actual closure returned by CREATE-SCANNER-AUX by
replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for
ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."

  (subst
   advance-fn '(advance-fn-definition)
   '(lambda (string start end)
     (block scan
       ;; initialize a couple of special variables used by the
      ;; matchers (see file specials.lisp)
      (let* ((*string* string)
              (*start-pos* start)
              (*end-pos* end)
              ;; we will search forward for END-STRING if this value
             ;; isn't at least as big as POS (see ADVANCE-FN), so it
             ;; is safe to start to the left of *START-POS*; note
             ;; that this value will _never_ be decremented - this
             ;; is crucial to the scanning process
             (*end-string-pos* (1- *start-pos*))
              ;; the next five will shadow the variables defined by
             ;; DEFPARAMETER; at this point, we don't know if we'll
             ;; actually use them, though
             (*repeat-counters* *repeat-counters*)
              (*last-pos-stores* *last-pos-stores*)
              (*reg-starts* *reg-starts*)
              (*regs-maybe-start* *regs-maybe-start*)
              (*reg-ends* *reg-ends*)
              ;; we might be able to optimize the scanning process by
             ;; (virtually) shifting *START-POS* to the right
             (scan-start-pos *start-pos*)
              (starts-with-str (if start-string-test
                                 (str starts-with)
                                 nil
)
)

              ;; we don't need to try further than MAX-END-POS
             (max-end-pos (- *end-pos* min-len))
)

         (declare (type fixnum scan-start-pos)
                  (type function match-fn)
)

         ;; definition of ADVANCE-FN will be inserted here by macrology
        (labels ((advance-fn-definition))
           (declare (inline advance-fn))
           (when (plusp rep-num)
             ;; we have at least one REPETITION which needs to count
            ;; the number of repetitions
            (setq *repeat-counters* (make-array rep-num
                                                 :initial-element 0
                                                 :element-type 'fixnum
)
)
)

           (when (plusp zero-length-num)
             ;; we have at least one REPETITION which needs to watch
            ;; out for zero-length repetitions
            (setq *last-pos-stores* (make-array zero-length-num
                                                 :initial-element nil
)
)
)

           (when (plusp reg-num)
             ;; we have registers in our regular expression
            (setq *reg-starts* (make-array reg-num :initial-element nil)
                   *regs-maybe-start* (make-array reg-num :initial-element nil)
                   *reg-ends* (make-array reg-num :initial-element nil)
)
)

           (when end-anchored-p
             ;; the regular expression has a constant end string which
            ;; is anchored at the very end of the target string
            ;; (perhaps modulo a #\Newline)
            (let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
               (declare (type fixnum end-test-pos)
                        (type function end-string-test)
)

               (unless (setq *end-string-pos* (funcall end-string-test
                                                       end-test-pos
)
)

                 (when (and (= 1 (the fixnum end-anchored-p))
                            (> *end-pos* scan-start-pos)
                            (char= #\Newline (schar *string* (1- *end-pos*)))
)

                   ;; if we didn't find an end string candidate from
                  ;; END-TEST-POS and if a #\Newline at the end is
                  ;; allowed we try it again from one position to the
                  ;; left
                  (setq *end-string-pos* (funcall end-string-test
                                                   (1- end-test-pos)
)
)
)
)
)

             (unless (and *end-string-pos*
                          (<= *start-pos* *end-string-pos*)
)

               ;; no end string candidate found, so give up
              (return-from scan nil)
)

             (when end-string-offset
               ;; if the offset of the constant end string from the
              ;; left of the regular expression is known we can start
              ;; scanning further to the right; this is similar to
              ;; what we might do in ADVANCE-FN
              (setq scan-start-pos (max scan-start-pos
                                         (- (the fixnum *end-string-pos*)
                                            (the fixnum end-string-offset)
)
)
)
)
)

             (cond
               (start-anchored-p
                 ;; we're anchored at the start of the target string,
                ;; so no need to try again after first failure
                (when (or (/= *start-pos* scan-start-pos)
                           (< max-end-pos *start-pos*)
)

                   ;; if END-STRING-OFFSET has proven that we don't
                  ;; need to bother to scan from *START-POS* or if the
                  ;; minimal length of the regular expression is
                  ;; longer than the target string we give up
                  (return-from scan nil)
)

                 (when starts-with-str
                   (locally
                     (declare (type fixnum starts-with-len))
                     (cond ((and (case-insensitive-p starts-with)
                                 (not (*string*-equal starts-with-str
                                                      *start-pos*
                                                      (+ *start-pos*
                                                         starts-with-len
)

                                                      0 starts-with-len
)
)
)

                             ;; the regular expression has a
                            ;; case-insensitive constant start string
                            ;; and we didn't find it
                            (return-from scan nil)
)

                           ((and (not (case-insensitive-p starts-with))
                                 (not (*string*= starts-with-str
                                            *start-pos*
                                            (+ *start-pos* starts-with-len)
                                            0 starts-with-len
)
)
)

                             ;; the regular expression has a
                            ;; case-sensitive constant start string
                            ;; and we didn't find it
                            (return-from scan nil)
)

                           (t nil)
)
)
)

                 (when (and end-string-test
                            (not end-anchored-p)
)

                   ;; the regular expression has a constant end string
                  ;; which isn't anchored so we didn't check for it
                  ;; already
                  (block end-string-loop
                     ;; we temporarily use *END-STRING-POS* as our
                    ;; starting position to look for end string
                    ;; candidates
                    (setq *end-string-pos* *start-pos*)
                     (loop
                       (unless (setq *end-string-pos*
                                       (funcall (the function end-string-test)
                                                *end-string-pos*
)
)

                         ;; no end string candidate found, so give up
                        (return-from scan nil)
)

                       (unless end-string-offset
                         ;; end string doesn't have an offset so we
                        ;; can start scanning now
                        (return-from end-string-loop)
)

                       (let ((maybe-start-pos (- (the fixnum *end-string-pos*)
                                                 (the fixnum end-string-offset)
)
)
)

                         (cond ((= maybe-start-pos *start-pos*)
                                 ;; offset of end string into regular
                                ;; expression matches start anchor -
                                ;; fine...
                                (return-from end-string-loop)
)

                               ((and (< maybe-start-pos *start-pos*)
                                     (< (+ *end-string-pos* end-string-len) *end-pos*)
)

                                 ;; no match but maybe we find another
                                ;; one to the right - try again
                                (incf *end-string-pos*)
)

                               (t
                                 ;; otherwise give up
                                (return-from scan nil)
)
)
)
)
)
)

                 ;; if we got here we scan exactly once
                (let ((next-pos (funcall match-fn *start-pos*)))
                   (when next-pos
                     (values (if next-pos *start-pos* nil)
                             next-pos
                             *reg-starts*
                             *reg-ends*
)
)
)
)

               (t
                 (loop for pos = (if starts-with-everything
                                   ;; don't jump to the next
                                  ;; #\Newline on the first
                                  ;; iteration
                                  scan-start-pos
                                   (advance-fn scan-start-pos)
)

                           then (advance-fn pos)
                       ;; give up if the regular expression can't fit
                      ;; into the rest of the target string
                      while (and pos
                                  (<= (the fixnum pos) max-end-pos)
)

                       do (let ((next-pos (funcall match-fn pos)))
                            (when next-pos
                              (return-from scan (values pos
                                                        next-pos
                                                        *reg-starts*
                                                        *reg-ends*
)
)
)

                            ;; not yet found, increment POS
                           #-cormanlisp (incf (the fixnum pos))
                            #+cormanlisp (incf pos)
)
)
)
)
)
)
)
)

    :test #'equalp
)
)


(defun create-scanner-aux (match-fn
                           min-len
                           start-anchored-p
                           starts-with
                           start-string-test
                           end-anchored-p
                           end-string-test
                           end-string-len
                           end-string-offset
                           rep-num
                           zero-length-num
                           reg-num
)

  (declare #.*standard-optimize-settings*)
  (declare (type fixnum min-len zero-length-num rep-num reg-num))
  "Auxiliary function to create and return a scanner \(which is
actually a closure). Used by CREATE-SCANNER."

  (let ((starts-with-len (if (typep starts-with 'str)
                           (len starts-with)
)
)

        (starts-with-everything (typep starts-with 'everything))
)

    (cond
      ;; this COND statement dispatches on the different versions we
     ;; have for ADVANCE-FN and creates different closures for each;
     ;; note that you see only the bodies of ADVANCE-FN below - the
     ;; actual scanner is defined in INSERT-ADVANCE-FN above; (we
     ;; could have done this with closures instead of macrology but
     ;; would have consed a lot more)
     ((and start-string-test end-string-test end-string-offset)
        ;; we know that the regular expression has constant start and
       ;; end strings and we know the end string's offset (from the
       ;; left)
       (insert-advance-fn
          (advance-fn (pos)
            (declare (type fixnum end-string-offset starts-with-len)
                     (type function start-string-test end-string-test)
)

            (loop
              (unless (setq pos (funcall start-string-test pos))
                ;; give up completely if we can't find a start string
               ;; candidate
               (return-from scan nil)
)

              (locally
                ;; from here we know that POS is a FIXNUM
               (declare (type fixnum pos))
                (when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
                  ;; if we already found an end string candidate the
                 ;; position of which matches the start string
                 ;; candidate we're done
                 (return-from advance-fn pos)
)

                (let ((try-pos (+ pos starts-with-len)))
                  ;; otherwise try (again) to find an end string
                 ;; candidate which starts behind the start string
                 ;; candidate
                 (loop
                    (unless (setq *end-string-pos*
                                    (funcall end-string-test try-pos)
)

                      ;; no end string candidate found, so give up
                     (return-from scan nil)
)

                    ;; NEW-POS is where we should start scanning
                   ;; according to the end string candidate
                   (let ((new-pos (- (the fixnum *end-string-pos*)
                                      end-string-offset
)
)
)

                      (declare (type fixnum new-pos *end-string-pos*))
                      (cond ((= new-pos pos)
                              ;; if POS and NEW-POS are equal then the
                             ;; two candidates agree so we're fine
                             (return-from advance-fn pos)
)

                            ((> new-pos pos)
                              ;; if NEW-POS is further to the right we
                             ;; advance POS and try again, i.e. we go
                             ;; back to the start of the outer LOOP
                             (setq pos new-pos)
                              ;; this means "return from inner LOOP"
                             (return)
)

                            (t
                              ;; otherwise NEW-POS is smaller than POS,
                             ;; so we have to redo the inner LOOP to
                             ;; find another end string candidate
                             ;; further to the right
                             (setq try-pos (1+ *end-string-pos*))
)
)
)
)
)
)
)
)
)
)

      ((and starts-with-everything end-string-test end-string-offset)
        ;; we know that the regular expression starts with ".*" (which
       ;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
       ;; with a constant end string and we know the end string's
       ;; offset (from the left)
       (insert-advance-fn
          (advance-fn (pos)
            (declare (type fixnum end-string-offset)
                     (type function end-string-test)
)

            (loop
              (unless (setq pos (newline-skipper pos))
                ;; if we can't find a #\Newline we give up immediately
               (return-from scan nil)
)

              (locally
                ;; from here we know that POS is a FIXNUM
               (declare (type fixnum pos))
                (when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
                  ;; if we already found an end string candidate the
                 ;; position of which matches the place behind the
                 ;; #\Newline we're done
                 (return-from advance-fn pos)
)

                (let ((try-pos pos))
                  ;; otherwise try (again) to find an end string
                 ;; candidate which starts behind the #\Newline
                 (loop
                    (unless (setq *end-string-pos*
                                    (funcall end-string-test try-pos)
)

                      ;; no end string candidate found, so we give up
                     (return-from scan nil)
)

                    ;; NEW-POS is where we should start scanning
                   ;; according to the end string candidate
                   (let ((new-pos (- (the fixnum *end-string-pos*)
                                      end-string-offset
)
)
)

                      (declare (type fixnum new-pos *end-string-pos*))
                      (cond ((= new-pos pos)
                              ;; if POS and NEW-POS are equal then the
                             ;; the end string candidate agrees with
                             ;; the #\Newline so we're fine
                             (return-from advance-fn pos)
)

                            ((> new-pos pos)
                              ;; if NEW-POS is further to the right we
                             ;; advance POS and try again, i.e. we go
                             ;; back to the start of the outer LOOP
                             (setq pos new-pos)
                              ;; this means "return from inner LOOP"
                             (return)
)

                            (t
                              ;; otherwise NEW-POS is smaller than POS,
                             ;; so we have to redo the inner LOOP to
                             ;; find another end string candidate
                             ;; further to the right
                             (setq try-pos (1+ *end-string-pos*))
)
)
)
)
)
)
)
)
)
)

      ((and start-string-test end-string-test)
        ;; we know that the regular expression has constant start and
       ;; end strings; similar to the first case but we only need to
       ;; check for the end string, it doesn't provide enough
       ;; information to advance POS
       (insert-advance-fn
          (advance-fn (pos)
            (declare (type function start-string-test end-string-test))
            (unless (setq pos (funcall start-string-test pos))
              (return-from scan nil)
)

&