Component convert

You are here: All Systems / cl-ppcre / convert

;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
;;; $Header: /usr/local/cvsrep/cl-ppcre/convert.lisp,v 1.25 2007/01/01 23:43:10 edi Exp $

;;; Here the parse tree is converted into its internal representation
;;; using REGEX objects.  At the same time some optimizations are
;;; already applied.

;;; Copyright (c) 2002-2007, Dr. Edmund Weitz. All rights reserved.

;;; Redistribution and use in source and binary forms, with or without
;;; modification, are permitted provided that the following conditions
;;; are met:

;;;   * Redistributions of source code must retain the above copyright
;;;     notice, this list of conditions and the following disclaimer.

;;;   * Redistributions in binary form must reproduce the above
;;;     copyright notice, this list of conditions and the following
;;;     disclaimer in the documentation and/or other materials
;;;     provided with the distribution.

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

(in-package #:cl-ppcre)

;;; The flags that represent the "ism" modifiers are always kept
;;; together in a three-element list. We use the following macros to
;;; access individual elements.

(defmacro case-insensitive-mode-p (flags)
  "Accessor macro to extract the first flag out of a three-element flag list."
  `(first ,flags)
)


(defmacro multi-line-mode-p (flags)
  "Accessor macro to extract the second flag out of a three-element flag list."
  `(second ,flags)
)


(defmacro single-line-mode-p (flags)
  "Accessor macro to extract the third flag out of a three-element flag list."
  `(third ,flags)
)


(defun set-flag (token)
  (declare #.*standard-optimize-settings*)
  (declare (special flags))
  "Reads a flag token and sets or unsets the corresponding entry in
the special FLAGS list."

  (case token
    ((:case-insensitive-p)
      (setf (case-insensitive-mode-p flags) t)
)

    ((:case-sensitive-p)
      (setf (case-insensitive-mode-p flags) nil)
)

    ((:multi-line-mode-p)
      (setf (multi-line-mode-p flags) t)
)

    ((:not-multi-line-mode-p)
      (setf (multi-line-mode-p flags) nil)
)

    ((:single-line-mode-p)
      (setf (single-line-mode-p flags) t)
)

    ((:not-single-line-mode-p)
      (setf (single-line-mode-p flags) nil)
)

    (otherwise
      (signal-ppcre-syntax-error "Unknown flag token ~A" token)
)
)
)


(defun add-range-to-hash (hash from to)
  (declare #.*standard-optimize-settings*)
  (declare (special flags))
  "Adds all characters from character FROM to character TO (inclusive)
to the char class hash HASH. Does the right thing with respect to
case-(in)sensitivity as specified by the special variable FLAGS."

  (let ((from-code (char-code from))
        (to-code (char-code to))
)

    (when (> from-code to-code)
      (signal-ppcre-syntax-error "Invalid range from ~A to ~A in char-class"
                                 from to
)
)

    (cond ((case-insensitive-mode-p flags)
            (loop for code from from-code to to-code
                  for chr = (code-char code)
                  do (setf (gethash (char-upcase chr) hash) t
                           (gethash (char-downcase chr) hash) t
)
)
)

          (t
            (loop for code from from-code to to-code
                  do (setf (gethash (code-char code) hash) t)
)
)
)

    hash
)
)


(defun convert-char-class-to-hash (list)
  (declare #.*standard-optimize-settings*)
  "Combines all items in LIST into one char class hash and returns it.
Items can be single characters, character ranges like
\(:RANGE #\\A
#
\\E), or special character classes like :DIGIT-CLASS. Does the right
thing with respect to case-
\(in)sensitivity as specified by the
special variable FLAGS."

  (loop with hash = (make-hash-table :size (ceiling (expt *regex-char-code-limit* (/ 1 4)))
                                     :rehash-size (float (expt *regex-char-code-limit* (/ 1 4)))
                                     :rehash-threshold #-genera 1.0 #+genera 0.99
)

        for item in list
        if (characterp item)
          ;; treat a single character C like a range (:RANGE C C)
         do (add-range-to-hash hash item item)
        else if (symbolp item)
          ;; special character classes
         do (setq hash
                     (case item
                       ((:digit-class)
                         (merge-hash hash +digit-hash+)
)

                       ((:non-digit-class)
                         (merge-inverted-hash hash +digit-hash+)
)

                       ((:whitespace-char-class)
                         (merge-hash hash +whitespace-char-hash+)
)

                       ((:non-whitespace-char-class)
                         (merge-inverted-hash hash +whitespace-char-hash+)
)

                       ((:word-char-class)
                         (merge-hash hash +word-char-hash+)
)

                       ((:non-word-char-class)
                         (merge-inverted-hash hash +word-char-hash+)
)

                       (otherwise
                         (signal-ppcre-syntax-error
                          "Unknown symbol ~A in character class"
                          item
)
)
)
)

        else if (and (consp item)
                     (eq (car item) :range)
)

          ;; proper ranges
         do (add-range-to-hash hash
                                (second item)
                                (third item)
)

        else do (signal-ppcre-syntax-error "Unknown item ~A in char-class list"
                                           item
)

        finally (return hash)
)
)


(defun maybe-split-repetition (regex
                               greedyp
                               minimum
                               maximum
                               min-len
                               length
                               reg-seen
)

  (declare #.*standard-optimize-settings*)
  (declare (type fixnum minimum)
           (type (or fixnum null) maximum)
)

  "Splits a REPETITION object into a constant and a varying part if
applicable, i.e. something like
  a{3,} -> a{3}a*
The arguments to this function correspond to the REPETITION slots of
the same name."

  ;; note the usage of COPY-REGEX here; we can't use the same REGEX
 ;; object in both REPETITIONS because they will have different
 ;; offsets
 (when maximum
    (when (zerop maximum)
      ;; trivial case: don't repeat at all
     (return-from maybe-split-repetition
        (make-instance 'void)
)
)

    (when (= 1 minimum maximum)
      ;; another trivial case: "repeat" exactly once
     (return-from maybe-split-repetition
        regex
)
)
)

  ;; first set up the constant part of the repetition
 ;; maybe that's all we need
 (let ((constant-repetition (if (plusp minimum)
                               (make-instance 'repetition
                                              :regex (copy-regex regex)
                                              :greedyp greedyp
                                              :minimum minimum
                                              :maximum minimum
                                              :min-len min-len
                                              :len length
                                              :contains-register-p reg-seen
)

                               ;; don't create garbage if minimum is 0
                              nil
)
)
)

    (when (and maximum
               (= maximum minimum)
)

      (return-from maybe-split-repetition
        ;; no varying part needed because min = max
       constant-repetition
)
)

    ;; now construct the varying part
   (let ((varying-repetition
            (make-instance 'repetition
                           :regex regex
                           :greedyp greedyp
                           :minimum 0
                           :maximum (if maximum (- maximum minimum) nil)
                           :min-len min-len
                           :len length
                           :contains-register-p reg-seen
)
)
)

      (cond ((zerop minimum)
              ;; min = 0, no constant part needed
             varying-repetition
)

            ((= 1 minimum)
              ;; min = 1, constant part needs no REPETITION wrapped around
             (make-instance 'seq
                             :elements (list (copy-regex regex)
                                             varying-repetition
)
)
)

            (t
              ;; general case
             (make-instance 'seq
                             :elements (list constant-repetition
                                             varying-repetition
)
)
)
)
)
)
)


;; During the conversion of the parse tree we keep track of the start
;; of the parse tree in the special variable STARTS-WITH which'll
;; either hold a STR object or an EVERYTHING object. The latter is the
;; case if the regex starts with ".*" which implicitely anchors the
;; regex at the start (perhaps modulo #\Newline).

(defun maybe-accumulate (str)
  (declare #.*standard-optimize-settings*)
  (declare (special accumulate-start-p starts-with))
  (declare (ftype (function (t) fixnum) len))
  "Accumulate STR into the special variable STARTS-WITH if
ACCUMULATE-START-P (also special) is true and STARTS-WITH is either
NIL or a STR object of the same case mode. Always returns NIL."

  (when accumulate-start-p
    (etypecase starts-with
      (str
        ;; STARTS-WITH already holds a STR, so we check if we can
       ;; concatenate
       (cond ((eq (case-insensitive-p starts-with)
                   (case-insensitive-p str)
)

                ;; we modify STARTS-WITH in place
               (setf (len starts-with)
                        (+ (len starts-with) (len str))
)

                ;; note that we use SLOT-VALUE because the accessor
               ;; STR has a declared FTYPE which doesn't fit here
               (adjust-array (slot-value starts-with 'str)
                              (len starts-with)
                              :fill-pointer t
)

                (setf (subseq (slot-value starts-with 'str)
                              (- (len starts-with) (len str))
)

                        (str str)
                      ;; STR objects that are parts of STARTS-WITH
                     ;; always have their SKIP slot set to true
                     ;; because the SCAN function will take care of
                     ;; them, i.e. the matcher can ignore them
                     (skip str) t
)
)

              (t (setq accumulate-start-p nil))
)
)

      (null
        ;; STARTS-WITH is still empty, so we create a new STR object
       (setf starts-with
                (make-instance 'str
                               :str ""
                               :case-insensitive-p (case-insensitive-p str)
)

              ;; INITIALIZE-INSTANCE will coerce the STR to a simple
             ;; string, so we have to fill it afterwards
             (slot-value starts-with 'str)
                (make-array (len str)
                            :initial-contents (str str)
                            :element-type 'character
                            :fill-pointer t
                            :adjustable t
)

              (len starts-with)
                (len str)
              ;; see remark about SKIP above
             (skip str) t
)
)

      (everything
        ;; STARTS-WITH already holds an EVERYTHING object - we can't
       ;; concatenate
       (setq accumulate-start-p nil)
)
)
)

  nil
)


(defun convert-aux (parse-tree)
  (declare #.*standard-optimize-settings*)
  (declare (special flags reg-num accumulate-start-p starts-with max-back-ref))
  "Converts the parse tree PARSE-TREE into a REGEX object and returns it.

Will also
  - split and optimize repetitions,
  - accumulate strings or EVERYTHING objects into the special variable
    STARTS-WITH,
  - keep track of all registers seen in the special variable REG-NUM,
  - keep track of the highest backreference seen in the special
    variable MAX-BACK-REF,
  - maintain and adher to the currently applicable modifiers in the special
    variable FLAGS, and
  - maybe even wash your car..."

  (cond ((consp parse-tree)
          (case (first parse-tree)
            ;; (:SEQUENCE {<regex>}*)
           ((:sequence)
              (cond ((cddr parse-tree)
                      ;; this is essentially like
                     ;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE))
                     ;; but we don't cons a new list
                     (loop for parse-tree-rest on (rest parse-tree)
                            while parse-tree-rest
                            do (setf (car parse-tree-rest)
                                       (convert-aux (car parse-tree-rest))
)
)

                      (make-instance 'seq
                                     :elements (rest parse-tree)
)
)

                    (t (convert-aux (second parse-tree)))
)
)

            ;; (:GROUP {<regex>}*)
           ;; this is a syntactical construct equivalent to :SEQUENCE
           ;; intended to keep the effect of modifiers local
           ((:group)
              ;; make a local copy of FLAGS and shadow the global
             ;; value while we descend into the enclosed regexes
             (let ((flags (copy-list flags)))
                (declare (special flags))
                (cond ((cddr parse-tree)
                        (loop for parse-tree-rest on (rest parse-tree)
                              while parse-tree-rest
                              do (setf (car parse-tree-rest)
                                         (convert-aux (car parse-tree-rest))
)
)

                        (make-instance 'seq
                                       :elements (rest parse-tree)
)
)

                      (t (convert-aux (second parse-tree)))
)
)
)

            ;; (:ALTERNATION {<regex>}*)
           ((:alternation)
              ;; we must stop accumulating objects into STARTS-WITH
             ;; once we reach an alternation
             (setq accumulate-start-p nil)
              (loop for parse-tree-rest on (rest parse-tree)
                    while parse-tree-rest
                    do (setf (car parse-tree-rest)
                               (convert-aux (car parse-tree-rest))
)
)

              (make-instance 'alternation
                             :choices (rest parse-tree)
)
)

            ;; (:BRANCH <test> <regex>)
           ;; <test> must be look-ahead, look-behind or number;
           ;; if <regex> is an alternation it must have one or two
           ;; choices
           ((:branch)
              (setq accumulate-start-p nil)
              (let* ((test-candidate (second parse-tree))
                     (test (cond ((numberp test-candidate)
                                   (when (zerop (the fixnum test-candidate))
                                     (signal-ppcre-syntax-error
                                      "Register 0 doesn't exist: ~S"
                                      parse-tree
)
)

                                   (1- (the fixnum test-candidate))
)

                                 (t (convert-aux test-candidate))
)
)

                     (alternations (convert-aux (third parse-tree)))
)

                (when (and (not (numberp test))
                           (not (typep test 'lookahead))
                           (not (typep test 'lookbehind))
)

                  (signal-ppcre-syntax-error
                   "Branch test must be look-ahead, look-behind or number: ~S"
                   parse-tree
)
)

                (typecase alternations
                  (alternation
                    (case (length (choices alternations))
                      ((0)
                        (signal-ppcre-syntax-error "No choices in branch: ~S"
                                                   parse-tree
)
)

                      ((1)
                        (make-instance 'branch
                                       :test test
                                       :then-regex (first
                                                    (choices alternations)
)
)
)

                      ((2)
                        (make-instance 'branch
                                       :test test
                                       :then-regex (first
                                                    (choices alternations)
)

                                       :else-regex (second
                                                    (choices alternations)
)
)
)

                      (otherwise
                        (signal-ppcre-syntax-error
                         "Too much choices in branch: ~S"
                         parse-tree
)
)
)
)

                  (t
                    (make-instance 'branch
                                   :test test
                                   :then-regex alternations
)
)
)
)
)

            ;; (:POSITIVE-LOOKAHEAD|:NEGATIVE-LOOKAHEAD <regex>)
           ((:positive-lookahead :negative-lookahead)
              ;; keep the effect of modifiers local to the enclosed
             ;; regex and stop accumulating into STARTS-WITH
             (setq accumulate-start-p nil)
              (let ((flags (copy-list flags)))
                (declare (special flags))
                (make-instance 'lookahead
                               :regex (convert-aux (second parse-tree))
                               :positivep (eq (first parse-tree)
                                              :positive-lookahead
)
)
)
)

            ;; (:POSITIVE-LOOKBEHIND|:NEGATIVE-LOOKBEHIND <regex>)
           ((:positive-lookbehind :negative-lookbehind)
              ;; keep the effect of modifiers local to the enclosed
             ;; regex and stop accumulating into STARTS-WITH
             (setq accumulate-start-p nil)
              (let* ((flags (copy-list flags))
                     (regex (convert-aux (second parse-tree)))
                     (len (regex-length regex))
)

                (declare (special flags))
                ;; lookbehind assertions must be of fixed length
               (unless len
                  (signal-ppcre-syntax-error
                   "Variable length look-behind not implemented (yet): ~S"
                   parse-tree
)
)

                (make-instance 'lookbehind
                               :regex regex
                               :positivep (eq (first parse-tree)
                                              :positive-lookbehind
)

                               :len len
)
)
)

            ;; (:GREEDY-REPETITION|:NON-GREEDY-REPETITION <min> <max> <regex>)
           ((:greedy-repetition :non-greedy-repetition)
              ;; remember the value of ACCUMULATE-START-P upon entering
             (let ((local-accumulate-start-p accumulate-start-p))
                (let ((minimum (second parse-tree))
                      (maximum (third parse-tree))
)

                  (declare (type fixnum minimum))
                  (declare (type (or null fixnum) maximum))
                  (unless (and maximum
                               (= 1 minimum maximum)
)

                    ;; set ACCUMULATE-START-P to NIL for the rest of
                   ;; the conversion because we can't continue to
                   ;; accumulate inside as well as after a proper
                   ;; repetition
                   (setq accumulate-start-p nil)
)

                  (let* (reg-seen
                         (regex (convert-aux (fourth parse-tree)))
                         (min-len (regex-min-length regex))
                         (greedyp (eq (first parse-tree) :greedy-repetition))
                         (length (regex-length regex))
)

                    ;; note that this declaration already applies to
                   ;; the call to CONVERT-AUX above
                   (declare (special reg-seen))
                    (when (and local-accumulate-start-p
                               (not starts-with)
                               (zerop minimum)
                               (not maximum)
)

                      ;; if this repetition is (equivalent to) ".*"
                     ;; and if we're at the start of the regex we
                     ;; remember it for ADVANCE-FN (see the SCAN
                     ;; function)
                     (setq starts-with (everythingp regex))
)

                    (if (or (not reg-seen)
                            (not greedyp)
                            (not length)
                            (zerop length)
                            (and maximum (= minimum maximum))
)

                      ;; the repetition doesn't enclose a register, or
                     ;; it's not greedy, or we can't determine it's
                     ;; (inner) length, or the length is zero, or the
                     ;; number of repetitions is fixed; in all of
                     ;; these cases we don't bother to optimize
                     (maybe-split-repetition regex
                                              greedyp
                                              minimum
                                              maximum
                                              min-len
                                              length
                                              reg-seen
)

                      ;; otherwise we make a transformation that looks
                     ;; roughly like one of
                     ;;   <regex>* -> (?:<regex'>*<regex>)?
                     ;;   <regex>+ -> <regex'>*<regex>
                     ;; where the trick is that as much as possible
                     ;; registers from <regex> are removed in
                     ;; <regex'>
                     (let* (reg-seen   ; new instance for REMOVE-REGISTERS
                            (remove-registers-p t)
                             (inner-regex (remove-registers regex))
                             (inner-repetition
                               ;; this is the "<regex'>" part
                              (maybe-split-repetition inner-regex
                                                       ;; always greedy
                                                      t
                                                       ;; reduce minimum by 1
                                                      ;; unless it's already 0
                                                      (if (zerop minimum)
                                                         0
                                                         (1- minimum)
)

                                                       ;; reduce maximum by 1
                                                      ;; unless it's NIL
                                                      (and maximum
                                                            (1- maximum)
)

                                                       min-len
                                                       length
                                                       reg-seen
)
)

                             (inner-seq
                               ;; this is the "<regex'>*<regex>" part
                              (make-instance 'seq
                                              :elements (list inner-repetition
                                                              regex
)
)
)
)

                        ;; note that this declaration already applies
                       ;; to the call to REMOVE-REGISTERS above
                       (declare (special remove-registers-p reg-seen))
                        ;; wrap INNER-SEQ with a greedy
                       ;; {0,1}-repetition (i.e. "?") if necessary
                       (if (plusp minimum)
                          inner-seq
                          (maybe-split-repetition inner-seq
                                                  t
                                                  0
                                                  1
                                                  min-len
                                                  nil
                                                  t
)
)
)
)
)
)
)
)

       &nbs