Paste number 77541: SSE Intrinsics for string=

Index of paste annotations: 1 | 2

Paste number 77541: SSE Intrinsics for string=
Pasted by: pkhuong
When:1 year, 5 months ago
Share:Tweet this! | http://paste.lisp.org/+1NTX
Channel:#lisp
Paste contents:
Raw Source | XML | Display As
Lots of boilerplate code. I'm hoping I can find a way to DSL/macroize 
it away.

On long strings, I see a speed-up of ~10x, and no sensible slowdown on
strings that exhibit a difference in the first characters (or even a
speed-up) [Core 2 Duo].

(defun compare-sbs-ss (base-string string)
  "Compare a simple-base-string and a simple-string for equality."
  (declare (simple-base-string base-string)
           ((simple-array character 1) string)
           (optimize speed (safety 0)))
  (let ((length (length base-string)))
    (unless (= length (length string))
      (return-from compare-sbs-ss nil))
    (with-pinned-objects (base-string string)
      (let* ((bs (vector-sap base-string))
             (ss (vector-sap string)))
        (declare (system-area-pointer bs ss))
        (labels ((cmp-4-chars (base-char-pack string-offset zero)
                   (declare (type sse-value base-char-pack zero)
                            (type fixnum string-offset))
                   ;; The magic step:
                   ;; - take the (low 32) bits of the base-char-pack
                   ;; - interleave the individual bytes with 0s
                   ;; - interleave the resulting padded words with 0s
                   ;;
                   ;; The 4 low chars remain at the bottom of their
                   ;; subpack of 32 bits which are otherwise filled
                   ;; with 0s.
                   (let ((expanded-bc (punpcklwd
                                       (punpcklbw base-char-pack zero)
                                       zero)))
                     (pxor expanded-bc
                           (truly-the sse-value
                                      (sap-ref-pack-128-align ss string-offset)))))
                 (pzerop (x)
                   (declare (type sse-value x))
                   (let ((x-low (psrldq x 8))) ; shift right by 8 bytes
                     (zerop (%sse-value-low (por x x-low)))))
                 (cmp-16-chars (base-offset string-offset)
                   (declare (type fixnum base-offset string-offset))
                   (let ((base-chars (sap-ref-pack-128-align bs base-offset))
                         (diff (%make-sse-value 0 0))
                         (zero (%make-sse-value 0 0)))
                     (declare (type sse-value diff base-chars zero)
                              (optimize (safety 0)))
                     (macrolet ((inner (i)
                                  (declare (optimize (speed 0)))
                                  `(progn
                                     ,(unless (zerop i)
                                        `(setf base-chars (psrldq base-chars 4)))
                                     (setf diff
                                           (por diff
                                                (cmp-4-chars base-chars (+ ,i string-offset) zero))))))
                       (inner 0)
                       (inner 1)
                       (inner 2)
                       (inner 3))
                     (unless (pzerop diff)
                       (return-from compare-sbs-ss nil))
                     nil)))
          (declare (inline pzerop cmp-4-chars cmp-16-chars))
          (multiple-value-bind (16-chunks remainder)
              (floor length 16)
            (unless (zerop 16-chunks)
              (locally (declare (optimize (safety 0)))
                (loop for base-char-chunk below 16-chunks
                      for char-chunk of-type fixnum from 0 by 4
                      do (cmp-16-chars base-char-chunk char-chunk))))
            (multiple-value-bind (4-chunks remainder)
                (floor remainder 4)
              (unless (zerop 4-chunks)
                (let* ((base-chunk 16-chunks)
                       (char-chunk (* 4 16-chunks))
                       (base-chars (sap-ref-pack-128-align bs base-chunk))
                       (diff (%make-sse-value 0 0))
                       (zero (%make-sse-value 0 0)))
                  (declare (fixnum base-chunk char-chunk)
                           (sse-value diff base-chars))
                  (block nil
                    (macrolet ((inner (i)
                                 (declare (optimize (speed 0)))
                                 `(progn
                                    ,(unless (zerop i)
                                       `(setf base-chars (psrldq base-chars 4)))
                                    (setf diff
                                          (por diff
                                               (cmp-4-chars base-chars
                                                            (+ ,i char-chunk) zero)))
                                    ,(unless (= i 3)
                                       `(when (= 4-chunks ,(1+ i))
                                          (return))))))
                      (inner 0)
                      (inner 1)
                      (inner 2)
                      (inner 3)))
                  (unless (pzerop diff)
                    (return-from compare-sbs-ss nil))))
              (unless (zerop remainder)
                (loop for i from (- length remainder) below length
                      unless (char= (aref base-string i) (aref string i))
                        do (return-from compare-sbs-ss nil)))
              t)))))))

Annotations for this paste:

Annotation number 1: Instruction definitions
Pasted by: pkhuong
When:1 year, 5 months ago
Share:Tweet this! | http://paste.lisp.org/+1NTX/1
Paste contents:
Raw Source | Display As
(defun emit-sse-inst-with-imm (segment dst/src imm
                               prefix opcode /i
                               &key operand-size)
  (aver (<= 0 /i 7))
  (when prefix
    (emit-byte segment prefix))
  (maybe-emit-rex-prefix segment operand-size dst/src nil nil)
  (emit-byte segment #x0F)
  (emit-byte segment opcode)
  (emit-byte segment (logior (ash (logior #b11000 /i) 3)
                             (reg-tn-encoding dst/src)))
  (emit-byte segment imm))

(define-instruction psrlw (segment dst/src imm)
  (:emitter
   (emit-sse-inst-with-imm segment dst/src imm
                           #x66 #x71 2
                           :operand-size :do-not-set)))

(macrolet ((define-mov-sse-inst (name prefix opcode-from opcode-to)
             `(define-instruction ,name (segment dst src)
                (:printer ext-rex-xmm-xmm/mem-dir ((prefix ,prefix)
                                                   (op ,)))
                (:emitter
                 (cond ((xmm-register-p dst)
                        (emit-regular-sse-inst segment dst src ,prefix ,opcode-from))
                       (t
                        (aver (xmm-register-p src))
                        (emit-regular-sse-inst segment src dst ,prefix ,opcode-to)))))))
  (define-mov-sse-inst movdqa #x66 #x6f #x7f)
  (define-mov-sse-inst movdqu #xf3 #x6f #x7f)
  )

Annotation number 2: Possibly somewhat broken, but should be a start
Pasted by: nyef
When:1 year, 5 months ago
Share:Tweet this! | http://paste.lisp.org/+1NTX/2
Paste contents:
Raw Source | Display As
(sb!disassem:define-instruction-format (ext-rex-xmm/mem-imm 40
                                        :default-printer
                                        '(:name :tab reg/mem ", " imm))
  (prefix  :field (byte 8 0))
  (rex     :field (byte 4 12)   :value #b0100)
  (wrxb    :field (byte 4 8)    :type 'wrxb)
  (x0f     :field (byte 8 16)   :value #x0f)
  (op      :fields (list (byte 8 24) (byte 3 35)))
  (reg/mem :fields (list (byte 2 38) (byte 3 32))
                                :type 'xmmreg/mem)
  (imm     :type 'imm-byte))

(define-instruction psrlw (segment dst/src imm)
  (:printer ext-rex-xmm/mem-imm ((prefix #x66) (op #x71 2)))
  (:emitter
   (emit-sse-inst-with-imm segment dst/src imm
                           #x66 #x71 2
                           :operand-size :do-not-set)))

Colorize as:
Show Line Numbers
Index of paste annotations: 1 | 2

Lisppaste pastes can be made by anyone at any time. Imagine a fearsomely comprehensive disclaimer of liability. Now fear, comprehensively.