| Paste number 77541: | SSE Intrinsics for string= |
| Pasted by: | pkhuong |
| When: | 1 year, 5 months ago |
| Share: | Tweet this! | http://paste.lisp.org/+1NTX |
| Channel: | #lisp |
| Paste contents: |
Lots of boilerplate code. I'm hoping I can find a way to DSL/macroize
it away.
On long strings, I see a speed-up of ~10x, and no sensible slowdown on
strings that exhibit a difference in the first characters (or even a
speed-up) [Core 2 Duo].
(defun compare-sbs-ss (base-string string)
"Compare a simple-base-string and a simple-string for equality."
(declare (simple-base-string base-string)
((simple-array character 1) string)
(optimize speed (safety 0)))
(let ((length (length base-string)))
(unless (= length (length string))
(return-from compare-sbs-ss nil))
(with-pinned-objects (base-string string)
(let* ((bs (vector-sap base-string))
(ss (vector-sap string)))
(declare (system-area-pointer bs ss))
(labels ((cmp-4-chars (base-char-pack string-offset zero)
(declare (type sse-value base-char-pack zero)
(type fixnum string-offset))
;; The magic step:
;; - take the (low 32) bits of the base-char-pack
;; - interleave the individual bytes with 0s
;; - interleave the resulting padded words with 0s
;;
;; The 4 low chars remain at the bottom of their
;; subpack of 32 bits which are otherwise filled
;; with 0s.
(let ((expanded-bc (punpcklwd
(punpcklbw base-char-pack zero)
zero)))
(pxor expanded-bc
(truly-the sse-value
(sap-ref-pack-128-align ss string-offset)))))
(pzerop (x)
(declare (type sse-value x))
(let ((x-low (psrldq x 8))) ; shift right by 8 bytes
(zerop (%sse-value-low (por x x-low)))))
(cmp-16-chars (base-offset string-offset)
(declare (type fixnum base-offset string-offset))
(let ((base-chars (sap-ref-pack-128-align bs base-offset))
(diff (%make-sse-value 0 0))
(zero (%make-sse-value 0 0)))
(declare (type sse-value diff base-chars zero)
(optimize (safety 0)))
(macrolet ((inner (i)
(declare (optimize (speed 0)))
`(progn
,(unless (zerop i)
`(setf base-chars (psrldq base-chars 4)))
(setf diff
(por diff
(cmp-4-chars base-chars (+ ,i string-offset) zero))))))
(inner 0)
(inner 1)
(inner 2)
(inner 3))
(unless (pzerop diff)
(return-from compare-sbs-ss nil))
nil)))
(declare (inline pzerop cmp-4-chars cmp-16-chars))
(multiple-value-bind (16-chunks remainder)
(floor length 16)
(unless (zerop 16-chunks)
(locally (declare (optimize (safety 0)))
(loop for base-char-chunk below 16-chunks
for char-chunk of-type fixnum from 0 by 4
do (cmp-16-chars base-char-chunk char-chunk))))
(multiple-value-bind (4-chunks remainder)
(floor remainder 4)
(unless (zerop 4-chunks)
(let* ((base-chunk 16-chunks)
(char-chunk (* 4 16-chunks))
(base-chars (sap-ref-pack-128-align bs base-chunk))
(diff (%make-sse-value 0 0))
(zero (%make-sse-value 0 0)))
(declare (fixnum base-chunk char-chunk)
(sse-value diff base-chars))
(block nil
(macrolet ((inner (i)
(declare (optimize (speed 0)))
`(progn
,(unless (zerop i)
`(setf base-chars (psrldq base-chars 4)))
(setf diff
(por diff
(cmp-4-chars base-chars
(+ ,i char-chunk) zero)))
,(unless (= i 3)
`(when (= 4-chunks ,(1+ i))
(return))))))
(inner 0)
(inner 1)
(inner 2)
(inner 3)))
(unless (pzerop diff)
(return-from compare-sbs-ss nil))))
(unless (zerop remainder)
(loop for i from (- length remainder) below length
unless (char= (aref base-string i) (aref string i))
do (return-from compare-sbs-ss nil)))
t)))))))
Annotations for this paste:
| Annotation number 1: | Instruction definitions |
| Pasted by: | pkhuong |
| When: | 1 year, 5 months ago |
| Share: | Tweet this! | http://paste.lisp.org/+1NTX/1 |
| Paste contents: |
(defun emit-sse-inst-with-imm (segment dst/src imm
prefix opcode /i
&key operand-size)
(aver (<= 0 /i 7))
(when prefix
(emit-byte segment prefix))
(maybe-emit-rex-prefix segment operand-size dst/src nil nil)
(emit-byte segment #x0F)
(emit-byte segment opcode)
(emit-byte segment (logior (ash (logior #b11000 /i) 3)
(reg-tn-encoding dst/src)))
(emit-byte segment imm))
(define-instruction psrlw (segment dst/src imm)
(:emitter
(emit-sse-inst-with-imm segment dst/src imm
#x66 #x71 2
:operand-size :do-not-set)))
(macrolet ((define-mov-sse-inst (name prefix opcode-from opcode-to)
`(define-instruction ,name (segment dst src)
(:printer ext-rex-xmm-xmm/mem-dir ((prefix ,prefix)
(op ,)))
(:emitter
(cond ((xmm-register-p dst)
(emit-regular-sse-inst segment dst src ,prefix ,opcode-from))
(t
(aver (xmm-register-p src))
(emit-regular-sse-inst segment src dst ,prefix ,opcode-to)))))))
(define-mov-sse-inst movdqa #x66 #x6f #x7f)
(define-mov-sse-inst movdqu #xf3 #x6f #x7f)
)
| Annotation number 2: | Possibly somewhat broken, but should be a start |
| Pasted by: | nyef |
| When: | 1 year, 5 months ago |
| Share: | Tweet this! | http://paste.lisp.org/+1NTX/2 |
| Paste contents: |
(sb!disassem:define-instruction-format (ext-rex-xmm/mem-imm 40
:default-printer
'(:name :tab reg/mem ", " imm))
(prefix :field (byte 8 0))
(rex :field (byte 4 12) :value #b0100)
(wrxb :field (byte 4 8) :type 'wrxb)
(x0f :field (byte 8 16) :value #x0f)
(op :fields (list (byte 8 24) (byte 3 35)))
(reg/mem :fields (list (byte 2 38) (byte 3 32))
:type 'xmmreg/mem)
(imm :type 'imm-byte))
(define-instruction psrlw (segment dst/src imm)
(:printer ext-rex-xmm/mem-imm ((prefix #x66) (op #x71 2)))
(:emitter
(emit-sse-inst-with-imm segment dst/src imm
#x66 #x71 2
:operand-size :do-not-set)))