1
0
mirror of https://git.FreeBSD.org/src.git synced 2025-01-07 13:14:51 +00:00

amd64: depessimize bcmp for small buffers

Adapt the assembly clang generates for memcmp and use it for compares of
at most 64 bytes (which are the vast majority).

Sample result of doing stats on Broadwell (% of samples):
before: 4.0 kernel     bcmp                 cache_lookup
after : 0.7 kernel     bcmp                 cache_lookup

The routine is most definitely still not optimal. Anyone interested in
spending time improving it is welcome to take over.

Reviewed by:	kib
This commit is contained in:
Mateusz Guzik 2018-05-09 15:16:25 +00:00
parent 55c9d75e6b
commit 20ca271fdd
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=333413

View File

@ -98,17 +98,40 @@ END(sse2_pagezero)
ENTRY(bcmp)
PUSH_FRAME_POINTER
test %rdx,%rdx
je 1f
cmpq $64,%rdx
jg 4f
xor %ecx,%ecx
2:
movzbl (%rdi,%rcx,1),%eax
movzbl (%rsi,%rcx,1),%r8d
cmp %r8b,%al
jne 3f
add $0x1,%rcx
cmp %rcx,%rdx
jne 2b
1:
xor %eax,%eax
POP_FRAME_POINTER
retq
3:
mov $1,%eax
POP_FRAME_POINTER
retq
4:
movq %rdx,%rcx
shrq $3,%rcx
repe
cmpsq
jne 1f
jne 5f
movq %rdx,%rcx
andq $7,%rcx
repe
cmpsb
1:
5:
setne %al
movsbl %al,%eax
POP_FRAME_POINTER