mirror of
https://git.FreeBSD.org/src.git
synced 2025-01-07 13:14:51 +00:00
amd64: depessimize bcmp for small buffers
Adapt assembly generated by clang for memcmp and use it for compares of <= 64 bytes (which are the vast majority).

Sample result of doing stats on Broadwell (% of samples):
before: 4.0 kernel bcmp cache_lookup
after:  0.7 kernel bcmp cache_lookup

The routine is most definitely still not optimal. Anyone interested in spending time improving it is welcome to take over.

Reviewed by: kib
This commit is contained in:
parent
55c9d75e6b
commit
20ca271fdd
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=333413
@ -98,17 +98,40 @@ END(sse2_pagezero)
|
||||
|
||||
ENTRY(bcmp)
|
||||
PUSH_FRAME_POINTER
|
||||
test %rdx,%rdx
|
||||
je 1f
|
||||
cmpq $64,%rdx
|
||||
jg 4f
|
||||
|
||||
xor %ecx,%ecx
|
||||
2:
|
||||
movzbl (%rdi,%rcx,1),%eax
|
||||
movzbl (%rsi,%rcx,1),%r8d
|
||||
cmp %r8b,%al
|
||||
jne 3f
|
||||
add $0x1,%rcx
|
||||
cmp %rcx,%rdx
|
||||
jne 2b
|
||||
1:
|
||||
xor %eax,%eax
|
||||
POP_FRAME_POINTER
|
||||
retq
|
||||
3:
|
||||
mov $1,%eax
|
||||
POP_FRAME_POINTER
|
||||
retq
|
||||
4:
|
||||
movq %rdx,%rcx
|
||||
shrq $3,%rcx
|
||||
repe
|
||||
cmpsq
|
||||
jne 1f
|
||||
jne 5f
|
||||
|
||||
movq %rdx,%rcx
|
||||
andq $7,%rcx
|
||||
repe
|
||||
cmpsb
|
||||
1:
|
||||
5:
|
||||
setne %al
|
||||
movsbl %al,%eax
|
||||
POP_FRAME_POINTER
|
||||
|
Loading…
Reference in New Issue
Block a user