From 6210ac95a19416832601b571409a3e08b76d107f Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Thu, 16 Jun 2022 13:01:12 -0400
Subject: [PATCH] amd64: Stop using REP MOVSB for backward memmove()s.

The Enhanced REP MOVSB feature of CPUs starting from Ivy Bridge makes
REP MOVSB the fastest way to copy memory in most cases.  However, the
Intel Optimization Reference Manual says: "setting the DF to force REP
MOVSB to copy bytes from high towards low addresses will experience
significant performance degradation".  Measurements on Intel Cascade
Lake and Alder Lake, as well as on AMD Zen3, show that it can drop
throughput to as low as 2.5-3.5GB/s, compared to ~10-30GB/s for the
REP MOVSQ or hand-rolled loop used for non-ERMS CPUs.

This patch keeps ERMS use for forward ordered memory copies, but
removes it for backward overlapped moves where it does not work.

Reviewed by:	mjg
MFC after:	2 weeks
---
 sys/amd64/amd64/support.S | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/sys/amd64/amd64/support.S b/sys/amd64/amd64/support.S
index e21374233c84..8e284c6f6a9e 100644
--- a/sys/amd64/amd64/support.S
+++ b/sys/amd64/amd64/support.S
@@ -507,13 +507,6 @@ END(memcmp)
 	ALIGN_TEXT
 2256:
 	std
-.if \erms == 1
-	leaq	-1(%rdi,%rcx),%rdi
-	leaq	-1(%rsi,%rcx),%rsi
-	rep
-	movsb
-	cld
-.else
 	leaq	-8(%rdi,%rcx),%rdi
 	leaq	-8(%rsi,%rcx),%rsi
 	shrq	$3,%rcx
@@ -523,7 +516,6 @@ END(memcmp)
 	movq	%rdx,%rcx
 	andb	$7,%cl
 	jne	2004b
-.endif
 	\end
 	ret
 .endif
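
For context, the throughput drop described in the commit message comes from
overlapping copies where the destination is above the source, which forces
memmove() to copy from high toward low addresses.  The sketch below is not
part of the patch: it is a minimal userland micro-benchmark that exercises
the C library's memmove() rather than the kernel routine, and the buffer
size, iteration count, and output format are arbitrary choices.  It only
demonstrates the overlapping dst > src pattern that takes the backward path.

/*
 * Hypothetical userland micro-benchmark (not part of this commit).
 * It times an overlapping memmove() with dst = src + 1, which must be
 * performed as a backward (high-to-low) copy to preserve the data.
 * Note: this measures libc's memmove(), not the kernel's, and only
 * illustrates the access pattern the patch is concerned with.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE   (64UL * 1024 * 1024)  /* 64 MB working set (arbitrary) */
#define ITERATIONS 64

int
main(void)
{
	char *buf;
	struct timespec ts, te;
	double secs, gbps;
	int i;

	/* One extra byte so dst = buf + 1 stays in bounds. */
	buf = malloc(BUF_SIZE + 1);
	if (buf == NULL)
		return (1);
	memset(buf, 0xa5, BUF_SIZE + 1);

	clock_gettime(CLOCK_MONOTONIC, &ts);
	for (i = 0; i < ITERATIONS; i++) {
		/* dst overlaps src and is one byte higher: backward copy. */
		memmove(buf + 1, buf, BUF_SIZE);
	}
	clock_gettime(CLOCK_MONOTONIC, &te);

	secs = (te.tv_sec - ts.tv_sec) + (te.tv_nsec - ts.tv_nsec) / 1e9;
	gbps = (double)BUF_SIZE * ITERATIONS / secs / 1e9;
	printf("backward overlapping memmove: %.2f GB/s\n", gbps);

	free(buf);
	return (0);
}

Built with something like "cc -O2", comparing the reported figure against a
non-overlapping copy of the same size gives a rough sense of how much the
backward path costs on a given CPU; the exact numbers will differ from the
in-kernel measurements quoted above.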