From 43590b15176011e42cccc3db2cdcf7a46350b5a4 Mon Sep 17 00:00:00 2001
From: Bruce Evans <bde@FreeBSD.org>
Date: Fri, 22 Feb 2008 15:55:14 +0000
Subject: [PATCH] Optimize the 9pi/2 < |x| <= 2**19pi/2 case on amd64 and i386
 by avoiding the the double to int conversion operation which is very slow on
 these arches.  Assume that the current rounding mode is the default of
 round-to-nearest and use rounding operations in this mode instead of faking
 this mode using the round-towards-zero mode for conversion to int.  Round the
 double to an integer as a double first and as an int second since the double
 result is needed much earler.

Double rounding isn't a problem since we only need a rough approximation.
We didn't support other current rounding modes and produce much larger
errors than before if called in a non-default mode.

This saves an average about 10 cycles on amd64 (A64) and about 25 on
i386 (A64) for x in the above range.  In some cases the saving is over
25%.  Most cases with |x| < 1000pi now take about 88 cycles for cos
and sin (with certain CFLAGS, etc.), except on i386 where cos and sin
(but not cosf and sinf) are much slower at 111 and 121 cycles respectivly
due to the compiler only optimizing well for float precision.  A64
hardware cos and sin are slower at 105 cycles on i386 and 110 cycles
on amd64.
---
 lib/msun/src/e_rem_pio2.c  | 9 +++++++++
 lib/msun/src/e_rem_pio2f.c | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/lib/msun/src/e_rem_pio2.c b/lib/msun/src/e_rem_pio2.c
index c7d2064d1599..0dc88f41f522 100644
--- a/lib/msun/src/e_rem_pio2.c
+++ b/lib/msun/src/e_rem_pio2.c
@@ -21,6 +21,8 @@ __FBSDID("$FreeBSD$");
  * use __kernel_rem_pio2()
  */
 
+#include <float.h>
+
 #include "math.h"
 #include "math_private.h"
 
@@ -135,8 +137,15 @@ __ieee754_rem_pio2(double x, double *y)
 	if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
 medium:
 	    t  = fabs(x);
+#ifdef HAVE_EFFICIENT_IRINT
+	    /* Use a specialized rint() to get fn.  Assume round-to-nearest. */
+	    STRICT_ASSIGN(double,fn,t*invpio2+0x1.8p52);
+	    fn = fn-0x1.8p52;
+	    n  = irint(fn);
+#else
 	    n  = (int32_t) (t*invpio2+half);
 	    fn = (double)n;
+#endif
 	    r  = t-fn*pio2_1;
 	    w  = fn*pio2_1t;	/* 1st round good to 85 bit */
 	    if(n<32&&ix!=npio2_hw[n-1]) {	
diff --git a/lib/msun/src/e_rem_pio2f.c b/lib/msun/src/e_rem_pio2f.c
index 83a7eaa5e4ef..c487ab8cdf89 100644
--- a/lib/msun/src/e_rem_pio2f.c
+++ b/lib/msun/src/e_rem_pio2f.c
@@ -24,6 +24,8 @@ __FBSDID("$FreeBSD$");
  * use __kernel_rem_pio2() for large x
  */
 
+#include <float.h>
+
 #include "math.h"
 #include "math_private.h"
 
@@ -52,8 +54,15 @@ __ieee754_rem_pio2f(float x, float *y)
     /* 33+53 bit pi is good enough for medium size */
 	if(ix<=0x49490f80) {		/* |x| ~<= 2^19*(pi/2), medium size */
 	    t  = fabsf(x);
+#ifdef HAVE_EFFICIENT_IRINT
+	    /* Use a specialized rint() to get fn.  Assume round-to-nearest. */
+	    STRICT_ASSIGN(double,fn,t*invpio2+0x1.8p52);
+	    fn = fn-0x1.8p52;
+	    n  = irint(fn);
+#else
 	    n  = (int32_t) (t*invpio2+half);
 	    fn = (double)n;
+#endif
 	    r  = t-fn*pio2_1;
 	    w  = fn*pio2_1t;
 	    y[0] = r-w;