From 31e30d75d56226ebac1c7c508bb50c030235872c Mon Sep 17 00:00:00 2001 From: Bruce Evans Date: Fri, 11 Jan 2008 17:11:32 +0000 Subject: [PATCH] Fix fpset*() to not trap if there is a currently unmasked exception. Unmasked exceptions (which can be fixed up using fpset*() before they trap) are very rare, especially on amd64 since SSE exceptions trap synchronously, but I want to merge the faster amd64 implementations of fpset*() back to i386 without introducing the bug on i386. The i386 implementation has always avoided the trap automatically by changing things using load/store of the FP environment, but this is very slow. Most changes only affect the control word, so they can usually be done much more efficiently, and amd64 has always done this, but loading the control word can trap. This version use the fast method only in the usual case where it will not trap. This only costs a couple of integer instructions (including one branch which I haven't optimized carefully yet) in the usual case, but bloats the inlines a lot. The inlines were already a bit too large to handle both the FPU and SSE. --- sys/amd64/include/ieeefp.h | 54 +++++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/sys/amd64/include/ieeefp.h b/sys/amd64/include/ieeefp.h index ce03cd3be5cb..124449279d02 100644 --- a/sys/amd64/include/ieeefp.h +++ b/sys/amd64/include/ieeefp.h @@ -125,11 +125,41 @@ typedef enum { #ifdef __GNUCLIKE_ASM #define __fldcw(addr) __asm __volatile("fldcw %0" : : "m" (*(addr))) +#define __fldenv(addr) __asm __volatile("fldenv %0" : : "m" (*(addr))) #define __fnstcw(addr) __asm __volatile("fnstcw %0" : "=m" (*(addr))) +#define __fnstenv(addr) __asm __volatile("fnstenv %0" : "=m" (*(addr))) #define __fnstsw(addr) __asm __volatile("fnstsw %0" : "=m" (*(addr))) #define __ldmxcsr(addr) __asm __volatile("ldmxcsr %0" : : "m" (*(addr))) #define __stmxcsr(addr) __asm __volatile("stmxcsr %0" : "=m" (*(addr))) +/* + * Load the control word. Be careful not to trap if there is a currently + * unmasked exception (ones that will become freshly unmasked are not a + * problem). This case must be handled by a save/restore of the + * environment or even of the full x87 state. Accessing the environment + * is very inefficient, so only do it when necessary. + */ +static __inline void +__fnldcw(unsigned short _cw, unsigned short _newcw) +{ + struct { + unsigned _cw; + unsigned _other[6]; + } _env; + unsigned short _sw; + + if ((_cw & FP_MSKS_FLD) != FP_MSKS_FLD) { + __fnstsw(&_sw); + if (((_sw & ~_cw) & FP_STKY_FLD) != 0) { + __fnstenv(&_env); + _env._cw = _newcw; + __fldenv(&_env); + return; + } + } + __fldcw(&_newcw); +} + /* * General notes about conflicting SSE vs FP status bits. * This code assumes that software will not fiddle with the control @@ -155,13 +185,13 @@ __fpsetround(fp_rnd_t _m) { fp_rnd_t _p; unsigned _mxcsr; - unsigned short _cw; + unsigned short _cw, _newcw; __fnstcw(&_cw); _p = (fp_rnd_t)((_cw & FP_RND_FLD) >> FP_RND_OFF); - _cw &= ~FP_RND_FLD; - _cw |= (_m << FP_RND_OFF) & FP_RND_FLD; - __fldcw(&_cw); + _newcw = _cw & ~FP_RND_FLD; + _newcw |= (_m << FP_RND_OFF) & FP_RND_FLD; + __fnldcw(_cw, _newcw); __stmxcsr(&_mxcsr); _mxcsr &= ~SSE_RND_FLD; _mxcsr |= (_m << SSE_RND_OFF) & SSE_RND_FLD; @@ -187,13 +217,13 @@ static __inline fp_prec_t __fpsetprec(fp_prec_t _m) { fp_prec_t _p; - unsigned short _cw; + unsigned short _cw, _newcw; __fnstcw(&_cw); _p = (fp_prec_t)((_cw & FP_PRC_FLD) >> FP_PRC_OFF); - _cw &= ~FP_PRC_FLD; - _cw |= (_m << FP_PRC_OFF) & FP_PRC_FLD; - __fldcw(&_cw); + _newcw = _cw & ~FP_PRC_FLD; + _newcw |= (_m << FP_PRC_OFF) & FP_PRC_FLD; + __fnldcw(_cw, _newcw); return (_p); } @@ -217,13 +247,13 @@ __fpsetmask(fp_except_t _m) { fp_except_t _p; unsigned _mxcsr; - unsigned short _cw; + unsigned short _cw, _newcw; __fnstcw(&_cw); _p = (~_cw & FP_MSKS_FLD) >> FP_MSKS_OFF; - _cw &= ~FP_MSKS_FLD; - _cw |= (~_m << FP_MSKS_OFF) & FP_MSKS_FLD; - __fldcw(&_cw); + _newcw = _cw & ~FP_MSKS_FLD; + _newcw |= (~_m << FP_MSKS_OFF) & FP_MSKS_FLD; + __fnldcw(_cw, _newcw); __stmxcsr(&_mxcsr); /* XXX should we clear non-ieee SSE_DAZ_FLD and SSE_FZ_FLD ? */ _mxcsr &= ~SSE_MSKS_FLD;