[libm] implementation of rsqrt, rsqrtf, and rsqrtl
From the PR:
The attached diff implements the inverse square root function, i.e,
rsqrt(x) = 1 / sqrt(x). Exhaustive testing of the float version
suggests that it is correctly rounded in round-to-nearest for all
test values in the range [0x1p-127,0x1p126].
Exhaustive testing of rsqrt and rsqrtl cannot be done, but 1100M
values of x for rsqrt and 400M values for rsqrtl were tested. All
tested values were correctly rounded.
I do not have access to LD128 (i.e., IEEE 128-bit floating point)
hardware, so the implementation of rsqrtl() is untested.
The following is a summary of changes to source code.
* lib/msun/Makefile:
. Add s_rsqrt.c and s_rsqrtf.c to COMMON_SRCS.
. For non-53-bit long double targets, add s_rsqrtl.c to COMMON_SRCS.
. Add MLINKS for rsqrt.3, rsqrtf.3, and rsqrtl.3 to sqrt.3.
* lib/msun/Symbol.map:
. Add rsqrt, rsqrtf, and rsqrtl to the Symbol map for shared libm.so.
* lib/msun/man/sqrt.3:
. Update the sqrt.3 manual page to include information for rsqrt[fl].
. Note, these function come from ISO C23 (and IEEE-754 2008).
* lib/msun/src/math.h:
. Add prototypes for new functions.
* lib/msun/src/math_private.h:
. Add _SPLIT, _FAST2SUM, _SLOW2SUM, _XADD, _MUL, and _XMUL
macros to perform type-type arthimetic (i.e., float-float).
* src/s_rsqrt.c:
. New file with the implementation of 'double rsqrt(double)'.
. For 53-bit long double targets, add a weak reference for rsqrtl.
* src/s_rsqrtf.c:
. New file with the implementation of 'float rsqrt(float)'.
* src/s_rsqrtl.c
. New file with the implementation of 'long double rsqrt(long double)'.
Note, the LD80 version uses bit twiddling and LD128 version is a
straight C language implementation. The LD128 is untested due to
lack of hardware.
PR: 295089
MFC after: 1 week
This commit is contained in:
committed by
Konstantin Belousov
parent
8b03193289
commit
3085fc9d97
+3
-3
@@ -88,7 +88,7 @@ COMMON_SRCS= b_tgamma.c \
|
||||
s_lround.c s_lroundf.c s_lroundl.c s_modff.c \
|
||||
s_nan.c s_nearbyint.c s_nextafter.c s_nextafterf.c \
|
||||
s_nexttowardf.c s_remquo.c s_remquof.c \
|
||||
s_rint.c s_rintf.c s_round.c s_roundf.c \
|
||||
s_rint.c s_rintf.c s_round.c s_roundf.c s_rsqrt.c s_rsqrtf.c \
|
||||
s_scalbln.c s_scalbn.c s_scalbnf.c s_signbit.c \
|
||||
s_signgam.c s_significand.c s_significandf.c s_sin.c \
|
||||
s_sincos.c s_sincosf.c s_sinf.c \
|
||||
@@ -143,7 +143,7 @@ COMMON_SRCS+= b_tgammal.c catrigl.c \
|
||||
s_fminimum_numl.c s_fminimum_mag_numl.c \
|
||||
s_frexpl.c s_logbl.c s_logl.c s_nanl.c \
|
||||
s_nextafterl.c s_nexttoward.c s_remquol.c s_rintl.c s_roundl.c \
|
||||
s_scalbnl.c s_sinl.c s_sincosl.c s_sinpil.c \
|
||||
s_scalbnl.c s_sinl.c s_sincosl.c s_sinpil.c s_rsqrtl.c \
|
||||
s_tanhl.c s_tanl.c s_tanpil.c s_truncl.c w_cabsl.c
|
||||
# Work around this warning from gcc:
|
||||
# lib/msun/ld80/e_powl.c:275:1: error: floating constant exceeds range of
|
||||
@@ -289,7 +289,7 @@ MLINKS+=sincos.3 sincosf.3 sin.3 sincosl.3
|
||||
MLINKS+=sinh.3 sinhf.3 sinh.3 sinhl.3
|
||||
MLINKS+=sinpi.3 sinpif.3 sinpi.3 sinpil.3
|
||||
MLINKS+=sqrt.3 cbrt.3 sqrt.3 cbrtf.3 sqrt.3 cbrtl.3 sqrt.3 sqrtf.3 \
|
||||
sqrt.3 sqrtl.3
|
||||
sqrt.3 sqrtl.3 sqrt.3 rsqrt.3 sqrt.3 rsqrtf.3 sqrt.3 rsqrtl.3
|
||||
MLINKS+=tan.3 tanf.3 tan.3 tanl.3
|
||||
MLINKS+=tanh.3 tanhf.3 tanh.3 tanhl.3
|
||||
MLINKS+=tanpi.3 tanpif.3 tanpi.3 tanpil.3
|
||||
|
||||
@@ -344,4 +344,7 @@ FBSD_1.9 {
|
||||
fminimum_mag_num;
|
||||
fminimum_mag_numf;
|
||||
fminimum_mag_numl;
|
||||
rsqrt;
|
||||
rsqrtf;
|
||||
rsqrtl;
|
||||
};
|
||||
|
||||
+51
-2
@@ -25,17 +25,20 @@
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.Dd February 15, 2020
|
||||
.Dd May 7, 2026
|
||||
.Dt SQRT 3
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm cbrt ,
|
||||
.Nm cbrtf ,
|
||||
.Nm cbrtl ,
|
||||
.Nm rsqrt ,
|
||||
.Nm rsqrtf ,
|
||||
.Nm rsqrtl,
|
||||
.Nm sqrt ,
|
||||
.Nm sqrtf ,
|
||||
.Nm sqrtl
|
||||
.Nd cube root and square root functions
|
||||
.Nd cube root, square root, and inverse square root functions
|
||||
.Sh LIBRARY
|
||||
.Lb libm
|
||||
.Sh SYNOPSIS
|
||||
@@ -47,6 +50,12 @@
|
||||
.Ft long double
|
||||
.Fn cbrtl "long double x"
|
||||
.Ft double
|
||||
.Fn rsqrt "double x"
|
||||
.Ft float
|
||||
.Fn rsqrtf "float x"
|
||||
.Ft long double
|
||||
.Fn rsqrtl "long double x"
|
||||
.Ft double
|
||||
.Fn sqrt "double x"
|
||||
.Ft float
|
||||
.Fn sqrtf "float x"
|
||||
@@ -63,6 +72,15 @@ the cube root of
|
||||
.Fa x .
|
||||
.Pp
|
||||
The
|
||||
.Fn rsqrt ,
|
||||
.Fn rsqrtf ,
|
||||
and
|
||||
.Fn rsqrtl
|
||||
functions compute
|
||||
the inverse square root of
|
||||
.Fa x .
|
||||
.Pp
|
||||
The
|
||||
.Fn sqrt ,
|
||||
.Fn sqrtf ,
|
||||
and
|
||||
@@ -77,6 +95,23 @@ The
|
||||
and
|
||||
.Fn cbrtl
|
||||
functions return the requested cube root.
|
||||
.Pp
|
||||
The
|
||||
.Fn rsqrt ,
|
||||
.Fn rsqrtf ,
|
||||
and
|
||||
.Fn rsqrtl
|
||||
functions return 1 divided by the square root of
|
||||
.Fa x
|
||||
unless an error occurs.
|
||||
An attempt to take the
|
||||
.Fn rsqrt
|
||||
of negative
|
||||
.Fa x
|
||||
raises an invalid exception and causes an \*(Na to be returned.
|
||||
The inverse square root of \*(Pm0 returns \*(Pm\(if and
|
||||
raises a divide-by-zero exception.
|
||||
.Pp
|
||||
The
|
||||
.Fn sqrt ,
|
||||
.Fn sqrtf ,
|
||||
@@ -104,6 +139,13 @@ and
|
||||
.Fn sqrtl
|
||||
functions conform to
|
||||
.St -isoC-99 .
|
||||
The
|
||||
.Fn rsqrt ,
|
||||
.Fn rsqrtf ,
|
||||
and
|
||||
.Fn rsqrtl
|
||||
functions conform to ISO/IEC 9899:2024 ("ISO C23").
|
||||
.\" .St -isoC-24 .
|
||||
.Sh HISTORY
|
||||
The
|
||||
.Fn cbrt
|
||||
@@ -120,3 +162,10 @@ The
|
||||
.Fn cbrtl
|
||||
function appeared in
|
||||
.Fx 9.0 .
|
||||
The
|
||||
.Fn rsqrt ,
|
||||
.Fn rsqrtf ,
|
||||
and
|
||||
.Fn rsqrtl
|
||||
functions appeared in
|
||||
.Fx 16.0 .
|
||||
|
||||
@@ -544,6 +544,9 @@ long double fminimum_mag_numl(long double, long double);
|
||||
double fminimum_num(double, double);
|
||||
float fminimum_numf(float, float);
|
||||
long double fminimum_numl(long double, long double);
|
||||
double rsqrt(double);
|
||||
float rsqrtf(float);
|
||||
long double rsqrtl(long double);
|
||||
#endif /* __ISO_C_VISIBLE >= 2023 */
|
||||
|
||||
__END_DECLS
|
||||
|
||||
@@ -471,6 +471,89 @@ do { \
|
||||
(a) = __tmp; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* Split x into high and low bits where CC is 0x1p(N/2) + 1 where
|
||||
* N is rounded up for types with odd precisions.
|
||||
*
|
||||
* #define _CC (0x1p12F + 1) // float
|
||||
* #define _CC (0x1p27 + 1) // double
|
||||
* #define _CC (0x1p32L + 1) // long double (LD80)
|
||||
* #define _CC (0x1p57L + 1) // long double (LD128)
|
||||
*/
|
||||
#define _SPLIT(x, xh, xl) \
|
||||
do { \
|
||||
typeof(x) __t1; \
|
||||
__t1 = (x) * _CC; \
|
||||
xh = __t1 + ((x) - __t1); \
|
||||
xl = (x) - xh; \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
* FAST2SUM requires |x| >= |y|. x and y are full precision.
|
||||
* Note, _2SUMF(x,y) above destroys x and y.
|
||||
*/
|
||||
#define _FAST2SUM(x, y, hi, lo) \
|
||||
do { \
|
||||
hi = (x) + (y); \
|
||||
lo = (y) - (hi - (x)); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
* SLOW2SUM does not require |x| >= |y|. Here, x and y are full precision.
|
||||
* The t1 temporary variable is volatile to prevent compiler optimizations.
|
||||
* Note, _2SUM(x,y) above destroys x and y.
|
||||
*/
|
||||
#define _SLOW2SUM(x, y, hi, lo) \
|
||||
do { \
|
||||
volatile typeof(x) __t1; \
|
||||
typeof(x) __t2; \
|
||||
hi = (x) + (y); \
|
||||
__t1 = hi - (y); \
|
||||
__t2 = hi - __t1; \
|
||||
lo = ((x) - __t1) + ((y) - __t2); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
* x and y are full precision quantities that have been split into high
|
||||
* and low parts via the _SPLIT macro. x and y are added to give z as
|
||||
* high and low parts.
|
||||
*/
|
||||
#define _XADD(xh, xl, yh, yl, zh, zl) \
|
||||
do { \
|
||||
typeof(xh) __s1, __s2, __s3, __s4, __s5, __s6; \
|
||||
_SLOW2SUM(xh, yh, __s1, __s2); \
|
||||
_SLOW2SUM(xl, yl, __s3, __s4); \
|
||||
_FAST2SUM(__s1, __s2 + __s3, __s5, __s6); \
|
||||
_FAST2SUM(__s5, __s6 + __s4, zh, zl); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
* x and y are full precision quantities. r1 and r2 are full precision
|
||||
* high and low parts of the multiplication x * y.
|
||||
*/
|
||||
#define _MUL(x, y, r1 ,r2) \
|
||||
do { \
|
||||
typeof(x) __xh, __xl, __yh, __yl; \
|
||||
typeof(x) __t1; \
|
||||
_SPLIT(x, __xh, __xl); \
|
||||
_SPLIT(y, __yh, __yl); \
|
||||
r1 = (x) * (y); \
|
||||
__t1 = __xh * __yl + (__xh * __yh - r1); \
|
||||
r2 = __xl * __yl + (__xl * __yh + __t1); \
|
||||
} while(0)
|
||||
|
||||
/*
|
||||
* x and y are full precision quantities that have been split into high
|
||||
* and low parts via the _SPLIT macro. x and y are multiplied to give z
|
||||
* as high and low parts.
|
||||
*/
|
||||
#define _XMUL(xh, xl, yh, yl, ph, pl) \
|
||||
do { \
|
||||
_MUL(xh, yh, ph, pl); \
|
||||
pl += xl * yl + xl * yh + xh * yl; \
|
||||
} while(0)
|
||||
|
||||
|
||||
/*
|
||||
* Common routine to process the arguments to nan(), nanf(), and nanl().
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,153 @@
|
||||
/*-
|
||||
* Copyright (c) 2026 Steven G. Kargl
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice unmodified, this list of conditions, and the following
|
||||
* disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Compute the inverse sqrt of x, i.e., rsqrt(x) = 1 / sqrt(x).
|
||||
*
|
||||
* First, filter out special cases:
|
||||
*
|
||||
* 1. rsqrt(+-0) = +-inf, and raise FE_DIVBYZERO exception.
|
||||
* 2. rsqrt(nan) = NaN.
|
||||
* 3. rsqrt(+inf) returns +0.
|
||||
* 2. rsqrt(x<0) = NaN, and raises FE_INVALID.
|
||||
*
|
||||
* If x is a subnormal, scale x into the normal range by x*0x1pN; while
|
||||
* recording the exponent of the scale factor N. Split the possibly
|
||||
* scaled x into f*2^n with f in [0.5,1). Set m=n or m=n-N (subnormal).
|
||||
* If n is odd, then set f = f/2 and increase n to n+1. Thus, f is
|
||||
* in [0.25,1) with n even.
|
||||
*
|
||||
* An initial estimate of y = rqrt[f](x) is 1 / sqrt[f](x). Exhaustive
|
||||
* testing of rsqrtf() gave a max ULP of 1.49; while testing 500M x in
|
||||
* [0,1000] gave a max ULP of 1.24 for rsqrt(). The value of y is then
|
||||
* used with one iteration of Goldschmidt's algorithm:
|
||||
*
|
||||
* z = x * y
|
||||
* h = y / 2
|
||||
* r = 0.5 - h * z
|
||||
* y = h * r + h
|
||||
*
|
||||
* A factor of 2 appears missing in the above, but it is included in the
|
||||
* exponent m.
|
||||
*/
|
||||
#include <fenv.h>
|
||||
#include <float.h>
|
||||
#include "math.h"
|
||||
#include "math_private.h"
|
||||
|
||||
#pragma STDC FENV_ACCESS ON
|
||||
|
||||
#ifdef _CC
|
||||
#undef _CC
|
||||
#endif
|
||||
#define _CC (0x1p27 + 1)
|
||||
|
||||
double
|
||||
rsqrt(double x)
|
||||
{
|
||||
volatile static const double vzero = 0;
|
||||
static const double half = 0.5;
|
||||
int hx, m, rnd;
|
||||
uint32_t lx, ux;
|
||||
double h, ph, pl, rh, rl, y, zh, zl;
|
||||
|
||||
EXTRACT_WORDS(hx, lx, x);
|
||||
ux = (uint32_t)hx & 0x7fffffff;
|
||||
|
||||
/* x = +-0. Raise exception. */
|
||||
if ((ux | lx) == 0)
|
||||
return (1 / x);
|
||||
|
||||
/* x is NaN. */
|
||||
if (ux > 0x7ff00000)
|
||||
return (x + x);
|
||||
|
||||
/* x is +-inf. */
|
||||
if (ux == 0x7ff00000)
|
||||
return (hx & 0x80000000 ? vzero / vzero : 0.);
|
||||
|
||||
/* x < 0. Raise exception. */
|
||||
if (hx < 0)
|
||||
return (vzero / vzero);
|
||||
|
||||
/*
|
||||
* If x is subnormal, then scale it into the normal range.
|
||||
* Split x into significand and exponent, x = f * 2^m, with
|
||||
* f in [0.5,1) and m a biased exponent.
|
||||
*/
|
||||
m = 0;
|
||||
if (hx < 0x00100000) { /* Subnormal */
|
||||
x *= 0x1p54;
|
||||
GET_HIGH_WORD(hx, x);
|
||||
m = -54;
|
||||
}
|
||||
m += (hx >> 20) - 1022;
|
||||
hx = (hx & 0x000fffff) | 0x3fe00000;
|
||||
SET_HIGH_WORD(x, hx);
|
||||
|
||||
/* m is odd. Put x into [0.25,5) and increase m. */
|
||||
if (m & 1) {
|
||||
x /= 2;
|
||||
m += 1;
|
||||
}
|
||||
m = -(m >> 1); /* Prepare for 2^(-m/2). */
|
||||
|
||||
y = 1 / sqrt(x); /* ~52-bit estimate. */
|
||||
|
||||
h = y / 2;
|
||||
|
||||
/*
|
||||
* For values of x with a representation of 0x1.ffffffffffffepN
|
||||
* with N an odd integer, the computed rsqrt() is not correctly
|
||||
* rounded in round-to-nearest without toggling the rounding mode
|
||||
* to FE_TOWARDZERO. Note, FE_DOWNWARD also works. However,
|
||||
* messing with the rounding mode is expensive, so only do it
|
||||
* when necessary. Example, x = 3.9999999999999991
|
||||
* gives y --> hx = 0x3ff00000, lx = 0x00000001
|
||||
*/
|
||||
EXTRACT_WORDS(hx, lx, y);
|
||||
if ((hx & 0x000fffff) == 0 && lx == 1) {
|
||||
rnd = fegetround();
|
||||
fesetround(FE_TOWARDZERO);
|
||||
_MUL(x, y, zh, zl);
|
||||
_XMUL(zh, zl, h, 0, ph, pl);
|
||||
fesetround(rnd);
|
||||
} else {
|
||||
_MUL(x, y, zh, zl);
|
||||
_XMUL(zh, zl, h, 0, ph, pl);
|
||||
}
|
||||
|
||||
_XADD(-ph, -pl, half, 0, rh, rl);
|
||||
y = rh * h + h;
|
||||
|
||||
ux = (m + 1024) << 20;
|
||||
INSERT_WORDS(x, ux, 0);
|
||||
return (y *= x);
|
||||
}
|
||||
|
||||
#if LDBL_MANT_DIG == 53
|
||||
__weak_reference(rsqrt, rsqrtl);
|
||||
#endif
|
||||
@@ -0,0 +1,155 @@
|
||||
/*-
|
||||
* Copyright (c) 2026 Steven G. Kargl
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice unmodified, this list of conditions, and the following
|
||||
* disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Compute the inverse sqrt of x, i.e., rsqrt(x) = 1 / sqrt(x).
|
||||
*
|
||||
* First, filter out special cases:
|
||||
*
|
||||
* 1. rsqrt(+-0) = +-inf, and raise FE_DIVBYZERO exception.
|
||||
* 2. rsqrt(nan) = NaN.
|
||||
* 3. rsqrt(+inf) returns +0.
|
||||
* 2. rsqrt(x<0) = NaN, and raises FE_INVALID.
|
||||
*
|
||||
* If x is a subnormal, scale x into the normal range by x*0x1pN; while
|
||||
* recording the exponent of the scale factor N. Split the possibly
|
||||
* scaled x into f*2^n with f in [0.5,1). Set m=n or m=n-N (subnormal).
|
||||
* If n is odd, then set f = f/2 and increase n to n+1. Thus, f is
|
||||
* in [0.25,1) with n even.
|
||||
*
|
||||
* An initial estimate of y = rqrt[f](x) is 1 / sqrt[f](x). Exhaustive
|
||||
* testing of rsqrtf() gave a max ULP of 1.49; while testing 500M x in
|
||||
* [0,1000] gave a max ULP of 1.24 for rsqrt(). The value of y is then
|
||||
* used with one iteration of Goldschmidt's algorithm:
|
||||
*
|
||||
* z = x * y
|
||||
* h = y / 2
|
||||
* r = 0.5 - h * z
|
||||
* y = h * r + h
|
||||
*
|
||||
* A factor of 2 appears missing in the above, but it is included in the
|
||||
* exponent m.
|
||||
*/
|
||||
#include <fenv.h>
|
||||
#include <float.h>
|
||||
#include "math.h"
|
||||
#include "math_private.h"
|
||||
|
||||
#pragma STDC FENV_ACCESS ON
|
||||
|
||||
#ifdef _CC
|
||||
#undef _CC
|
||||
#endif
|
||||
#define _CC (0x1p12F + 1)
|
||||
|
||||
float
|
||||
rsqrtf(float x)
|
||||
{
|
||||
volatile static const float vzero = 0;
|
||||
static const float half = 0.5;
|
||||
uint32_t ix, ux;
|
||||
int m, rnd;
|
||||
float h, ph, pl, rh, rl, y, zh, zl;
|
||||
|
||||
GET_FLOAT_WORD(ix, x);
|
||||
ux = ix & 0x7fffffff;
|
||||
|
||||
/* x = +-0. Raise exception. */
|
||||
if (ux == 0)
|
||||
return (1 / x);
|
||||
|
||||
/* x is NaN. */
|
||||
if (ux > 0x7f800000)
|
||||
return (x + x);
|
||||
|
||||
/* x is +-inf. */
|
||||
if (ux == 0x7f800000)
|
||||
return (ix & 0x80000000 ? vzero / vzero : 0.F);
|
||||
|
||||
/* x < 0. Raise exception. */
|
||||
if (ix & 0x80000000)
|
||||
return (vzero / vzero);
|
||||
|
||||
/*
|
||||
* If x is subnormal, then scale it into the normal range.
|
||||
* Split x into significand and exponent, x = f * 2^m, with
|
||||
* f in [0.5,1) and m a biased exponent.
|
||||
*/
|
||||
m = 0;
|
||||
if (ux < 0x00800000) { /* Subnormal */
|
||||
x *= 0x1p25f;
|
||||
GET_FLOAT_WORD(ix, x);
|
||||
m = -25;
|
||||
}
|
||||
m += (ix >> 23) - 126; /* Unbiased exponent */
|
||||
ix = (ix & 0x007fffff) | 0x3f000000;
|
||||
SET_FLOAT_WORD(x, ix); /* x is in [0.5,1). */
|
||||
|
||||
/* m is odd. Put x into [0.25,5) and increase m. */
|
||||
if (m & 1) {
|
||||
x /= 2;
|
||||
m += 1;
|
||||
}
|
||||
m = -(m >> 1); /* Prepare for 2^(-m/2). */
|
||||
|
||||
/*
|
||||
* Exhaustive testing of rsqrtf(x) = 1 / sqrtf(x) with x in
|
||||
* [0x1p-127,0x1p126] shows the this approximation gives a
|
||||
* 22- to 23-bit estimate of rsqrt(f). This is equivalent to
|
||||
* a max ulp of ~1.49.
|
||||
*/
|
||||
y = 1 / sqrtf(x);
|
||||
|
||||
h = y / 2;
|
||||
|
||||
/*
|
||||
* For values of x with a representation of 0x1.fffffcpN with
|
||||
* N an odd integer, the computed rsqrtf() is not correctly
|
||||
* rounded in round-to-nearest without toggling the rounding
|
||||
* mode to FE_TOWARDZERO. Note, FE_DOWNWARD also works.
|
||||
* However, messing with the rounding mode is expensive, so
|
||||
* only do it when necessary. Example, x = 0x1.fffffcp3 gives
|
||||
* y --> 0x3f800001.
|
||||
*/
|
||||
GET_FLOAT_WORD(ix, y);
|
||||
if ((ix & 0x000fffff) == 1) {
|
||||
rnd = fegetround();
|
||||
fesetround(FE_TOWARDZERO);
|
||||
_MUL(x, y, zh, zl);
|
||||
_XMUL(zh, zl, h, 0, ph, pl);
|
||||
fesetround(rnd);
|
||||
} else {
|
||||
_MUL(x, y, zh, zl);
|
||||
_XMUL(zh, zl, h, 0, ph, pl);
|
||||
}
|
||||
|
||||
_XADD(-ph, -pl, half, 0, rh, rl);
|
||||
y = h * rh + h;
|
||||
|
||||
ix = (uint32_t)(m + 128) << 23;
|
||||
SET_FLOAT_WORD(x, ix);
|
||||
return (y *= x);
|
||||
}
|
||||
@@ -0,0 +1,203 @@
|
||||
/*-
|
||||
* Copyright (c) 2026 Steven G. Kargl
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice unmodified, this list of conditions, and the following
|
||||
* disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
||||
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
||||
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
||||
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Compute the inverse sqrt of x, i.e., rsqrt(x) = 1 / sqrt(x).
|
||||
*
|
||||
* First, filter out special cases:
|
||||
*
|
||||
* 1. rsqrt(+-0) = +-inf, and raise FE_DIVBYZERO exception.
|
||||
* 2. rsqrt(nan) = NaN.
|
||||
* 3. rsqrt(+inf) returns +0.
|
||||
* 2. rsqrt(x<0) = NaN, and raises FE_INVALID.
|
||||
*
|
||||
* If x is a subnormal, scale x into the normal range by x*0x1pN; while
|
||||
* recording the exponent of the scale factor N. Split the possibly
|
||||
* scaled x into f*2^n with f in [0.5,1). Set m=n or m=n-N (subnormal).
|
||||
* If n is odd, then set f = f/2 and increase n to n+1. Thus, f is
|
||||
* in [0.25,1) with n even.
|
||||
*
|
||||
* An initial estimate of y = rqrt[f](x) is 1 / sqrt[f](x). Exhaustive
|
||||
* testing of rsqrtf() gave a max ULP of 1.49; while testing 500M x in
|
||||
* [0,1000] gave a max ULP of 1.24 for rsqrt(). The value of y is then
|
||||
* used with one iteration of Goldschmidt's algorithm:
|
||||
*
|
||||
* z = x * y
|
||||
* h = y / 2
|
||||
* r = 0.5 - h * z
|
||||
* y = h * r + h
|
||||
*
|
||||
* A factor of 2 appears missing in the above, but it is included in the
|
||||
* exponent m.
|
||||
*/
|
||||
#include <fenv.h>
|
||||
#include <float.h>
|
||||
#include "math.h"
|
||||
#include "math_private.h"
|
||||
#include "fpmath.h"
|
||||
|
||||
#pragma STDC FENV_ACCESS ON
|
||||
|
||||
#if LDBL_MANT_DIG == 64
|
||||
|
||||
#ifdef _CC
|
||||
#undef _CC
|
||||
#endif
|
||||
#define _CC (0x1p32L + 1)
|
||||
|
||||
long double
|
||||
rsqrtl(long double x)
|
||||
{
|
||||
volatile static const double vzero = 0;
|
||||
static const double half = 0.5;
|
||||
uint32_t ux;
|
||||
int m, rnd;
|
||||
long double h, ph, pl, rh, rl, y, zh, zl;
|
||||
union IEEEl2bits u;
|
||||
|
||||
u.e = x;
|
||||
ux = (u.bits.manl | u.bits.manh);
|
||||
|
||||
/* x = +-0. Raise exception. */
|
||||
if ((u.bits.exp | ux) == 0)
|
||||
return (1 / x);
|
||||
|
||||
/* x is NaN or x is +-inf. */
|
||||
if (u.bits.exp == 0x7fff)
|
||||
return (ux ? (x + x) : (u.bits.sign ? vzero / vzero : 0));
|
||||
|
||||
/* x < 0. Raise exception. */
|
||||
if (u.bits.sign)
|
||||
return (vzero / vzero);
|
||||
|
||||
/*
|
||||
* If x is subnormal, then scale it into the normal range.
|
||||
* Split x into significand and exponent, x = f * 2^m, with
|
||||
* f in [0.5,1) and m a biased exponent.
|
||||
*/
|
||||
ENTERI();
|
||||
|
||||
if (u.bits.exp == 0) { /* Subnormal */
|
||||
u.e *= 0x1p512;
|
||||
m = u.bits.exp - 0x41fe;
|
||||
} else {
|
||||
m = u.bits.exp - 0x3ffe;
|
||||
}
|
||||
u.bits.exp = 0x3ffe;
|
||||
|
||||
/* m is odd. Put x into [0.25,5) and increase m. */
|
||||
if (m & 1) {
|
||||
u.e /= 2;
|
||||
m += 1;
|
||||
}
|
||||
m = -(m >> 1); /* Prepare for 2^(-m/2). */
|
||||
|
||||
y = 1 / sqrt((double)u.e); /* ~52-bit estimate. */
|
||||
y -= y * (u.e * y * y - 1) / 2; /* ~63-bit estimate. */
|
||||
|
||||
h = y / 2;
|
||||
|
||||
_MUL(u.e, y, zh, zl);
|
||||
_XMUL(zh, zl, h, 0, ph, pl);
|
||||
_XADD(-ph, -pl, half, 0, rh, rl);
|
||||
y = rh * h + h;
|
||||
|
||||
u.e = 1;
|
||||
u.xbits.expsign = 0x3fff + m + 1;
|
||||
RETURNI(y * u.e);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef _CC
|
||||
#undef _CC
|
||||
#endif
|
||||
#define _CC (0x1p57L + 1)
|
||||
|
||||
long double
|
||||
rsqrtl(long double x)
|
||||
{
|
||||
volatile static const double vzero = 0;
|
||||
int hx, m, rnd;
|
||||
long double y;
|
||||
|
||||
/* x = +-0. Raise exception. */
|
||||
if (x == 0)
|
||||
return (1 / x);
|
||||
|
||||
/* x is NaN. */
|
||||
if (isnan(x))
|
||||
return (x + x);
|
||||
|
||||
/* x is +-inf. */
|
||||
if (isinf(x))
|
||||
return (x > 0 ? 0 : vzero / vzero);
|
||||
|
||||
/* x < 0. Raise exception. */
|
||||
if (x < 0)
|
||||
return (vzero / vzero);
|
||||
|
||||
/*
|
||||
* If x is subnormal, then scale it into the normal range.
|
||||
* Split x into significand and exponent, x = f * 2^m, with
|
||||
* f in [0.5,1) and m a biased exponent.
|
||||
*/
|
||||
m = 0;
|
||||
if (!isnormal(x)) {
|
||||
x *= 0x1p114L;
|
||||
m = -114;
|
||||
}
|
||||
x = frexpl(x, &hx);
|
||||
m += hx;
|
||||
|
||||
/* m is odd. Put x into [0.25,5) and increase m. */
|
||||
if (m & 1) {
|
||||
x /= 2;
|
||||
m += 1;
|
||||
}
|
||||
m = -(m >> 1); /* Prepare for 2^(-m/2). */
|
||||
|
||||
y = 1 / sqrt((double)x); /* ~52-bit estimate. */
|
||||
y -= y * (x * y * y - 1) / 2; /* ~104-bit estimate. */
|
||||
|
||||
static const double half = 0.5;
|
||||
long double h, ph, pl, rh, rl, zh, zl;
|
||||
|
||||
h = y / 2;
|
||||
|
||||
rnd = fegetround();
|
||||
fesetround(FE_TOWARDZERO);
|
||||
_MUL(x, y, zh, zl);
|
||||
_XMUL(zh, zl, -h, 0, ph, pl);
|
||||
fesetround(rnd);
|
||||
|
||||
_XADD(ph, pl, half, 0, rh, rl);
|
||||
y = rh * h + h;
|
||||
m++;
|
||||
|
||||
RETURNI(ldexpl(y, m));
|
||||
}
|
||||
#endif
|
||||
Reference in New Issue
Block a user