[libm] implementation of rsqrt, rsqrtf, and rsqrtl

From the PR:
The attached diff implements the inverse square root function, i.e,
rsqrt(x) = 1 / sqrt(x).  Exhaustive testing of the float version
suggests that it is correctly rounded in round-to-nearest for all
test values in the range [0x1p-127,0x1p126].
Exhaustive testing of rsqrt and rsqrtl cannot be done, but 1100M
values of x for rsqrt and 400M values for rsqrtl were tested.  All
tested values were correctly rounded.

I do not have access to LD128 (i.e., IEEE 128-bit floating point)
hardware, so the implementation of rsqrtl() is untested.

The following is a summary of changes to source code.

* lib/msun/Makefile:
  . Add s_rsqrt.c and s_rsqrtf.c to COMMON_SRCS.
  . For non-53-bit long double targets, add s_rsqrtl.c to COMMON_SRCS.
  . Add MLINKS for rsqrt.3, rsqrtf.3, and rsqrtl.3 to sqrt.3.

* lib/msun/Symbol.map:
  . Add rsqrt, rsqrtf, and rsqrtl to the Symbol map for shared libm.so.

* lib/msun/man/sqrt.3:
  . Update the sqrt.3 manual page to include information for rsqrt[fl].
  . Note, these function come from ISO C23 (and IEEE-754 2008).

* lib/msun/src/math.h:
  . Add prototypes for new functions.

* lib/msun/src/math_private.h:
  . Add _SPLIT, _FAST2SUM, _SLOW2SUM, _XADD, _MUL, and _XMUL
    macros to perform type-type arthimetic (i.e., float-float).

* src/s_rsqrt.c:
  . New file with the implementation of 'double rsqrt(double)'.
  . For 53-bit long double targets, add a weak reference for rsqrtl.

* src/s_rsqrtf.c:
  . New file with the implementation of 'float rsqrt(float)'.

* src/s_rsqrtl.c
  . New file with the implementation of 'long double rsqrt(long double)'.
    Note, the LD80 version uses bit twiddling and LD128 version is a
    straight C language implementation.  The LD128 is untested due to
    lack of hardware.

PR:	295089
MFC after:	1 week
This commit is contained in:
Steve Kargl
2026-05-08 17:06:08 +03:00
committed by Konstantin Belousov
parent 8b03193289
commit 3085fc9d97
8 changed files with 654 additions and 5 deletions
+3 -3
View File
@@ -88,7 +88,7 @@ COMMON_SRCS= b_tgamma.c \
s_lround.c s_lroundf.c s_lroundl.c s_modff.c \
s_nan.c s_nearbyint.c s_nextafter.c s_nextafterf.c \
s_nexttowardf.c s_remquo.c s_remquof.c \
s_rint.c s_rintf.c s_round.c s_roundf.c \
s_rint.c s_rintf.c s_round.c s_roundf.c s_rsqrt.c s_rsqrtf.c \
s_scalbln.c s_scalbn.c s_scalbnf.c s_signbit.c \
s_signgam.c s_significand.c s_significandf.c s_sin.c \
s_sincos.c s_sincosf.c s_sinf.c \
@@ -143,7 +143,7 @@ COMMON_SRCS+= b_tgammal.c catrigl.c \
s_fminimum_numl.c s_fminimum_mag_numl.c \
s_frexpl.c s_logbl.c s_logl.c s_nanl.c \
s_nextafterl.c s_nexttoward.c s_remquol.c s_rintl.c s_roundl.c \
s_scalbnl.c s_sinl.c s_sincosl.c s_sinpil.c \
s_scalbnl.c s_sinl.c s_sincosl.c s_sinpil.c s_rsqrtl.c \
s_tanhl.c s_tanl.c s_tanpil.c s_truncl.c w_cabsl.c
# Work around this warning from gcc:
# lib/msun/ld80/e_powl.c:275:1: error: floating constant exceeds range of
@@ -289,7 +289,7 @@ MLINKS+=sincos.3 sincosf.3 sin.3 sincosl.3
MLINKS+=sinh.3 sinhf.3 sinh.3 sinhl.3
MLINKS+=sinpi.3 sinpif.3 sinpi.3 sinpil.3
MLINKS+=sqrt.3 cbrt.3 sqrt.3 cbrtf.3 sqrt.3 cbrtl.3 sqrt.3 sqrtf.3 \
sqrt.3 sqrtl.3
sqrt.3 sqrtl.3 sqrt.3 rsqrt.3 sqrt.3 rsqrtf.3 sqrt.3 rsqrtl.3
MLINKS+=tan.3 tanf.3 tan.3 tanl.3
MLINKS+=tanh.3 tanhf.3 tanh.3 tanhl.3
MLINKS+=tanpi.3 tanpif.3 tanpi.3 tanpil.3
+3
View File
@@ -344,4 +344,7 @@ FBSD_1.9 {
fminimum_mag_num;
fminimum_mag_numf;
fminimum_mag_numl;
rsqrt;
rsqrtf;
rsqrtl;
};
+51 -2
View File
@@ -25,17 +25,20 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.Dd February 15, 2020
.Dd May 7, 2026
.Dt SQRT 3
.Os
.Sh NAME
.Nm cbrt ,
.Nm cbrtf ,
.Nm cbrtl ,
.Nm rsqrt ,
.Nm rsqrtf ,
.Nm rsqrtl,
.Nm sqrt ,
.Nm sqrtf ,
.Nm sqrtl
.Nd cube root and square root functions
.Nd cube root, square root, and inverse square root functions
.Sh LIBRARY
.Lb libm
.Sh SYNOPSIS
@@ -47,6 +50,12 @@
.Ft long double
.Fn cbrtl "long double x"
.Ft double
.Fn rsqrt "double x"
.Ft float
.Fn rsqrtf "float x"
.Ft long double
.Fn rsqrtl "long double x"
.Ft double
.Fn sqrt "double x"
.Ft float
.Fn sqrtf "float x"
@@ -63,6 +72,15 @@ the cube root of
.Fa x .
.Pp
The
.Fn rsqrt ,
.Fn rsqrtf ,
and
.Fn rsqrtl
functions compute
the inverse square root of
.Fa x .
.Pp
The
.Fn sqrt ,
.Fn sqrtf ,
and
@@ -77,6 +95,23 @@ The
and
.Fn cbrtl
functions return the requested cube root.
.Pp
The
.Fn rsqrt ,
.Fn rsqrtf ,
and
.Fn rsqrtl
functions return 1 divided by the square root of
.Fa x
unless an error occurs.
An attempt to take the
.Fn rsqrt
of negative
.Fa x
raises an invalid exception and causes an \*(Na to be returned.
The inverse square root of \*(Pm0 returns \*(Pm\(if and
raises a divide-by-zero exception.
.Pp
The
.Fn sqrt ,
.Fn sqrtf ,
@@ -104,6 +139,13 @@ and
.Fn sqrtl
functions conform to
.St -isoC-99 .
The
.Fn rsqrt ,
.Fn rsqrtf ,
and
.Fn rsqrtl
functions conform to ISO/IEC 9899:2024 ("ISO C23").
.\" .St -isoC-24 .
.Sh HISTORY
The
.Fn cbrt
@@ -120,3 +162,10 @@ The
.Fn cbrtl
function appeared in
.Fx 9.0 .
The
.Fn rsqrt ,
.Fn rsqrtf ,
and
.Fn rsqrtl
functions appeared in
.Fx 16.0 .
+3
View File
@@ -544,6 +544,9 @@ long double fminimum_mag_numl(long double, long double);
double fminimum_num(double, double);
float fminimum_numf(float, float);
long double fminimum_numl(long double, long double);
double rsqrt(double);
float rsqrtf(float);
long double rsqrtl(long double);
#endif /* __ISO_C_VISIBLE >= 2023 */
__END_DECLS
+83
View File
@@ -471,6 +471,89 @@ do { \
(a) = __tmp; \
} while (0)
/*
* Split x into high and low bits where CC is 0x1p(N/2) + 1 where
* N is rounded up for types with odd precisions.
*
* #define _CC (0x1p12F + 1) // float
* #define _CC (0x1p27 + 1) // double
* #define _CC (0x1p32L + 1) // long double (LD80)
* #define _CC (0x1p57L + 1) // long double (LD128)
*/
#define _SPLIT(x, xh, xl) \
do { \
typeof(x) __t1; \
__t1 = (x) * _CC; \
xh = __t1 + ((x) - __t1); \
xl = (x) - xh; \
} while(0)
/*
* FAST2SUM requires |x| >= |y|. x and y are full precision.
* Note, _2SUMF(x,y) above destroys x and y.
*/
#define _FAST2SUM(x, y, hi, lo) \
do { \
hi = (x) + (y); \
lo = (y) - (hi - (x)); \
} while(0)
/*
* SLOW2SUM does not require |x| >= |y|. Here, x and y are full precision.
* The t1 temporary variable is volatile to prevent compiler optimizations.
* Note, _2SUM(x,y) above destroys x and y.
*/
#define _SLOW2SUM(x, y, hi, lo) \
do { \
volatile typeof(x) __t1; \
typeof(x) __t2; \
hi = (x) + (y); \
__t1 = hi - (y); \
__t2 = hi - __t1; \
lo = ((x) - __t1) + ((y) - __t2); \
} while(0)
/*
* x and y are full precision quantities that have been split into high
* and low parts via the _SPLIT macro. x and y are added to give z as
* high and low parts.
*/
#define _XADD(xh, xl, yh, yl, zh, zl) \
do { \
typeof(xh) __s1, __s2, __s3, __s4, __s5, __s6; \
_SLOW2SUM(xh, yh, __s1, __s2); \
_SLOW2SUM(xl, yl, __s3, __s4); \
_FAST2SUM(__s1, __s2 + __s3, __s5, __s6); \
_FAST2SUM(__s5, __s6 + __s4, zh, zl); \
} while(0)
/*
* x and y are full precision quantities. r1 and r2 are full precision
* high and low parts of the multiplication x * y.
*/
#define _MUL(x, y, r1 ,r2) \
do { \
typeof(x) __xh, __xl, __yh, __yl; \
typeof(x) __t1; \
_SPLIT(x, __xh, __xl); \
_SPLIT(y, __yh, __yl); \
r1 = (x) * (y); \
__t1 = __xh * __yl + (__xh * __yh - r1); \
r2 = __xl * __yl + (__xl * __yh + __t1); \
} while(0)
/*
* x and y are full precision quantities that have been split into high
* and low parts via the _SPLIT macro. x and y are multiplied to give z
* as high and low parts.
*/
#define _XMUL(xh, xl, yh, yl, ph, pl) \
do { \
_MUL(xh, yh, ph, pl); \
pl += xl * yl + xl * yh + xh * yl; \
} while(0)
/*
* Common routine to process the arguments to nan(), nanf(), and nanl().
*/
+153
View File
@@ -0,0 +1,153 @@
/*-
* Copyright (c) 2026 Steven G. Kargl
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Compute the inverse sqrt of x, i.e., rsqrt(x) = 1 / sqrt(x).
*
* First, filter out special cases:
*
* 1. rsqrt(+-0) = +-inf, and raise FE_DIVBYZERO exception.
* 2. rsqrt(nan) = NaN.
* 3. rsqrt(+inf) returns +0.
* 2. rsqrt(x<0) = NaN, and raises FE_INVALID.
*
* If x is a subnormal, scale x into the normal range by x*0x1pN; while
* recording the exponent of the scale factor N. Split the possibly
* scaled x into f*2^n with f in [0.5,1). Set m=n or m=n-N (subnormal).
* If n is odd, then set f = f/2 and increase n to n+1. Thus, f is
* in [0.25,1) with n even.
*
* An initial estimate of y = rqrt[f](x) is 1 / sqrt[f](x). Exhaustive
* testing of rsqrtf() gave a max ULP of 1.49; while testing 500M x in
* [0,1000] gave a max ULP of 1.24 for rsqrt(). The value of y is then
* used with one iteration of Goldschmidt's algorithm:
*
* z = x * y
* h = y / 2
* r = 0.5 - h * z
* y = h * r + h
*
* A factor of 2 appears missing in the above, but it is included in the
* exponent m.
*/
#include <fenv.h>
#include <float.h>
#include "math.h"
#include "math_private.h"
#pragma STDC FENV_ACCESS ON
#ifdef _CC
#undef _CC
#endif
#define _CC (0x1p27 + 1)
double
rsqrt(double x)
{
volatile static const double vzero = 0;
static const double half = 0.5;
int hx, m, rnd;
uint32_t lx, ux;
double h, ph, pl, rh, rl, y, zh, zl;
EXTRACT_WORDS(hx, lx, x);
ux = (uint32_t)hx & 0x7fffffff;
/* x = +-0. Raise exception. */
if ((ux | lx) == 0)
return (1 / x);
/* x is NaN. */
if (ux > 0x7ff00000)
return (x + x);
/* x is +-inf. */
if (ux == 0x7ff00000)
return (hx & 0x80000000 ? vzero / vzero : 0.);
/* x < 0. Raise exception. */
if (hx < 0)
return (vzero / vzero);
/*
* If x is subnormal, then scale it into the normal range.
* Split x into significand and exponent, x = f * 2^m, with
* f in [0.5,1) and m a biased exponent.
*/
m = 0;
if (hx < 0x00100000) { /* Subnormal */
x *= 0x1p54;
GET_HIGH_WORD(hx, x);
m = -54;
}
m += (hx >> 20) - 1022;
hx = (hx & 0x000fffff) | 0x3fe00000;
SET_HIGH_WORD(x, hx);
/* m is odd. Put x into [0.25,5) and increase m. */
if (m & 1) {
x /= 2;
m += 1;
}
m = -(m >> 1); /* Prepare for 2^(-m/2). */
y = 1 / sqrt(x); /* ~52-bit estimate. */
h = y / 2;
/*
* For values of x with a representation of 0x1.ffffffffffffepN
* with N an odd integer, the computed rsqrt() is not correctly
* rounded in round-to-nearest without toggling the rounding mode
* to FE_TOWARDZERO. Note, FE_DOWNWARD also works. However,
* messing with the rounding mode is expensive, so only do it
* when necessary. Example, x = 3.9999999999999991
* gives y --> hx = 0x3ff00000, lx = 0x00000001
*/
EXTRACT_WORDS(hx, lx, y);
if ((hx & 0x000fffff) == 0 && lx == 1) {
rnd = fegetround();
fesetround(FE_TOWARDZERO);
_MUL(x, y, zh, zl);
_XMUL(zh, zl, h, 0, ph, pl);
fesetround(rnd);
} else {
_MUL(x, y, zh, zl);
_XMUL(zh, zl, h, 0, ph, pl);
}
_XADD(-ph, -pl, half, 0, rh, rl);
y = rh * h + h;
ux = (m + 1024) << 20;
INSERT_WORDS(x, ux, 0);
return (y *= x);
}
#if LDBL_MANT_DIG == 53
__weak_reference(rsqrt, rsqrtl);
#endif
+155
View File
@@ -0,0 +1,155 @@
/*-
* Copyright (c) 2026 Steven G. Kargl
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Compute the inverse sqrt of x, i.e., rsqrt(x) = 1 / sqrt(x).
*
* First, filter out special cases:
*
* 1. rsqrt(+-0) = +-inf, and raise FE_DIVBYZERO exception.
* 2. rsqrt(nan) = NaN.
* 3. rsqrt(+inf) returns +0.
* 2. rsqrt(x<0) = NaN, and raises FE_INVALID.
*
* If x is a subnormal, scale x into the normal range by x*0x1pN; while
* recording the exponent of the scale factor N. Split the possibly
* scaled x into f*2^n with f in [0.5,1). Set m=n or m=n-N (subnormal).
* If n is odd, then set f = f/2 and increase n to n+1. Thus, f is
* in [0.25,1) with n even.
*
* An initial estimate of y = rqrt[f](x) is 1 / sqrt[f](x). Exhaustive
* testing of rsqrtf() gave a max ULP of 1.49; while testing 500M x in
* [0,1000] gave a max ULP of 1.24 for rsqrt(). The value of y is then
* used with one iteration of Goldschmidt's algorithm:
*
* z = x * y
* h = y / 2
* r = 0.5 - h * z
* y = h * r + h
*
* A factor of 2 appears missing in the above, but it is included in the
* exponent m.
*/
#include <fenv.h>
#include <float.h>
#include "math.h"
#include "math_private.h"
#pragma STDC FENV_ACCESS ON
#ifdef _CC
#undef _CC
#endif
#define _CC (0x1p12F + 1)
float
rsqrtf(float x)
{
volatile static const float vzero = 0;
static const float half = 0.5;
uint32_t ix, ux;
int m, rnd;
float h, ph, pl, rh, rl, y, zh, zl;
GET_FLOAT_WORD(ix, x);
ux = ix & 0x7fffffff;
/* x = +-0. Raise exception. */
if (ux == 0)
return (1 / x);
/* x is NaN. */
if (ux > 0x7f800000)
return (x + x);
/* x is +-inf. */
if (ux == 0x7f800000)
return (ix & 0x80000000 ? vzero / vzero : 0.F);
/* x < 0. Raise exception. */
if (ix & 0x80000000)
return (vzero / vzero);
/*
* If x is subnormal, then scale it into the normal range.
* Split x into significand and exponent, x = f * 2^m, with
* f in [0.5,1) and m a biased exponent.
*/
m = 0;
if (ux < 0x00800000) { /* Subnormal */
x *= 0x1p25f;
GET_FLOAT_WORD(ix, x);
m = -25;
}
m += (ix >> 23) - 126; /* Unbiased exponent */
ix = (ix & 0x007fffff) | 0x3f000000;
SET_FLOAT_WORD(x, ix); /* x is in [0.5,1). */
/* m is odd. Put x into [0.25,5) and increase m. */
if (m & 1) {
x /= 2;
m += 1;
}
m = -(m >> 1); /* Prepare for 2^(-m/2). */
/*
* Exhaustive testing of rsqrtf(x) = 1 / sqrtf(x) with x in
* [0x1p-127,0x1p126] shows the this approximation gives a
* 22- to 23-bit estimate of rsqrt(f). This is equivalent to
* a max ulp of ~1.49.
*/
y = 1 / sqrtf(x);
h = y / 2;
/*
* For values of x with a representation of 0x1.fffffcpN with
* N an odd integer, the computed rsqrtf() is not correctly
* rounded in round-to-nearest without toggling the rounding
* mode to FE_TOWARDZERO. Note, FE_DOWNWARD also works.
* However, messing with the rounding mode is expensive, so
* only do it when necessary. Example, x = 0x1.fffffcp3 gives
* y --> 0x3f800001.
*/
GET_FLOAT_WORD(ix, y);
if ((ix & 0x000fffff) == 1) {
rnd = fegetround();
fesetround(FE_TOWARDZERO);
_MUL(x, y, zh, zl);
_XMUL(zh, zl, h, 0, ph, pl);
fesetround(rnd);
} else {
_MUL(x, y, zh, zl);
_XMUL(zh, zl, h, 0, ph, pl);
}
_XADD(-ph, -pl, half, 0, rh, rl);
y = h * rh + h;
ix = (uint32_t)(m + 128) << 23;
SET_FLOAT_WORD(x, ix);
return (y *= x);
}
+203
View File
@@ -0,0 +1,203 @@
/*-
* Copyright (c) 2026 Steven G. Kargl
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* Compute the inverse sqrt of x, i.e., rsqrt(x) = 1 / sqrt(x).
*
* First, filter out special cases:
*
* 1. rsqrt(+-0) = +-inf, and raise FE_DIVBYZERO exception.
* 2. rsqrt(nan) = NaN.
* 3. rsqrt(+inf) returns +0.
* 2. rsqrt(x<0) = NaN, and raises FE_INVALID.
*
* If x is a subnormal, scale x into the normal range by x*0x1pN; while
* recording the exponent of the scale factor N. Split the possibly
* scaled x into f*2^n with f in [0.5,1). Set m=n or m=n-N (subnormal).
* If n is odd, then set f = f/2 and increase n to n+1. Thus, f is
* in [0.25,1) with n even.
*
* An initial estimate of y = rqrt[f](x) is 1 / sqrt[f](x). Exhaustive
* testing of rsqrtf() gave a max ULP of 1.49; while testing 500M x in
* [0,1000] gave a max ULP of 1.24 for rsqrt(). The value of y is then
* used with one iteration of Goldschmidt's algorithm:
*
* z = x * y
* h = y / 2
* r = 0.5 - h * z
* y = h * r + h
*
* A factor of 2 appears missing in the above, but it is included in the
* exponent m.
*/
#include <fenv.h>
#include <float.h>
#include "math.h"
#include "math_private.h"
#include "fpmath.h"
#pragma STDC FENV_ACCESS ON
#if LDBL_MANT_DIG == 64
#ifdef _CC
#undef _CC
#endif
#define _CC (0x1p32L + 1)
long double
rsqrtl(long double x)
{
volatile static const double vzero = 0;
static const double half = 0.5;
uint32_t ux;
int m, rnd;
long double h, ph, pl, rh, rl, y, zh, zl;
union IEEEl2bits u;
u.e = x;
ux = (u.bits.manl | u.bits.manh);
/* x = +-0. Raise exception. */
if ((u.bits.exp | ux) == 0)
return (1 / x);
/* x is NaN or x is +-inf. */
if (u.bits.exp == 0x7fff)
return (ux ? (x + x) : (u.bits.sign ? vzero / vzero : 0));
/* x < 0. Raise exception. */
if (u.bits.sign)
return (vzero / vzero);
/*
* If x is subnormal, then scale it into the normal range.
* Split x into significand and exponent, x = f * 2^m, with
* f in [0.5,1) and m a biased exponent.
*/
ENTERI();
if (u.bits.exp == 0) { /* Subnormal */
u.e *= 0x1p512;
m = u.bits.exp - 0x41fe;
} else {
m = u.bits.exp - 0x3ffe;
}
u.bits.exp = 0x3ffe;
/* m is odd. Put x into [0.25,5) and increase m. */
if (m & 1) {
u.e /= 2;
m += 1;
}
m = -(m >> 1); /* Prepare for 2^(-m/2). */
y = 1 / sqrt((double)u.e); /* ~52-bit estimate. */
y -= y * (u.e * y * y - 1) / 2; /* ~63-bit estimate. */
h = y / 2;
_MUL(u.e, y, zh, zl);
_XMUL(zh, zl, h, 0, ph, pl);
_XADD(-ph, -pl, half, 0, rh, rl);
y = rh * h + h;
u.e = 1;
u.xbits.expsign = 0x3fff + m + 1;
RETURNI(y * u.e);
}
#else
#ifdef _CC
#undef _CC
#endif
#define _CC (0x1p57L + 1)
long double
rsqrtl(long double x)
{
volatile static const double vzero = 0;
int hx, m, rnd;
long double y;
/* x = +-0. Raise exception. */
if (x == 0)
return (1 / x);
/* x is NaN. */
if (isnan(x))
return (x + x);
/* x is +-inf. */
if (isinf(x))
return (x > 0 ? 0 : vzero / vzero);
/* x < 0. Raise exception. */
if (x < 0)
return (vzero / vzero);
/*
* If x is subnormal, then scale it into the normal range.
* Split x into significand and exponent, x = f * 2^m, with
* f in [0.5,1) and m a biased exponent.
*/
m = 0;
if (!isnormal(x)) {
x *= 0x1p114L;
m = -114;
}
x = frexpl(x, &hx);
m += hx;
/* m is odd. Put x into [0.25,5) and increase m. */
if (m & 1) {
x /= 2;
m += 1;
}
m = -(m >> 1); /* Prepare for 2^(-m/2). */
y = 1 / sqrt((double)x); /* ~52-bit estimate. */
y -= y * (x * y * y - 1) / 2; /* ~104-bit estimate. */
static const double half = 0.5;
long double h, ph, pl, rh, rl, zh, zl;
h = y / 2;
rnd = fegetround();
fesetround(FE_TOWARDZERO);
_MUL(x, y, zh, zl);
_XMUL(zh, zl, -h, 0, ph, pl);
fesetround(rnd);
_XADD(ph, pl, half, 0, rh, rl);
y = rh * h + h;
m++;
RETURNI(ldexpl(y, m));
}
#endif