libc: scalar strchrnul() in RISC-V assembly

Scalar implementation of strchrnul() in RISC-V assembly and changes to the
corresponding manpage.

Performance was benchmarked on a HiFive Unmatched (SiFive HF105-001) board
using: https://github.com/clausecker/strperf

os: FreeBSD
arch: riscv
        │ strchrnul_baseline │          strchrnul_scalar           │
        │       sec/op       │   sec/op     vs base                │
Short            680.2µ ± 5%   435.3µ ± 0%  -36.01% (p=0.000 n=20)
Mid              314.7µ ± 3%   221.4µ ± 0%  -29.63% (p=0.000 n=20)
Long             152.3µ ± 0%   138.5µ ± 0%   -9.08% (p=0.000 n=20)
geomean          319.5µ        237.2µ       -25.75%

        │ strchrnul_baseline │          strchrnul_scalar          │
        │       MiB/s        │   MiB/s     vs base                │
Short             183.8 ± 5%   287.2 ± 0%  +56.27% (p=0.000 n=20)
Mid               397.3 ± 3%   564.6 ± 0%  +42.12% (p=0.000 n=20)
Long              820.5 ± 0%   902.5 ± 0%   +9.99% (p=0.000 n=20)
geomean           391.3        527.0       +34.68%

MFC after:	1 month
MFC to:		stable/15
Approved by:	markj (mentor)
Reviewed by:	fuz
Sponsored by:	Google LLC (GSoC 2024)
Differential Revision:	https://reviews.freebsd.org/D46047
This commit is contained in:
Strahinja Stanišić
2024-07-19 19:58:04 +02:00
committed by Robert Clausecker
parent 474a80d3ff
commit 08af0bbc9c
2 changed files with 117 additions and 0 deletions
+1
View File
@@ -4,4 +4,5 @@ MDSRCS+= \
memset.S \
strlen.S \
strnlen.S \
strchrnul.S \
strrchr.S
+116
View File
@@ -0,0 +1,116 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
*/
#include <machine/asm.h>
/* Export strchrnul as a weak alias for __strchrnul. */
.weak strchrnul
.set strchrnul, __strchrnul
/*
* char *strchrnul(const char *str, int c)
*
* Scans str for the first byte equal to (char)c and returns a pointer to
* it; if the terminating NUL byte comes first, returns a pointer to that
* NUL instead.
*
* In:
* a0 - const char *str
* a1 - int c;
* Out:
* a0 - pointer to the first byte equal to (char)c or to the NUL
* Registers written: a0-a3, t0-t3. No stack or memory is written.
*
* The string is examined one 8-byte aligned word at a time with ld, so
* bytes of the same aligned word that lie before str or after the
* terminating NUL are read as well. The byte index is derived from the
* bit position inside the loaded word, with the least significant byte
* taken as the lowest address.
*/
ENTRY(__strchrnul)
/*
* Register roles inside the function:
* a0 - const char *ptr; 8-byte aligned address of the current word
* a1 - char cccccccc[8]; c replicated into all eight bytes
* a2 - char iter[8]; the loaded word, later the combined match mask
* a3 - scratch word for the second of the two zero-byte tests
* t0 - 0x0101010101010101 (reloaded with an index table at the end)
* t1 - 0x8080808080808080
* t2, t3 - temporaries (alignment offset, start mask, inverted words)
*/
/* int to char: keep only the low 8 bits of c */
andi a1, a1, 0xFF
/* t0 = 0x0101010101010101, built from the 32-bit half */
li t0, 0x01010101
slli t1, t0, 32
or t0, t0, t1
/* t1 = 0x8080808080808080 (top bit of every byte) */
slli t1, t0, 7
/* spread char across bytes: a1 = c * 0x0101010101010101 */
mul a1, a1, t0
/* align_offset: t2 = number of bytes str lies past an 8-byte boundary */
andi t2, a0, 0b111
/* align pointer down to that boundary */
andi a0, a0, ~0b111
/* if pointer is aligned skip to loop, no head masking is needed */
beqz t2, .Lloop
/* first, partially valid word: it contains t2 bytes that precede str */
ld a2, (a0)
/*
* mask_start calculation: t2 = 0x01 in each byte that precedes str.
* t2 becomes -(8 * offset); srl uses only the low 6 bits of the shift
* amount on RV64, so t0 is shifted right by 64 - 8 * offset bits, which
* leaves 0x01 in the low `offset` bytes and zero elsewhere.
*/
slli t2, t2, 3
neg t2, t2
srl t2, t0, t2
/*
* fill bytes before start with non-zero so that they can match neither
* test: a3 = word with the leading bytes forced non-zero (NUL test),
* a2 = word ^ cccccccc with the leading bytes forced non-zero (c test).
*/
or a3, a2, t2
xor a2, a2, a1
or a2, a2, t2
/*
* has_zero for both \0 (a3) and c (a2):
* (x - 0x01..01) & ~x & 0x80..80 is non-zero exactly when x contains a
* zero byte, and its lowest set bit is the top bit of the first such
* byte. Bytes above the first zero byte may be flagged as well because
* of the borrow; only the lowest set bit is used later.
* The instructions of the two tests are interleaved.
*/
not t3, a3
not t2, a2
sub a3, a3, t0
sub a2, a2, t0
and a3, a3, t3
and a2, a2, t2
and a3, a3, t1
and a2, a2, t1
/* if \0 or c was found, exit; a2 = combined mask of both tests */
or a2, a2, a3
/* advance to the next word; .Lfind_char undoes this step on a hit */
addi a0, a0, 8
bnez a2, .Lfind_char
/* main loop: one fully valid aligned word per iteration */
.Lloop:
ld a2, (a0)
/* has_zero for both \0 or c: a2 tests the word, a3 tests word ^ c */
xor a3, a2, a1
not t2, a2
not t3, a3
sub a2, a2, t0
sub a3, a3, t0
and a2, a2, t2
and a3, a3, t3
and a2, a2, t1
and a3, a3, t1
/* if \0 or c was found, exit; otherwise continue with the next word */
or a2, a2, a3
addi a0, a0, 8
beqz a2, .Lloop
/*
* a2 = non-zero match mask, a0 = address of the matching word + 8.
* Convert the lowest set bit of a2 into a byte index and add it to a0.
*/
.Lfind_char:
/* step back to the word that produced the match */
addi a0, a0, -8
/* isolate lowest set bit: a2 & -a2 = 0x80 << (8*k), k = first hit */
neg t0, a2
and a2, a2, t0
/* idx array: byte p of t0 holds the value 7 - p */
li t0, 0x0001020304050607
/* move the bit from the top to the bottom of its byte: a2 = 2^(8*k) */
srli a2, a2, 7
/* lowest set bit is 2^(8*k)
* multiplying by it shifts the idx array in t0 by k bytes to the left,
* which moves the table entry with value k into the highest byte */
mul a2, a2, t0
/* highest byte contains idx of first zero; a2 = k */
srli a2, a2, 56
/* return the address of byte k in the matching word */
add a0, a0, a2
ret
END(__strchrnul)