libc: scalar strlen() in RISC-V assembly

Includes a scalar implementation of strlen() for the RISC-V
architecture and changes to the corresponding manpage.

Performance was benchmarked before and after using:
https://github.com/clausecker/strperf

os: FreeBSD
arch: riscv
        │ strlen_baseline │             strlen_scalar              │
        │     sec/op      │   sec/op     vs base                   │
Short        541.2µ ± 17%   401.6µ ± 0%  -25.78% (p=0.000 n=21+20)
Mid          249.6µ ±  3%   191.9µ ± 0%  -23.13% (p=0.000 n=21+20)
Long         124.6µ ±  0%   110.7µ ± 0%  -11.13% (p=0.000 n=21+20)
geomean      256.3µ         204.3µ       -20.26%

        │ strlen_baseline │              strlen_scalar               │
        │       B/s       │      B/s       vs base                   │
Short       220.3Mi ± 14%    296.8Mi ± 0%  +34.74% (p=0.000 n=21+20)
Mid         477.6Mi ±  3%    621.3Mi ± 0%  +30.09% (p=0.000 n=21+20)
Long        956.9Mi ±  0%   1076.7Mi ± 0%  +12.52% (p=0.000 n=21+20)
geomean     465.2Mi          583.4Mi       +25.40%

MFC after:	1 month
MFC to:		stable/15
Approved by:	mhorne, markj (mentor)
Reviewed by:	fuz
Sponsored by:	Google LLC (GSoC 2024)
Differential Revision:	https://reviews.freebsd.org/D45693
This commit is contained in:
Strahinja Stanišić
2024-05-17 16:23:48 +02:00
committed by Robert Clausecker
parent 164156058e
commit e09c1583ed
2 changed files with 78 additions and 0 deletions
+1
View File
@@ -1,4 +1,5 @@
MDSRCS+= \
memchr.S \
memset.S \
strlen.S \
strrchr.S
+77
View File
@@ -0,0 +1,77 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>
*/
#include <machine/asm.h>
/*
* https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
* uses haszero(v) (((v) - 0x01010101UL) & ~(v) & 0x80808080UL)
* which evaluates to non-zero when there is a zero byte in v
*
* register a0 - char *s
*/
ENTRY(strlen)
/*
* Return in a0 the number of bytes that precede the first zero byte of
* the string whose address is passed in a0.
*
* The string is scanned one 8-byte word at a time, always loading from
* 8-byte aligned addresses, and the haszero test is applied to each
* word. The index arithmetic relies on little-endian byte order: the
* lowest-addressed byte of a word is its least significant byte.
* NOTE(review): the aligned loads may read bytes outside the string
* (before its start and after its terminator); presumably this is safe
* because an aligned 8-byte load cannot straddle a page - confirm.
*
* register a0 - char *str_start
* register a1 - char *str_ptr (address of the word held in a2)
* register a2 - char[8] iter (current word, then its haszero mask)
* registers a3, t0-t3 - scratch
*
* No stack is used and no other register is written.
*/
/*
* load constants for haszero:
* t0 = 0x01 in every byte, t1 = 0x80 in every byte
*/
li t0, 0x0101010101010101
slli t1, t0, 7 # 0x8080808080808080, avoid a second li
/*
* check alignment of str_start: round the pointer down to an 8-byte
* boundary and load the word that contains the first byte of the
* string. If str_start was already aligned the word contains only
* string bytes and goes straight to the zero test of the loop.
*/
andi a1, a0, ~0b111
ld a2, (a1)
beq a1, a0, .Lhas_zero
/*
* fill bytes before str_start with non-zero
*
* With k = str_start & 7 (1..7 on this path), the low 6 bits of t3
* end up as 64 - 8*k. srl only looks at those low 6 bits, so
* t0 >> t3 leaves 0x01 in each of the k least significant bytes,
* which are exactly the bytes located before str_start. OR-ing them
* into the word keeps those bytes from being mistaken for the
* terminator.
*/
slli t2, a0, 3
addi t3, t2, -64
neg t3, t3
srl t3, t0, t3
or a2, a2, t3
/*
* unrolled iteration of haszero for the first (patched) word:
* a2 = (a2 - t0) & ~a2 & t1, which is non-zero iff a byte was zero
*/
not t2, a2
sub a2, a2, t0
and a2, a2, t2
and a2, a2, t1
bnez a2, .Lfind_zero
.Lloop_has_zero:
/* fetch the next word; a1 keeps pointing at the word held in a2 */
ld a2, 8(a1)
addi a1, a1, 8 # move ptr to next 8byte
.Lhas_zero:
/* haszero(a2); keep looping while no byte of the word is zero */
not t2, a2
sub a2, a2, t0
and a2, a2, t2
and a2, a2, t1
beqz a2, .Lloop_has_zero
.Lfind_zero:
/*
* a2 now has bit 7 set in the byte position of the first zero byte.
* Bytes above it may be flagged as well (the subtraction borrows
* through a zero byte), so only the lowest set bit is trusted.
*
* use (iter & -iter) to isolate lowest set bit
*/
sub a3, zero, a2 # a3 = -iter
and t1, a2, a3 # t1 = (iter & -iter)
/* t0 and t1 are reused from here on; the haszero constants are dead */
li t0, 0x0001020304050607
srli t1, t1, 7
/*
* lowest set bit is 2^(8*k)
* multiplying by it shifts the idx array in t0 by k bytes to the left,
* which moves the array entry holding the value k into the top byte
*/
mul t1, t1, t0
/* highest byte contains idx of first zero */
srli t1, t1, 56
/* length = (address of current word + idx of zero byte) - str_start */
add a1, a1, t1
sub a0, a1, a0
ret
END(strlen)