libc: scalar memchr() in RISC-V assembly

Added an optimized memchr() implementation in RISC-V assembly and updated
the relevant manpage.

        │ memchr_baseline │            memchr_scalar            │
        │     sec/op      │   sec/op     vs base                │
Short         636.6µ ± 1%   495.9µ ± 1%  -22.10% (p=0.000 n=20)
Mid           279.7µ ± 1%   224.1µ ± 1%  -19.87% (p=0.000 n=20)
Long          138.8µ ± 0%   124.9µ ± 0%  -10.00% (p=0.000 n=20)
geomean       291.3µ        240.3µ       -17.48%

        │ memchr_baseline │            memchr_scalar             │
        │       B/s       │     B/s       vs base                │
Short        187.3Mi ± 1%   240.4Mi ± 1%  +28.37% (p=0.000 n=20)
Mid          426.2Mi ± 1%   531.9Mi ± 1%  +24.79% (p=0.000 n=20)
Long         859.0Mi ± 0%   954.4Mi ± 0%  +11.11% (p=0.000 n=20)
geomean      409.3Mi        496.0Mi       +21.19%

MFC after:	1 month
MFC to:		stable/15
Approved by:	mhorne, markj (mentor)
Reviewed by:	fuz
Sponsored by:	Google LLC (GSoC 2024)
Differential Revision:	https://reviews.freebsd.org/D46023

This commit is contained in:

Strahinja Stanišić

2024-07-17 13:19:52 +02:00

committed by

Robert Clausecker

parent 63ff982b17

commit 563efdd3bd

2 changed files with 189 additions and 0 deletions

									
										lib/libc/riscv/string/Makefile.inc
									
		+1
		
												View File
												
				@@ -1,2 +1,3 @@

				# Assembly (.S) sources listed here are appended to MDSRCS.

				MDSRCS+= \

					memchr.S \

					strrchr.S

									
										lib/libc/riscv/string/memchr.S
									
		+188
		
												View File
												
				@@ -0,0 +1,188 @@

				/*-

				 * SPDX-License-Identifier: BSD-2-Clause

				 *

				 * Copyright (c) 2024 Strahinja Stanisic <strajabot@FreeBSD.org>

				 */

				#include <machine/asm.h>

				/*

				 * a0 - const void *b

				 * a1 - int c

				 * a2 - size_t len

				 */

				ENTRY(memchr)

					/*

					 * Scan the `len` bytes starting at `b` for the byte (uint8_t)c,

					 * eight bytes at a time using aligned 64-bit loads. Returns (in a0)

					 * a pointer to the first matching byte, or NULL if there is none.

					 *

					 * Whole aligned words are loaded, so bytes outside [b, b + len)

					 * that share an aligned word with the buffer are read as well;

					 * matches in them are discarded (start mask for the first word,

					 * `iter < left` test in .Lfind_zero for the last word).

					 *

					 * Only a0-a5 and t0-t4 are written; the routine performs no

					 * stores and no calls.

					 *

					 * Register roles once setup is done:

					 *

					 * a0 - const char *ptr

					 * a1 - char cccccccc[8]

					 * a2 - char iter[8]

					 * a3 - uint8_t *end

					 * a4 - uint64_t *end_align

					 * a5 - uint64_t *end_unroll

					 */

					/* len == 0: nothing to search, return NULL */

					beqz a2, .Lno_match

					/* c = (uint8_t) c */

					andi a1, a1, 0xFF

					/*

					 * t0 = 0x0101010101010101

					 * t1 = 0x8080808080808080 (t0 << 7)

					 * t2 = b << 3

					 * cccccccc = (uint8_t)c * t0

					 * end = b + len;

					 * ptr = b & ~0b111

					 *

					 * If b + len wrapped around (end is not above b), the

					 * sltu/neg/and sequence forces end to 0, which acts as 2^64 in

					 * the modular pointer arithmetic below.

					 *

					 * The instructions for these steps are interleaved.

					 */

					add a3, a0, a2

					li t0, 0x01010101

					sltu t2, a0, a3

					slli t1, t0, 32

					neg t2, t2

					or t0, t0, t1

					and a3, a3, t2

					slli t1, t0, 7

					slli t2, a0, 3

					and a0, a0, ~0b111

					mul a1, t0, a1

					/* aligned load of the word containing b (up to 7 bytes precede b) */

					ld a2, (a0)

					/*

					 * mask_start = REP8_0x01 ^ (REP8_0x01 << t2)

					 * iter = iter ^ cccccccc

					 * iter = iter | mask_start

					 *

					 * sll only uses the low 6 bits of t2, i.e. 8 * (b & 7), so

					 * mask_start has 0x01 in every byte located before b. After the

					 * xor a matching byte is zero; or-ing in mask_start makes the

					 * bytes before b non-zero so they can never be reported.

					 */

					sll t2, t0, t2

					xor a2, a2, a1

					xor t2, t2, t0

					or a2, a2, t2

					/*

					 * has_zero(iter): iter = (iter - t0) & ~iter & t1

					 * The result is non-zero iff some byte of iter is zero, i.e.

					 * some byte of the loaded word equals c.

					 *

					 * end_align = (end + 7) & ~0b111;

					 */

					addi a4, a3, 7

					not t2, a2

					sub a2, a2, t0

					and t2, t2, t1

					andi a4, a4, ~0b111

					and a2, a2, t2

					/* ptr = ptr + 8 */

					addi a0, a0, 8

					bnez a2, .Lfind_zero

					/* if(ptr == end_align): the first word was also the last one */

					beq a0, a4, .Lno_match

					/* end_unroll = end_align & ~0b1111 */

					andi a5, a4, ~0b1111

					/*

					 * Instead of branching to check if `ptr` is 16-byte aligned:

					 *   - Probe the next 8 bytes for `c`

					 *   - Align `ptr` down to the nearest 16-byte boundary

					 *

					 * If `ptr` was already 16-byte aligned, those 8 bytes will be

					 * checked again inside the unrolled loop.

					 *

					 * This removes an unpredictable branch and improves performance.

					 *

					 * The probed word is below end_align because of the

					 * ptr != end_align check above.

					 */

					ld a2, (a0)

					xor a2, a2, a1

					not t2, a2

					sub a2, a2, t0

					and t2, t2, t1

					and a2, a2, t2

					addi a0, a0, 8

					bnez a2, .Lfind_zero

					andi a0, a0, ~0b1111

					/* while(ptr != end_unroll) */

					beq a0, a5, .Lskip_loop

				.Lloop:

					/*

					 * Unrolled body: has_zero() on two consecutive words per

					 * iteration, a2 for the word at ptr and t3 for the word at

					 * ptr + 8 (t2 and t4 are the respective temporaries).

					 */

					ld a2, (a0)

					ld t3, 8(a0)

					xor a2, a2, a1

					xor t3, t3, a1

					not t2, a2

					not t4, t3

					sub a2, a2, t0

					sub t3, t3, t0

					and t2, t2, t1

					and t4, t4, t1

					and a2, a2, t2

					and t3, t3, t4

					/* ptr now points just past the first word, as .Lfind_zero expects */

					addi a0, a0, 8

					bnez a2, .Lfind_zero

					/* move into iter for find_zero */

					mv a2, t3

					/* ptr now points just past the second word */

					addi a0, a0, 8

					bnez a2, .Lfind_zero

					bne a0, a5, .Lloop

				.Lskip_loop:

					/*

					 * there might be one 8byte left

					 * (when end_align is 8 past end_unroll)

					 */

					beq a0, a4, .Lno_match

					ld a2, (a0)

					xor a2, a2, a1

					not t2, a2

					sub a2, a2, t0

					and t2, t2, t1

					and a2, a2, t2

					addi a0, a0, 8

					beqz a2, .Lno_match

				.Lfind_zero:

					/*

					 * On entry: a2 = non-zero has_zero() result, a0 = address just

					 * past the word it was computed from, a3 = end. t0 and t1 are

					 * reused as scratch from here on.

					 *

					 * ptr = ptr - 8

					 * t1 = 0x0001020304050607

					 * iter = iter & (-iter)

					 * iter = iter >> 7

					 * iter = iter * t1

					 * iter = iter >> 56

					 *

					 * iter & (-iter) keeps only the lowest set bit, which is

					 * 0x80 << (8 * i) for the first matching byte i of the word

					 * (lowest address; assumes little-endian loads). After >> 7 it

					 * is 1 << (8 * i); multiplying by t1 shifts byte (7 - i) of the

					 * constant, whose value is i, into the top byte, so >> 56

					 * leaves i in iter.

					 *

					 * t1 is assembled with li/slli/addi, interleaved with the

					 * other steps.

					 */

					li t1, 0x10203000

					neg t0, a2

					slli t1, t1, 4

					and a2, a2, t0

					addi t1, t1, 0x405

					srli a2, a2, 7

					slli t1, t1, 16

					addi a0, a0, -8

					addi t1, t1, 0x607

					mul a2, a2, t1

					srli a2, a2, 56

					/* left = end - ptr */

					sub t0, a3, a0

					/*

					 * return iter < left ? ptr + iter : NULL

					 *

					 * Branchless select: t1 becomes all ones when the match lies

					 * before end and zero otherwise, then masks the result pointer.

					 */

					sltu t1, a2, t0

					neg t1, t1

					add a0, a0, a2

					and a0, a0, t1

					ret

				.Lno_match:

					/* return NULL */

					li a0, 0

					ret

				END(memchr)