grep(1): optimize -w/--word-regexp word boundary check

The -w option checks word boundaries before and after each potential
match by decoding the adjacent character.  This was done via the
heavyweight sscanf(3) with "%lc", which goes through the full scanf
parser and locale-aware mbrtowc(3) machinery even for simple ASCII.

Replace with a three-tier fast path:

1. ASCII bytes (< 0x80): simple isalnum(3) / '_' comparison
2. UTF-8 continuation bytes (0x80-0xBF): an interior byte of a multi-byte
   character is treated as a word character -> no further decoding needed
3. Multi-byte start bytes (>= 0xC0): decode with mbrtowc(3) directly
   instead of sscanf(3)/%lc, avoiding scanf parser overhead

Benchmark with ministat(1) (10 runs each):

Worst-case ASCII (100k lines of 100 'a' chars, -w 'a'):
    Difference at 95.0% confidence: -15.3% +/- 3.1%

Worst-case Unicode (50k lines of 100 accented 'e', -w 'e'):
    Difference at 95.0% confidence: -11.2% +/- 4.7%

Normal -w (500k lines, -w 'the'):
    Difference at 95.0% confidence: -18.1% +/- 3.6%

French text (100k lines, -w accented 'ete'):
    Difference at 95.0% confidence: -18.0% +/- 4.1%

Non -w case shows no regression.

Reviewed by:	kevans
Differential Revision:	https://reviews.freebsd.org/D57587

This commit is contained in:

Baptiste Daroussin

2026-06-10 16:41:39 +02:00

parent b4af6a4ccc

commit a74c77cc7b

1 changed files with 34 additions and 10 deletions

									
										usr.bin/grep/util.c
									
		+34
		-10
	
												View File
												
					@@ -490,6 +490,35 @@ litexec(const struct pat *pat, const char *string, size_t nmatch,

					#define iswword(x)	(iswalnum((x)) || (x) == L'_')

					#define iswword(x)	(iswalnum((x)) || (x) == L'_')

					/*
					 * Report whether the byte at `offset` in the line `dat` (which holds `len`
					 * bytes) belongs to a word character, i.e. an alphanumeric or '_'.
					 *
					 * Three cases are tried, cheapest first:
					 *  - a byte below 0x80 is classified with isalnum(3) or compared to '_';
					 *  - a byte of the form 10xxxxxx (a UTF-8 continuation byte) is reported
					 *    as a word character without any decoding;
					 *  - any other byte is decoded with mbrtowc(3) and the resulting wide
					 *    character is classified with iswalnum(3) or compared to L'_'.
					 *
					 * A sequence that mbrtowc(3) rejects as invalid, or that is cut short by
					 * the end of the line, is reported as a word character.  An offset at or
					 * beyond `len` yields false.
					 */
					static bool
					iswordchar(const char *dat, size_t len, size_t offset)
					{
						unsigned char ch;
						mbstate_t mbstate;
						wchar_t wc;
						size_t avail, n;

						if (offset >= len)
							return (false);

						ch = (unsigned char)dat[offset];
						if (ch < 0x80)
							return (isalnum(ch) || ch == '_');
						if ((ch & 0xC0) == 0x80)
							return (true);

						/*
						 * Decode starting at this byte.  Hand mbrtowc(3) no more bytes
						 * than the line still holds, so that a character starting near
						 * the end of the line cannot make it read past the buffer.
						 */
						avail = len - offset;
						if (avail > (size_t)MB_CUR_MAX)
							avail = (size_t)MB_CUR_MAX;
						memset(&mbstate, 0, sizeof(mbstate));
						n = mbrtowc(&wc, &dat[offset], avail, &mbstate);
						if (n == (size_t)-1 || n == (size_t)-2)
							return (true);
						return (iswalnum(wc) || wc == L'_');
					}

					/*

					/*

					 * Processes a line comparing it with the specified patterns.  Each pattern

					 * Processes a line comparing it with the specified patterns.  Each pattern

					 * is looped to be compared along with the full string, saving each and every

					 * is looped to be compared along with the full string, saving each and every

					@@ -501,7 +530,6 @@ static bool

					procline(struct parsec *pc)

					procline(struct parsec *pc)

					{

					{

						regmatch_t pmatch, lastmatch, chkmatch;

						regmatch_t pmatch, lastmatch, chkmatch;

						wchar_t wbegin, wend;

						size_t st, nst;

						size_t st, nst;

						unsigned int i;

						unsigned int i;

						int r = 0, leflags = eflags;

						int r = 0, leflags = eflags;

					@@ -567,18 +595,14 @@ procline(struct parsec *pc)

									continue;

									continue;

								/* Check for whole word match */

								/* Check for whole word match */

								if (wflag) {

								if (wflag) {

									wbegin = wend = L' ';

									if (pmatch.rm_so != 0 &&

									if (pmatch.rm_so != 0 &&

									    sscanf(&pc->ln.dat[pmatch.rm_so - 1],

									    iswordchar(pc->ln.dat, pc->ln.len,

									    "%lc", &wbegin) != 1)

									    pmatch.rm_so - 1))

										r = REG_NOMATCH;

										r = REG_NOMATCH;

									else if ((size_t)pmatch.rm_eo !=

									if (r == 0 && (size_t)pmatch.rm_eo !=

									    pc->ln.len &&

									    pc->ln.len &&

									    sscanf(&pc->ln.dat[pmatch.rm_eo],

									    iswordchar(pc->ln.dat, pc->ln.len,

									    "%lc", &wend) != 1)

									    pmatch.rm_eo))

										r = REG_NOMATCH;

									else if (iswword(wbegin) ||

									    iswword(wend))

										r = REG_NOMATCH;

										r = REG_NOMATCH;

									/*

									/*

									 * If we're doing whole word matching and we

									 * If we're doing whole word matching and we