grep(1): optimize -w/--word-regexp word boundary check
The -w option checks word boundaries before and after each potential
match by decoding the adjacent character. This was done via the
heavyweight sscanf(3) with "%lc", which goes through the full scanf
parser and locale-aware mbrtowc(3) machinery even for simple ASCII.
Replace with a three-tier fast path:
1. ASCII bytes (< 0x80): simple isalnum(3) / '_' comparison
2. UTF-8 continuation bytes (0x80-0xBF): a byte in the interior of a
multi-byte character is treated as a word character, so no further decoding is needed
3. Multi-byte start bytes (>= 0xC0): decode with mbrtowc(3) directly
instead of sscanf(3)/%lc, avoiding scanf parser overhead
Benchmark with ministat(1) (10 runs each):
Worst-case ASCII (100k lines of 100 'a' chars, -w 'a'):
Difference at 95.0% confidence: -15.3% +/- 3.1%
Worst-case Unicode (50k lines of 100 accented 'e', -w 'e'):
Difference at 95.0% confidence: -11.2% +/- 4.7%
Normal -w (500k lines, -w 'the'):
Difference at 95.0% confidence: -18.1% +/- 3.6%
French text (100k lines, -w accented 'ete'):
Difference at 95.0% confidence: -18.0% +/- 4.1%
Non -w case shows no regression.
Reviewed by: kevans
Differential Revision: https://reviews.freebsd.org/D57587
This commit is contained in:
+34
-10
@@ -490,6 +490,35 @@ litexec(const struct pat *pat, const char *string, size_t nmatch,
|
||||
|
||||
#define	iswword(x)	(iswalnum((x)) || (x) == L'_')

/*
 * Report whether the byte at `offset` in the `len`-byte buffer `dat`
 * belongs to a word character (alphanumeric or '_').
 *
 * dat:    line buffer; not required to be NUL-terminated, and never read
 *         at or beyond dat[len].
 * len:    number of valid bytes in dat.
 * offset: index of the byte to classify.
 *
 * Returns false when offset is outside the buffer.  Otherwise:
 *   - bytes below 0x80 are classified with isalnum(3) / '_';
 *   - in a multi-byte locale, a byte of the form 10xxxxxx (a UTF-8
 *     continuation byte) is reported as a word character without decoding;
 *   - any other byte is decoded with mbrtowc(3) and classified with
 *     iswword(); an invalid or truncated sequence is reported as a word
 *     character, so the caller rejects the match.
 */
static bool
iswordchar(const char *dat, size_t len, size_t offset)
{
	unsigned char ch;
	mbstate_t mbstate;
	wchar_t wc;
	size_t n;

	if (offset >= len)
		return (false);

	ch = (unsigned char)dat[offset];
	if (ch < 0x80)
		return (isalnum(ch) || ch == '_');

	/*
	 * The continuation-byte shortcut is only meaningful when characters
	 * can span several bytes.  In a single-byte locale every byte is a
	 * complete character and has to be classified on its own, otherwise
	 * punctuation in the 0x80-0xBF range would count as part of a word.
	 */
	if (MB_CUR_MAX > 1 && (ch & 0xC0) == 0x80)
		return (true);

	/*
	 * Decode from a fresh conversion state.  The byte count is limited
	 * to what is left in the buffer so that mbrtowc never looks past
	 * dat[len - 1]; a sequence cut off by the end of the buffer then
	 * yields (size_t)-2.
	 */
	memset(&mbstate, 0, sizeof(mbstate));
	n = mbrtowc(&wc, &dat[offset], len - offset, &mbstate);
	if (n == (size_t)-1 || n == (size_t)-2)
		return (true);

	return (iswword(wc));
}
|
||||
|
||||
/*
|
||||
* Processes a line comparing it with the specified patterns. Each pattern
|
||||
* is looped to be compared along with the full string, saving each and every
|
||||
@@ -501,7 +530,6 @@ static bool
|
||||
procline(struct parsec *pc)
|
||||
{
|
||||
regmatch_t pmatch, lastmatch, chkmatch;
|
||||
wchar_t wbegin, wend;
|
||||
size_t st, nst;
|
||||
unsigned int i;
|
||||
int r = 0, leflags = eflags;
|
||||
@@ -567,18 +595,14 @@ procline(struct parsec *pc)
|
||||
continue;
|
||||
/* Check for whole word match */
|
||||
if (wflag) {
|
||||
wbegin = wend = L' ';
|
||||
if (pmatch.rm_so != 0 &&
|
||||
sscanf(&pc->ln.dat[pmatch.rm_so - 1],
|
||||
"%lc", &wbegin) != 1)
|
||||
iswordchar(pc->ln.dat, pc->ln.len,
|
||||
pmatch.rm_so - 1))
|
||||
r = REG_NOMATCH;
|
||||
else if ((size_t)pmatch.rm_eo !=
|
||||
if (r == 0 && (size_t)pmatch.rm_eo !=
|
||||
pc->ln.len &&
|
||||
sscanf(&pc->ln.dat[pmatch.rm_eo],
|
||||
"%lc", &wend) != 1)
|
||||
r = REG_NOMATCH;
|
||||
else if (iswword(wbegin) ||
|
||||
iswword(wend))
|
||||
iswordchar(pc->ln.dat, pc->ln.len,
|
||||
pmatch.rm_eo))
|
||||
r = REG_NOMATCH;
|
||||
/*
|
||||
* If we're doing whole word matching and we
|
||||
|
||||
Reference in New Issue
Block a user