Skip to content

Commit 2b899a8

Browse files
Improved consuming word boundaries
1 parent c3f2fb3 commit 2b899a8

File tree

2 files changed

+188
-17
lines changed

2 files changed

+188
-17
lines changed

src/next-char.ts

Lines changed: 118 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
import { CharSet } from "refa";
2-
import { Alternative, Element } from "regexpp/ast";
2+
import { Alternative, Element, WordBoundaryAssertion } from "regexpp/ast";
33
import {
44
getMatchingDirectionFromAssertionKind,
55
isStrictBackreference,
66
getLengthRange,
77
hasSomeDescendant,
88
isEmptyBackreference,
99
MatchingDirection,
10+
invertMatchingDirection,
1011
} from "./basic";
1112
import { toCharSet } from "./to-char-set";
1213
import { followPaths } from "./follow";
@@ -122,6 +123,20 @@ export interface FirstPartiallyConsumedChar {
122123
look: FirstLookChar;
123124
}
124125

126+
class ImplOptions {
127+
readonly currentWordBoundaries: WordBoundaryAssertion[] = [];
128+
129+
isCurrentWordBoundary(element: WordBoundaryAssertion): boolean {
130+
return this.currentWordBoundaries.some(e => e === element);
131+
}
132+
pushWordBoundary(element: WordBoundaryAssertion): void {
133+
this.currentWordBoundaries.push(element);
134+
}
135+
popWordBoundary(): void {
136+
this.currentWordBoundaries.pop();
137+
}
138+
}
139+
125140
/**
126141
* If a character is returned, it is guaranteed to be a superset of the actual character. If the given element is
127142
* always of zero length, then the empty character set will be returned.
@@ -142,32 +157,75 @@ export function getFirstConsumedChar(
142157
direction: MatchingDirection,
143158
flags: ReadonlyFlags
144159
): FirstConsumedChar {
160+
const options = new ImplOptions();
161+
145162
if (Array.isArray(element)) {
146-
return getFirstConsumedCharAlternativesImpl(element as readonly Alternative[], direction, flags);
163+
return getFirstConsumedCharAlternativesImpl(element as readonly Alternative[], direction, flags, options);
147164
} else {
148-
return getFirstConsumedCharImpl(element as Element | Alternative, direction, flags);
165+
return getFirstConsumedCharImpl(element as Element | Alternative, direction, flags, options);
149166
}
150167
}
151168
function getFirstConsumedCharAlternativesImpl(
152169
element: readonly Alternative[],
153170
direction: MatchingDirection,
154-
flags: ReadonlyFlags
171+
flags: ReadonlyFlags,
172+
options: ImplOptions
155173
): FirstConsumedChar {
156174
return firstConsumedCharUnion(
157-
element.map(e => getFirstConsumedCharImpl(e, direction, flags)),
175+
element.map(e => getFirstConsumedCharImpl(e, direction, flags, options)),
158176
flags
159177
);
160178
}
161179
function getFirstConsumedCharImpl(
162180
element: Element | Alternative,
163181
direction: MatchingDirection,
164-
flags: ReadonlyFlags
182+
flags: ReadonlyFlags,
183+
options: ImplOptions
165184
): FirstConsumedChar {
166185
switch (element.type) {
167186
case "Assertion":
168187
switch (element.kind) {
169188
case "word":
170-
return misdirectedAssertion();
189+
if (options.isCurrentWordBoundary(element)) {
190+
// this means that the value of a word boundary assertion depends on itself indirectly.
191+
// we have to stop the recursion here because infinite recursion is possible otherwise.
192+
return misdirectedAssertion();
193+
} else {
194+
options.pushWordBoundary(element);
195+
const before = getFirstCharAfterImpl(
196+
element,
197+
invertMatchingDirection(direction),
198+
flags,
199+
options
200+
);
201+
options.popWordBoundary();
202+
203+
// Remember:
204+
// \B == (?<=\w)(?=\w)|(?<!\w)(?!\w)
205+
// \b == (?<!\w)(?=\w)|(?<=\w)(?!\w)
206+
207+
const word = Chars.word(flags);
208+
209+
if (before.edge) {
210+
// this forces our hand a little. Since the previous "character" might be the start/end of
211+
// the string, we have to enter the alternative that starts with `(?<!\w)`
212+
if (before.char.isDisjointWith(word)) {
213+
return wordAssertion(element.negate);
214+
} else {
215+
// it might be either of the alternatives
216+
return misdirectedAssertion();
217+
}
218+
} else {
219+
if (before.char.isDisjointWith(word)) {
220+
return wordAssertion(element.negate);
221+
} else if (before.char.isSubsetOf(word)) {
222+
return wordAssertion(!element.negate);
223+
} else {
224+
// it might be either of the alternatives
225+
return misdirectedAssertion();
226+
}
227+
}
228+
}
171229
case "end":
172230
case "start":
173231
if (getMatchingDirectionFromAssertionKind(element.kind) === direction) {
@@ -190,7 +248,8 @@ function getFirstConsumedCharImpl(
190248
const firstChar = getFirstConsumedCharAlternativesImpl(
191249
element.alternatives,
192250
direction,
193-
flags
251+
flags,
252+
options
194253
);
195254
const range = getLengthRange(element.alternatives);
196255
if (firstChar.empty || !range) {
@@ -210,7 +269,8 @@ function getFirstConsumedCharImpl(
210269
const firstChar = getFirstConsumedCharAlternativesImpl(
211270
element.alternatives,
212271
direction,
213-
flags
272+
flags,
273+
options
214274
);
215275
return emptyWord(firstConsumedToLook(firstChar));
216276
}
@@ -231,7 +291,7 @@ function getFirstConsumedCharImpl(
231291
return emptyWord();
232292
}
233293

234-
const firstChar = getFirstConsumedCharImpl(element.element, direction, flags);
294+
const firstChar = getFirstConsumedCharImpl(element.element, direction, flags, options);
235295
if (element.min === 0) {
236296
return firstConsumedCharUnion([emptyWord(), firstChar], flags);
237297
} else {
@@ -249,7 +309,7 @@ function getFirstConsumedCharImpl(
249309
return firstConsumedCharConcat(
250310
(function* (): Iterable<FirstConsumedChar> {
251311
for (const e of elements) {
252-
yield getFirstConsumedCharImpl(e, direction, flags);
312+
yield getFirstConsumedCharImpl(e, direction, flags, options);
253313
}
254314
})(),
255315
flags
@@ -258,13 +318,13 @@ function getFirstConsumedCharImpl(
258318

259319
case "CapturingGroup":
260320
case "Group":
261-
return getFirstConsumedCharAlternativesImpl(element.alternatives, direction, flags);
321+
return getFirstConsumedCharAlternativesImpl(element.alternatives, direction, flags, options);
262322

263323
case "Backreference": {
264324
if (isEmptyBackreference(element)) {
265325
return emptyWord();
266326
}
267-
const resolvedChar = getFirstConsumedCharImpl(element.resolved, direction, flags);
327+
const resolvedChar = getFirstConsumedCharImpl(element.resolved, direction, flags, options);
268328

269329
// the resolved character is only exact if it is only a single character.
270330
// i.e. /(\w)\1/ here the (\w) will capture exactly any word character, but the \1 can only match
@@ -306,6 +366,15 @@ function getFirstConsumedCharImpl(
306366
exact: true,
307367
});
308368
}
369+
function wordAssertion(negate: boolean): FirstPartiallyConsumedChar {
370+
const word = Chars.word(flags);
371+
372+
return emptyWord({
373+
char: negate ? word.negate() : word,
374+
edge: negate,
375+
exact: true,
376+
});
377+
}
309378
function emptyWord(look?: FirstLookChar): FirstPartiallyConsumedChar {
310379
return firstConsumedCharEmptyWord(flags, look);
311380
}
@@ -505,6 +574,14 @@ export function getFirstConsumedCharAfter(
505574
afterThis: Element,
506575
direction: MatchingDirection,
507576
flags: ReadonlyFlags
577+
): FirstConsumedChar {
578+
return getFirstConsumedCharAfterImpl(afterThis, direction, flags, new ImplOptions());
579+
}
580+
function getFirstConsumedCharAfterImpl(
581+
afterThis: Element,
582+
direction: MatchingDirection,
583+
flags: ReadonlyFlags,
584+
options: ImplOptions
508585
): FirstConsumedChar {
509586
type State = Readonly<FirstConsumedChar>;
510587
const result = followPaths<State>(
@@ -516,7 +593,7 @@ export function getFirstConsumedCharAfter(
516593
return firstConsumedCharUnion(states, flags);
517594
},
518595
enter(element, state, direction): State {
519-
const first = getFirstConsumedChar(element, direction, flags);
596+
const first = getFirstConsumedCharImpl(element, direction, flags, options);
520597
return firstConsumedCharConcat([state, first], flags);
521598
},
522599
continueInto(): boolean {
@@ -543,7 +620,15 @@ export function getFirstCharAfter(
543620
direction: MatchingDirection,
544621
flags: ReadonlyFlags
545622
): FirstLookChar {
546-
return firstConsumedToLook(getFirstConsumedCharAfter(afterThis, direction, flags));
623+
return getFirstCharAfterImpl(afterThis, direction, flags, new ImplOptions());
624+
}
625+
function getFirstCharAfterImpl(
626+
afterThis: Element,
627+
direction: MatchingDirection,
628+
flags: ReadonlyFlags,
629+
options: ImplOptions
630+
): FirstLookChar {
631+
return firstConsumedToLook(getFirstConsumedCharAfterImpl(afterThis, direction, flags, options));
547632
}
548633

549634
/**
@@ -565,6 +650,14 @@ export function getFirstConsumedCharAfterWithContributors(
565650
afterThis: Element,
566651
direction: MatchingDirection,
567652
flags: ReadonlyFlags
653+
): WithContributors<FirstConsumedChar> {
654+
return getFirstConsumedCharAfterWithContributorsImpl(afterThis, direction, flags, new ImplOptions());
655+
}
656+
function getFirstConsumedCharAfterWithContributorsImpl(
657+
afterThis: Element,
658+
direction: MatchingDirection,
659+
flags: ReadonlyFlags,
660+
option: ImplOptions
568661
): WithContributors<FirstConsumedChar> {
569662
type State = Readonly<WithContributors<FirstConsumedChar>>;
570663
const result = followPaths<State>(
@@ -586,7 +679,7 @@ export function getFirstConsumedCharAfterWithContributors(
586679
},
587680

588681
enter(element, state, direction): State {
589-
const first = getFirstConsumedChar(element, direction, flags);
682+
const first = getFirstConsumedCharImpl(element, direction, flags, option);
590683
return {
591684
char: firstConsumedCharConcat([state.char, first], flags),
592685
contributors: [...state.contributors, element],
@@ -614,6 +707,14 @@ export function getFirstCharAfterWithContributors(
614707
direction: MatchingDirection,
615708
flags: ReadonlyFlags
616709
): WithContributors<FirstLookChar> {
617-
const { char, contributors } = getFirstConsumedCharAfterWithContributors(afterThis, direction, flags);
710+
return getFirstCharAfterWithContributorsImpl(afterThis, direction, flags, new ImplOptions());
711+
}
712+
function getFirstCharAfterWithContributorsImpl(
713+
afterThis: Element,
714+
direction: MatchingDirection,
715+
flags: ReadonlyFlags,
716+
option: ImplOptions
717+
): WithContributors<FirstLookChar> {
718+
const { char, contributors } = getFirstConsumedCharAfterWithContributorsImpl(afterThis, direction, flags, option);
618719
return { char: firstConsumedToLook(char), contributors };
619720
}

tests/next-char.ts

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,76 @@ describe(RAA.getFirstConsumedCharAfter.name, function () {
244244
look: { char: toCharSet(/[^]/), exact: true, edge: true },
245245
},
246246
},
247+
248+
// word boundary assertions
249+
{
250+
regexp: /(?<afterThis>a|b)\b/,
251+
expected: {
252+
char: toCharSet(/[]/),
253+
exact: true,
254+
empty: true,
255+
look: { char: toCharSet(/[\W]/), exact: true, edge: true },
256+
},
257+
},
258+
{
259+
regexp: /(?:a|b)(?<afterThis>)\b/,
260+
expected: {
261+
char: toCharSet(/[]/),
262+
exact: true,
263+
empty: true,
264+
look: { char: toCharSet(/[\W]/), exact: true, edge: true },
265+
},
266+
},
267+
{
268+
regexp: /(?:a|b(?<afterThis>))\b/,
269+
expected: {
270+
char: toCharSet(/[]/),
271+
exact: true,
272+
empty: true,
273+
look: { char: toCharSet(/[\W]/), exact: true, edge: true },
274+
},
275+
},
276+
{
277+
regexp: /;(?<afterThis>)\b[\d.]+/,
278+
expected: {
279+
char: toCharSet(/[\d]/),
280+
exact: true,
281+
empty: false,
282+
},
283+
},
284+
{
285+
regexp: /;(?<afterThis>)\b\b\b\b\b\b[\d.]+/,
286+
expected: {
287+
char: toCharSet(/[\d]/),
288+
exact: true,
289+
empty: false,
290+
},
291+
},
292+
{
293+
regexp: /;(?<afterThis>)\B[\d.]+/,
294+
expected: {
295+
char: toCharSet(/[.]/),
296+
exact: true,
297+
empty: false,
298+
},
299+
},
300+
{
301+
regexp: /;(?<afterThis>)\B\B\B[\d.]+/,
302+
expected: {
303+
char: toCharSet(/[.]/),
304+
exact: true,
305+
empty: false,
306+
},
307+
},
308+
{
309+
regexp: /^(?<afterThis>)\b/,
310+
expected: {
311+
char: toCharSet(/[]/),
312+
exact: true,
313+
empty: true,
314+
look: { char: toCharSet(/[\w]/), exact: true, edge: false },
315+
},
316+
},
247317
]);
248318

249319
function test(cases: TestCase[]): void {

0 commit comments

Comments
 (0)