1
1
import { CharSet } from "refa" ;
2
- import { Alternative , Element } from "regexpp/ast" ;
2
+ import { Alternative , Element , WordBoundaryAssertion } from "regexpp/ast" ;
3
3
import {
4
4
getMatchingDirectionFromAssertionKind ,
5
5
isStrictBackreference ,
6
6
getLengthRange ,
7
7
hasSomeDescendant ,
8
8
isEmptyBackreference ,
9
9
MatchingDirection ,
10
+ invertMatchingDirection ,
10
11
} from "./basic" ;
11
12
import { toCharSet } from "./to-char-set" ;
12
13
import { followPaths } from "./follow" ;
@@ -122,6 +123,20 @@ export interface FirstPartiallyConsumedChar {
122
123
look : FirstLookChar ;
123
124
}
124
125
126
/**
 * Mutable bookkeeping that is created once per public entry point and handed down through the
 * `*Impl` functions.
 *
 * It tracks the word boundary assertions (`\b` / `\B`) that are currently being resolved. This lets
 * callers detect an assertion whose value indirectly depends on itself and stop before recursing
 * without bound.
 */
class ImplOptions {
	/** Stack of the word boundary assertions currently being resolved (most recent last). */
	readonly currentWordBoundaries: WordBoundaryAssertion[] = [];

	/**
	 * Returns whether the given assertion is currently being resolved.
	 *
	 * Assertions are compared by identity, not by structure.
	 */
	isCurrentWordBoundary(element: WordBoundaryAssertion): boolean {
		return this.currentWordBoundaries.includes(element);
	}
	/** Records that the given assertion is now being resolved. Pair each call with `popWordBoundary`. */
	pushWordBoundary(element: WordBoundaryAssertion): void {
		this.currentWordBoundaries.push(element);
	}
	/** Drops the most recently pushed assertion. Does nothing if the stack is empty. */
	popWordBoundary(): void {
		this.currentWordBoundaries.pop();
	}
}
139
+
125
140
/**
126
141
* If a character is returned, it is guaranteed to be a superset of the actual character. If the given element is
127
142
* always of zero length, then the empty character set will be returned.
@@ -142,32 +157,75 @@ export function getFirstConsumedChar(
142
157
direction : MatchingDirection ,
143
158
flags : ReadonlyFlags
144
159
) : FirstConsumedChar {
160
+ const options = new ImplOptions ( ) ;
161
+
145
162
if ( Array . isArray ( element ) ) {
146
- return getFirstConsumedCharAlternativesImpl ( element as readonly Alternative [ ] , direction , flags ) ;
163
+ return getFirstConsumedCharAlternativesImpl ( element as readonly Alternative [ ] , direction , flags , options ) ;
147
164
} else {
148
- return getFirstConsumedCharImpl ( element as Element | Alternative , direction , flags ) ;
165
+ return getFirstConsumedCharImpl ( element as Element | Alternative , direction , flags , options ) ;
149
166
}
150
167
}
151
168
/**
 * Determines the first consumed character of every given alternative and combines the individual
 * results with `firstConsumedCharUnion`.
 *
 * The same `options` instance is forwarded to each per-alternative call.
 */
function getFirstConsumedCharAlternativesImpl(
	element: readonly Alternative[],
	direction: MatchingDirection,
	flags: ReadonlyFlags,
	options: ImplOptions
): FirstConsumedChar {
	const perAlternative: FirstConsumedChar[] = [];
	for (const alternative of element) {
		perAlternative.push(getFirstConsumedCharImpl(alternative, direction, flags, options));
	}
	return firstConsumedCharUnion(perAlternative, flags);
}
161
179
function getFirstConsumedCharImpl (
162
180
element : Element | Alternative ,
163
181
direction : MatchingDirection ,
164
- flags : ReadonlyFlags
182
+ flags : ReadonlyFlags ,
183
+ options : ImplOptions
165
184
) : FirstConsumedChar {
166
185
switch ( element . type ) {
167
186
case "Assertion" :
168
187
switch ( element . kind ) {
169
188
case "word" :
170
- return misdirectedAssertion ( ) ;
189
+ if ( options . isCurrentWordBoundary ( element ) ) {
190
+ // this means that the value of a word boundary assertion depends on itself indirectly.
191
+ // we have to stop the recursion here because infinite recursion is possible otherwise.
192
+ return misdirectedAssertion ( ) ;
193
+ } else {
194
+ options . pushWordBoundary ( element ) ;
195
+ const before = getFirstCharAfterImpl (
196
+ element ,
197
+ invertMatchingDirection ( direction ) ,
198
+ flags ,
199
+ options
200
+ ) ;
201
+ options . popWordBoundary ( ) ;
202
+
203
+ // Remember:
204
+ // \B == (?<=\w)(?=\w)|(?<!\w)(?!\w)
205
+ // \b == (?<!\w)(?=\w)|(?<=\w)(?!\w)
206
+
207
+ const word = Chars . word ( flags ) ;
208
+
209
+ if ( before . edge ) {
210
+ // this forces our hand a little. Since the previous "character" might be the start/end of
211
+ // the string, we have to enter the alternative that starts with `(?<!\w)`
212
+ if ( before . char . isDisjointWith ( word ) ) {
213
+ return wordAssertion ( element . negate ) ;
214
+ } else {
215
+ // it might be either of the alternatives
216
+ return misdirectedAssertion ( ) ;
217
+ }
218
+ } else {
219
+ if ( before . char . isDisjointWith ( word ) ) {
220
+ return wordAssertion ( element . negate ) ;
221
+ } else if ( before . char . isSubsetOf ( word ) ) {
222
+ return wordAssertion ( ! element . negate ) ;
223
+ } else {
224
+ // it might be either of the alternatives
225
+ return misdirectedAssertion ( ) ;
226
+ }
227
+ }
228
+ }
171
229
case "end" :
172
230
case "start" :
173
231
if ( getMatchingDirectionFromAssertionKind ( element . kind ) === direction ) {
@@ -190,7 +248,8 @@ function getFirstConsumedCharImpl(
190
248
const firstChar = getFirstConsumedCharAlternativesImpl (
191
249
element . alternatives ,
192
250
direction ,
193
- flags
251
+ flags ,
252
+ options
194
253
) ;
195
254
const range = getLengthRange ( element . alternatives ) ;
196
255
if ( firstChar . empty || ! range ) {
@@ -210,7 +269,8 @@ function getFirstConsumedCharImpl(
210
269
const firstChar = getFirstConsumedCharAlternativesImpl (
211
270
element . alternatives ,
212
271
direction ,
213
- flags
272
+ flags ,
273
+ options
214
274
) ;
215
275
return emptyWord ( firstConsumedToLook ( firstChar ) ) ;
216
276
}
@@ -231,7 +291,7 @@ function getFirstConsumedCharImpl(
231
291
return emptyWord ( ) ;
232
292
}
233
293
234
- const firstChar = getFirstConsumedCharImpl ( element . element , direction , flags ) ;
294
+ const firstChar = getFirstConsumedCharImpl ( element . element , direction , flags , options ) ;
235
295
if ( element . min === 0 ) {
236
296
return firstConsumedCharUnion ( [ emptyWord ( ) , firstChar ] , flags ) ;
237
297
} else {
@@ -249,7 +309,7 @@ function getFirstConsumedCharImpl(
249
309
return firstConsumedCharConcat (
250
310
( function * ( ) : Iterable < FirstConsumedChar > {
251
311
for ( const e of elements ) {
252
- yield getFirstConsumedCharImpl ( e , direction , flags ) ;
312
+ yield getFirstConsumedCharImpl ( e , direction , flags , options ) ;
253
313
}
254
314
} ) ( ) ,
255
315
flags
@@ -258,13 +318,13 @@ function getFirstConsumedCharImpl(
258
318
259
319
case "CapturingGroup" :
260
320
case "Group" :
261
- return getFirstConsumedCharAlternativesImpl ( element . alternatives , direction , flags ) ;
321
+ return getFirstConsumedCharAlternativesImpl ( element . alternatives , direction , flags , options ) ;
262
322
263
323
case "Backreference" : {
264
324
if ( isEmptyBackreference ( element ) ) {
265
325
return emptyWord ( ) ;
266
326
}
267
- const resolvedChar = getFirstConsumedCharImpl ( element . resolved , direction , flags ) ;
327
+ const resolvedChar = getFirstConsumedCharImpl ( element . resolved , direction , flags , options ) ;
268
328
269
329
// the resolved character is only exact if it is only a single character.
270
330
// i.e. /(\w)\1/ here the (\w) will capture exactly any word character, but the \1 can only match
@@ -306,6 +366,15 @@ function getFirstConsumedCharImpl(
306
366
exact : true ,
307
367
} ) ;
308
368
}
369
/**
 * Builds the zero-length result for a word boundary assertion whose outcome has been decided.
 *
 * With `negate === false` the look char is the word character set and `edge` is false; with
 * `negate === true` it is the complement of the word character set and `edge` is true. The
 * result is always marked as exact.
 */
function wordAssertion(negate: boolean): FirstPartiallyConsumedChar {
	const wordChars = Chars.word(flags);
	const look: FirstLookChar = negate
		? { char: wordChars.negate(), edge: true, exact: true }
		: { char: wordChars, edge: false, exact: true };
	return emptyWord(look);
}
309
378
/**
 * Shorthand for `firstConsumedCharEmptyWord` that supplies the `flags` value from the
 * surrounding scope. The optional `look` is passed through unchanged.
 */
function emptyWord(look?: FirstLookChar): FirstPartiallyConsumedChar {
	const result = firstConsumedCharEmptyWord(flags, look);
	return result;
}
@@ -505,6 +574,14 @@ export function getFirstConsumedCharAfter(
505
574
afterThis : Element ,
506
575
direction : MatchingDirection ,
507
576
flags : ReadonlyFlags
577
+ ) : FirstConsumedChar {
578
+ return getFirstConsumedCharAfterImpl ( afterThis , direction , flags , new ImplOptions ( ) ) ;
579
+ }
580
+ function getFirstConsumedCharAfterImpl (
581
+ afterThis : Element ,
582
+ direction : MatchingDirection ,
583
+ flags : ReadonlyFlags ,
584
+ options : ImplOptions
508
585
) : FirstConsumedChar {
509
586
type State = Readonly < FirstConsumedChar > ;
510
587
const result = followPaths < State > (
@@ -516,7 +593,7 @@ export function getFirstConsumedCharAfter(
516
593
return firstConsumedCharUnion ( states , flags ) ;
517
594
} ,
518
595
enter ( element , state , direction ) : State {
519
- const first = getFirstConsumedChar ( element , direction , flags ) ;
596
+ const first = getFirstConsumedCharImpl ( element , direction , flags , options ) ;
520
597
return firstConsumedCharConcat ( [ state , first ] , flags ) ;
521
598
} ,
522
599
continueInto ( ) : boolean {
@@ -543,7 +620,15 @@ export function getFirstCharAfter(
543
620
direction : MatchingDirection ,
544
621
flags : ReadonlyFlags
545
622
) : FirstLookChar {
546
- return firstConsumedToLook ( getFirstConsumedCharAfter ( afterThis , direction , flags ) ) ;
623
+ return getFirstCharAfterImpl ( afterThis , direction , flags , new ImplOptions ( ) ) ;
624
+ }
625
/**
 * Internal counterpart of `getFirstCharAfter` that reuses the caller's `ImplOptions` instead of
 * creating a fresh one.
 *
 * It computes the first consumed character after `afterThis` and converts it with
 * `firstConsumedToLook`.
 */
function getFirstCharAfterImpl(
	afterThis: Element,
	direction: MatchingDirection,
	flags: ReadonlyFlags,
	options: ImplOptions
): FirstLookChar {
	const consumed = getFirstConsumedCharAfterImpl(afterThis, direction, flags, options);
	return firstConsumedToLook(consumed);
}
548
633
549
634
/**
@@ -565,6 +650,14 @@ export function getFirstConsumedCharAfterWithContributors(
565
650
afterThis : Element ,
566
651
direction : MatchingDirection ,
567
652
flags : ReadonlyFlags
653
+ ) : WithContributors < FirstConsumedChar > {
654
+ return getFirstConsumedCharAfterWithContributorsImpl ( afterThis , direction , flags , new ImplOptions ( ) ) ;
655
+ }
656
+ function getFirstConsumedCharAfterWithContributorsImpl (
657
+ afterThis : Element ,
658
+ direction : MatchingDirection ,
659
+ flags : ReadonlyFlags ,
660
+ option : ImplOptions
568
661
) : WithContributors < FirstConsumedChar > {
569
662
type State = Readonly < WithContributors < FirstConsumedChar > > ;
570
663
const result = followPaths < State > (
@@ -586,7 +679,7 @@ export function getFirstConsumedCharAfterWithContributors(
586
679
} ,
587
680
588
681
enter ( element , state , direction ) : State {
589
- const first = getFirstConsumedChar ( element , direction , flags ) ;
682
+ const first = getFirstConsumedCharImpl ( element , direction , flags , option ) ;
590
683
return {
591
684
char : firstConsumedCharConcat ( [ state . char , first ] , flags ) ,
592
685
contributors : [ ...state . contributors , element ] ,
@@ -614,6 +707,14 @@ export function getFirstCharAfterWithContributors(
614
707
direction : MatchingDirection ,
615
708
flags : ReadonlyFlags
616
709
) : WithContributors < FirstLookChar > {
617
- const { char, contributors } = getFirstConsumedCharAfterWithContributors ( afterThis , direction , flags ) ;
710
+ return getFirstCharAfterWithContributorsImpl ( afterThis , direction , flags , new ImplOptions ( ) ) ;
711
+ }
712
/**
 * Internal counterpart of `getFirstCharAfterWithContributors` that reuses the caller's
 * `ImplOptions` instead of creating a fresh one.
 *
 * It obtains the first consumed character after `afterThis` together with its contributors,
 * converts the character with `firstConsumedToLook`, and returns the contributors unchanged.
 */
function getFirstCharAfterWithContributorsImpl(
	afterThis: Element,
	direction: MatchingDirection,
	flags: ReadonlyFlags,
	options: ImplOptions
): WithContributors<FirstLookChar> {
	const { char, contributors } = getFirstConsumedCharAfterWithContributorsImpl(
		afterThis,
		direction,
		flags,
		options
	);
	return { char: firstConsumedToLook(char), contributors };
}
0 commit comments