@@ -18,19 +18,20 @@ import scala.util.parsing.combinator.RegexParsers
18
18
* completion (supporting fuzzy matching)
19
19
*/
20
20
trait TermsParsers extends RegexParsers with RegexCompletionSupport with TermsParsingHelpers {
21
/** Default cap on the number of completions returned by the terms parsers; public so callers can refer to it. */
val DefaultMaxCompletionsCount: Int = 15
// Default minimum score a fuzzy candidate must reach; compared against the similarity measure multiplied by 100 and rounded
private val DefaultSimilarityThreshold: Int = 20
// The fuzzy parser scores up to (max completions * this ratio) candidates before keeping the best ones
private val CompletionCandidatesMultiplierRatio: Int = 3
24
24
25
25
/**
 * Builds a parser that accepts any one of the specified terms.
 * Matching is backed by a trie data structure that is built once, when the parser is created.
 * When parsing fails, completions are the terms found below the matching trie node (if any),
 * in alphabetical order and limited to `maxCompletionsCount` entries.
 * @param terms the list of terms to build the parser for
 * @param maxCompletionsCount maximum number of completions returned by the parser
 * @return parser instance
 */
def oneOfTerms(terms: Seq[String], maxCompletionsCount: Int = DefaultMaxCompletionsCount): Parser[String] =
  TermsParser(terms, maxCompletionsCount)
35
36
36
37
/**
@@ -59,17 +60,19 @@ trait TermsParsers extends RegexParsers with RegexCompletionSupport with TermsPa
59
60
similarityMeasure : (String , String ) => Double = diceSorensenSimilarity,
60
61
similarityThreshold : Int = DefaultSimilarityThreshold ,
61
62
maxCompletionsCount : Int = DefaultMaxCompletionsCount ): Parser [String ] = {
62
- new FuzzyParser (terms, similarityMeasure, similarityThreshold, maxCompletionsCount)
63
+ FuzzyParser (terms, similarityMeasure, similarityThreshold, maxCompletionsCount)
63
64
}
64
65
65
- sealed private class TermsParser (terms : Seq [String ]) extends Parser [String ] {
66
- protected def originalTerms : Seq [String ] = terms.map(_.trim()).filter(_.nonEmpty)
67
- protected def normalizedTerms : Seq [String ] = originalTerms.map(_.toLowerCase)
68
-
69
- protected val trie : PrefixMap [String ] = PrefixMap (normalizedTerms.zip(originalTerms).map {
70
- case (normalizedTerm, originalTerm) => (normalizedTerm, originalTerm)
71
- }: _* )
66
/** Factory for `TermsParser`: builds the lookup trie once from the supplied terms. */
private object TermsParser {
  /**
   * @param terms raw terms; each one is trimmed and blank entries are dropped
   * @param maxCompletionsCount maximum number of completions returned by the parser
   * @return a parser backed by a trie that maps each lowercased term to its trimmed original
   */
  def apply(terms: Seq[String], maxCompletionsCount: Int): TermsParser = {
    // Clean the terms once and derive the keys from that same sequence: zipping keys computed
    // from the raw list with the filtered originals would shift the pairs whenever a term is blank
    val originals = trimmedNonEmptyTerms(terms)
    val trie = PrefixMap(normalizedTerms(originals).zip(originals): _*)
    new TermsParser(trie, maxCompletionsCount)
  }
}
72
74
75
+ sealed private class TermsParser (trie : PrefixMap [String ], maxCompletionsCount : Int ) extends Parser [String ] {
73
76
override def apply (in : Input ): ParseResult [String ] = {
74
77
val start = dropAnyWhiteSpace(in)
75
78
val (terms, finalPosition) =
@@ -90,57 +93,75 @@ trait TermsParsers extends RegexParsers with RegexCompletionSupport with TermsPa
90
93
case Success (_, _) => Completions .empty
91
94
case NoSuccess (_, _) =>
92
95
val start = dropAnyWhiteSpace(in)
93
- val terms = findAllTermsWithPrefix(start, start.offset, trie)
94
- Completions (in.pos, CompletionSet ( terms.map(t => Completion (t))) )
96
+ val terms = alphabeticalCompletions( findAllTermsWithPrefix(start, start.offset, trie), maxCompletionsCount )
97
+ Completions (in.pos, terms)
95
98
}
96
99
}
97
100
}
98
101
99
- sealed private class FuzzyParser (terms : Seq [String ], similarityMeasure : (String , String ) => Double , similarityThreshold : Int , maxCompletionsCount : Int )
100
- extends TermsParser (terms) {
102
// Trims every term and drops the ones that end up empty
private def trimmedNonEmptyTerms(terms: Seq[String]): Seq[String] = terms.map(_.trim()).filter(_.nonEmpty)
// Lowercased form of the trimmed, non-empty terms. Cleaning first keeps the result aligned
// element-for-element with trimmedNonEmptyTerms(terms), which callers zip it with.
private def normalizedTerms(terms: Seq[String]): Seq[String] = trimmedNonEmptyTerms(terms).map(_.toLowerCase)
104
+
105
/**
 * Quicksort over a `Stream`, using the stream's head as pivot.
 * Elements strictly smaller than the pivot are sorted in front of it; all others (including
 * elements equal to the pivot) are sorted after it. The part after the pivot is a lazy stream
 * tail, so it is only evaluated when a consumer walks that far.
 * @param xs the stream to sort
 * @param o  ordering used to compare elements
 * @return a stream with the elements of `xs` in ascending order
 */
private def lazyQuicksort[A](xs: Stream[A])(implicit o: Ordering[A]): Stream[A] =
  xs match {
    case pivot #:: rest =>
      val (below, notBelow) = rest.partition(o.lt(_, pivot))
      lazyQuicksort(below) #::: pivot #:: lazyQuicksort(notBelow)
    case _ => xs
  }
101
113
102
- val ngramMap : PrefixMap [Array [String ]] = {
114
/**
 * Turns terms into a completion set holding the alphabetically first `maxCompletionsCount` of them.
 * @param terms candidate terms, in any order
 * @param maxCompletionsCount maximum number of completions to keep
 * @return the retained terms as ranked completions
 */
private def alphabeticalCompletions(terms: Iterable[String], maxCompletionsCount: Int): CompletionSet = {
  val firstTerms = lazyQuicksort(terms.toStream).take(maxCompletionsCount)
  // Reversed before indexing so that the alphabetically earliest term receives the largest rank value
  val ranked = firstTerms.reverse.zipWithIndex.map { case (term, rank) => Completion(term, rank) }
  CompletionSet(ranked.toSet)
}
124
+
125
/** Factory for `FuzzyParser`: precomputes every lookup structure once from the supplied terms. */
private object FuzzyParser {
  /**
   * @param terms raw terms; each one is trimmed and blank entries are dropped
   * @param similarityMeasure function scoring how close an input is to a candidate term
   * @param similarityThreshold minimum score (similarity multiplied by 100, rounded) a candidate must reach
   * @param maxCompletionsCount maximum number of completions returned by the parser
   * @return a fuzzy parser with its trigram map, trie and empty-input completions already built
   */
  def apply(terms: Seq[String], similarityMeasure: (String, String) => Double, similarityThreshold: Int, maxCompletionsCount: Int): FuzzyParser = {
    val originals = trimmedNonEmptyTerms(terms)
    // Normalize the cleaned terms, not the raw ones: both sequences are zipped below and must
    // line up element-for-element even when the input contains blank or padded terms
    val normalized = normalizedTerms(originals)
    val completionsWhenInputEmpty = alphabeticalCompletions(originals, maxCompletionsCount)
    // One (trigram -> original term) pair for every trigram of every word of every term
    val trigramTermPairs =
      normalized.zip(originals).par.flatMap {
        case (normalizedTerm, originalTerm) =>
          tokenizeWords(normalizedTerm).flatMap(trigramsWithAffixing).map(trigram => trigram -> originalTerm)
      }
    val ngramMap = PrefixMap(trigramTermPairs.groupBy(_._1).mapValues(_.map(_._2).toArray).toSeq.seq: _*)
    val trie = PrefixMap(normalized.zip(originals): _*)
    new FuzzyParser(completionsWhenInputEmpty, ngramMap, trie, similarityMeasure, similarityThreshold, maxCompletionsCount)
  }
}
142
+
143
+ sealed private class FuzzyParser private (completionsWhenInputEmpty : CompletionSet ,
144
+ ngramMap : PrefixMap [Array [String ]],
145
+ trie : PrefixMap [String ],
146
+ similarityMeasure : (String , String ) => Double ,
147
+ similarityThreshold : Int ,
148
+ maxCompletionsCount : Int )
149
+ extends TermsParser (trie, maxCompletionsCount) {
110
150
111
151
/**
 * Completions for the given input: none when the input already parses; the precomputed
 * completion set when nothing but whitespace remains; fuzzy matches otherwise.
 */
override def completions(in: Input): Completions =
  apply(in) match {
    case Success(_, _) => Completions.empty
    case NoSuccess(_, _) =>
      val start = dropAnyWhiteSpace(in)
      if (start.atEnd) Completions(in.pos, completionsWhenInputEmpty)
      else fuzzyCompletions(start)
  }
124
163
125
- val maxCandidatesCount : Int = maxCompletionsCount * CompletionCandidatesMultiplierRatio
126
-
127
- private def alphabeticalCompletions (in : Input ): Completions = {
128
- val matches = ngramMap
129
- .withPrefix(remainder(in).toLowerCase)
130
- .values
131
- .flatten
132
- .toList
133
- .distinct
134
- .sorted
135
- .take(maxCompletionsCount)
136
- if (matches.nonEmpty) {
137
- Completions (in.pos, CompletionSet (matches.reverse.zipWithIndex.map {
138
- case (t, rank) => Completion (t, rank)
139
- }.toSet))
140
- } else {
141
- Completions .empty
142
- }
143
- }
164
+ private val maxCandidatesCount : Int = maxCompletionsCount * CompletionCandidatesMultiplierRatio
144
165
145
166
private def findAndScoreNgramMatches (ngrams : Seq [String ]): Map [String , Int ] = {
146
167
def iter (ngram : String , remainingNgrams : Seq [String ], termsFromPreviousIter : Set [String ], acc : Map [String , Int ]): Map [String , Int ] = {
@@ -163,15 +184,15 @@ trait TermsParsers extends RegexParsers with RegexCompletionSupport with TermsPa
163
184
private def fuzzyCompletions (in : Input ): Completions = {
164
185
val incompleteTerm = remainder(in)
165
186
val candidates = findCandidateMatches(incompleteTerm)
166
- val rankedCompletions = candidates
167
- .map {
168
- case (candidateTerm, _) =>
169
- (candidateTerm, math.round(similarityMeasure(incompleteTerm, candidateTerm) * 100.0 ).toInt)
170
- }
171
- .filter { case (_, similarity) => similarity >= similarityThreshold }
172
- .sortBy {
173
- case (term, similarity) => (- similarity, term)
174
- }
187
+ val rankedCompletions = lazyQuicksort(
188
+ candidates.toStream
189
+ .map {
190
+ case (candidateTerm, _) =>
191
+ (candidateTerm, math.round(similarityMeasure(incompleteTerm, candidateTerm) * 100.0 ).toInt)
192
+ }
193
+ .filter { case (_, similarity) => similarity >= similarityThreshold })( Ordering .by( {
194
+ case (term, similarity) => (- similarity, term)
195
+ }))
175
196
.take(maxCompletionsCount)
176
197
if (rankedCompletions.nonEmpty) {
177
198
Completions (in.pos, CompletionSet (rankedCompletions.map {
@@ -185,7 +206,7 @@ trait TermsParsers extends RegexParsers with RegexCompletionSupport with TermsPa
185
206
/**
 * Looks up the terms sharing trigrams with the (lowercased) incomplete term and returns at most
 * `maxCandidatesCount` of them together with their match score, highest score first.
 */
private def findCandidateMatches(incompleteTerm: String): Seq[(String, Int)] = {
  val scoredTerms: Map[String, Int] = findAndScoreNgramMatches(trigramsWithAffixing(incompleteTerm.toLowerCase))
  val ascending = scoredTerms.toSeq.sortBy { case (_, score) => score }
  // Reverse through a view so that only the candidates actually taken get materialized
  ascending.view.reverse.take(maxCandidatesCount)
}
190
211
}
191
212
0 commit comments