diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index 5140093..bcd4156 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -13,21 +13,24 @@ import copy from itertools import groupby from collections import namedtuple -import six from six.moves import zip -from lxml.etree import XPathEvaluator, Comment +from lxml.etree import Comment, iterwalk from webstruct.sequence_encoding import IobEncoder -from webstruct.text_tokenizers import tokenize +from webstruct.text_tokenizers import tokenize, TextToken from webstruct.utils import ( replace_html_tags, kill_html_tags, - smart_join, ) -_HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail') +_HtmlToken = namedtuple('HtmlToken', ['index', + 'tokens', + 'elem', + 'is_tail', + 'position', + 'length']) class HtmlToken(_HtmlToken): @@ -41,6 +44,8 @@ class HtmlToken(_HtmlToken): * :attr:`elem` is the current html block (as lxml's Element) - most likely you want :attr:`parent` instead of it * :attr:`is_tail` flag indicates that token belongs to element tail + * :attr:`position` is logical position(in letters or codepoints) of token start in parent text + * :attr:`length` is logical length(in letters or codepoints) of token in parent text Computed properties: @@ -64,8 +69,10 @@ def root(self): return self.elem.getroottree() def __repr__(self): - return "HtmlToken(token=%r, parent=%r, index=%s)" % ( - self.token, self.parent, self.index + return ("HtmlToken(" + "token=%r, parent=%r, index=%s, position=%d, length=%d" + ")") % ( + self.token, self.parent, self.index, self.position, self.length ) @@ -85,7 +92,8 @@ class HtmlTokenizer(object): ---------- tagset : set, optional - A set of entity types to keep. If not passed, all entity types are kept. + A set of entity types to keep. + If not passed, all entity types are kept. Use this argument to discard some entity types from training data. sequence_encoder : object, optional Sequence encoder object. If not passed, @@ -142,7 +150,7 @@ def tokenize_single(self, tree): >>> tree = loader.loadbytes(b"
<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>
") >>> html_tokens, tags = html_tokenizer.tokenize_single(tree) >>> html_tokens - [HtmlToken(token='hello', parent=, index=0), HtmlToken...] + [HtmlToken(token='hello', parent=, index=0, ...), HtmlToken...] >>> tags ['O', 'B-PER', 'I-PER', 'B-PER', 'O'] >>> for tok, iob_tag in zip(html_tokens, tags): @@ -180,6 +188,8 @@ def detokenize_single(self, html_tokens, tags): Build annotated ``lxml.etree.ElementTree`` from ``html_tokens`` (a list of :class:`.HtmlToken` instances) and ``tags`` (a list of their tags). + **ATTENTION**: ``html_tokens`` should be tokenized from tree + without tags Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__`` text tokens (this is the format :mod:`webstruct.loaders` use). @@ -190,9 +200,7 @@ def detokenize_single(self, html_tokens, tags): if not html_tokens: return None - orig_tree = html_tokens[0].root - tree = copy.deepcopy(orig_tree) - xpatheval = XPathEvaluator(tree) + tree = html_tokens[0].root # find starts/ends of token groups token_groups = self.sequence_encoder.group(zip(html_tokens, tags)) @@ -206,30 +214,49 @@ def detokenize_single(self, html_tokens, tags): pos += n_tokens # mark starts/ends with special tokens - data = zip(html_tokens, tags, range(len(html_tokens))) - keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail) + data = [(s, True) for s in starts] + data.extend((s, False) for s in ends) + keyfunc = lambda rec: (id(html_tokens[rec[0]].elem), html_tokens[rec[0]].is_tail) + data.sort(key=keyfunc) - for (orig_elem, is_tail), g in groupby(data, keyfunc): + for (_, is_tail), g in groupby(data, keyfunc): g = list(g) - fix = False - tokens = g[0][0].tokens[:] - for token, tag, token_idx in g: - if token_idx in starts: - text = ' __START_%s__ %s' % (tag[2:], tokens[token.index]) - tokens[token.index] = text - fix = True - if token_idx in ends: - text = '%s __END_%s__ ' % (tokens[token.index], tag[2:]) - tokens[token.index] = text - fix = True - - if fix: - xpath = orig_tree.getpath(orig_elem) - elem = xpatheval(xpath)[0] - if is_tail: - elem.tail = smart_join(tokens) + g.sort(key=lambda t: (html_tokens[t[0]].position, not t[1])) + + if not g: + continue + + elem = html_tokens[g[0][0]].elem + + pos_in_source = 0 + source = elem.text + if is_tail: + source = elem.tail + + mods = list() + + for idx, is_starts in g: + token = html_tokens[idx] + tag = tags[idx] + mods.append(source[pos_in_source:token.position]) + pos_in_source = token.position + if is_starts: + patch = ' __START_%s__ ' % (tag[2:],) + mods.append(patch) else: - elem.text = smart_join(tokens) + end_in_source = pos_in_source + token.length + mods.append(source[pos_in_source:end_in_source]) + pos_in_source = pos_in_source + token.length + patch = ' __END_%s__ ' % (tag[2:],) + mods.append(patch) + + mods.append(source[pos_in_source:]) + modded = ''.join(mods) + + if is_tail: + elem.tail = modded + else: + elem.text = modded return tree @@ -245,18 +272,35 @@ def _process_tree(self, tree): return head_tokens, head_tags = self._tokenize_and_split(tree.text) + char_tokens = [t.chars for t in head_tokens] for index, (token, tag) in enumerate(zip(head_tokens, head_tags)): - yield HtmlToken(index, head_tokens, tree, False), tag + yield HtmlToken(index, + char_tokens, + tree, + False, + token.position, + token.length), tag for child in tree: # where is my precious "yield from"? 
for html_token, tag in self._process_tree(child): yield html_token, tag tail_tokens, tail_tags = self._tokenize_and_split(tree.tail) + char_tokens = [t.chars for t in tail_tokens] for index, (token, tag) in enumerate(zip(tail_tokens, tail_tags)): - yield HtmlToken(index, tail_tokens, tree, True), tag + yield HtmlToken(index, + char_tokens, + tree, + True, + token.position, + token.length), tag + + def cleanup_tree(self, tree): + cleaned = copy.deepcopy(tree) + for _, elem in iterwalk(cleaned): + self._cleanup_elem(elem) - self._cleanup_elem(tree) + return cleaned def _cleanup_elem(self, elem): """ Remove special tokens from elem """ @@ -266,16 +310,23 @@ def _cleanup_elem(self, elem): elem.tail = self._tag_re.sub("", elem.tail) def _tokenize_and_split(self, text): - input_tokens = self._limit_tags(self.text_tokenize_func(text or '')) - input_tokens = map(six.text_type, input_tokens) - return self.sequence_encoder.encode_split(input_tokens) + text = text or '' + input_tokens = [t for t in self.text_tokenize_func(text)] + input_tokens = self._limit_tags(input_tokens) + input_tokens = [TextToken(chars=t.chars, + position=t.position, + length=t.length) for t in input_tokens] + chains = self.sequence_encoder.encode(t.chars for t in input_tokens) + chains = self.sequence_encoder.from_indices(chains, input_tokens) + chains = [l for l in chains] + return self.sequence_encoder.split(chains) def _limit_tags(self, input_tokens): if self.tagset is None: return input_tokens proc = self.sequence_encoder.token_processor - token_classes = [proc.classify(tok) for tok in input_tokens] + token_classes = [proc.classify(tok.chars) for tok in input_tokens] return [ tok for (tok, (typ, value)) in zip(input_tokens, token_classes) if not (typ in {'start', 'end'} and value not in self.tagset) diff --git a/webstruct/html_tokenizer_benchmark.py b/webstruct/html_tokenizer_benchmark.py new file mode 100644 index 0000000..c7bd17a --- /dev/null +++ b/webstruct/html_tokenizer_benchmark.py @@ -0,0 +1,34 @@ +import os.path +import glob +import timeit +import functools + +import webstruct.webannotator +import webstruct.html_tokenizer + +def load_trees(tokenizer, trees): + for tree in trees: + tokenizer.tokenize_single(tree) + +def main(): + path = os.path.join(os.path.dirname(__file__) , + ".." 
, + "webstruct_data", + "corpus/business_pages/wa/*.html") + + paths = sorted(glob.glob(path)) + + with open(paths[0], 'rb') as sample_reader: + colors = webstruct.webannotator.EntityColors.from_htmlbytes(sample_reader.read()) + entities = [typ for typ in colors] + + loader = webstruct.WebAnnotatorLoader(known_entities=entities) + + trees = [loader.load(p) for p in paths] + tokenizer = webstruct.html_tokenizer.HtmlTokenizer() + print(timeit.timeit(functools.partial(load_trees, tokenizer, trees), + setup='gc.enable()', + number=3)) + +if __name__ == "__main__": + main() diff --git a/webstruct/sequence_encoding.py b/webstruct/sequence_encoding.py index ba212ef..5b55752 100644 --- a/webstruct/sequence_encoding.py +++ b/webstruct/sequence_encoding.py @@ -11,13 +11,14 @@ class IobEncoder(object): >>> iob_encoder = IobEncoder() >>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"] - >>> iob_encoder.encode(input_tokens) + >>> def encode(encoder, tokens): return [p for p in IobEncoder.from_indices(encoder.encode(tokens), tokens)] + >>> encode(iob_encoder, input_tokens) [('John', 'B-PER'), ('said', 'O')] - Get the result in another format using ``encode_split`` method:: >>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"] - >>> tokens, tags = iob_encoder.encode_split(input_tokens) + >>> tokens = encode(iob_encoder, input_tokens) + >>> tokens, tags = iob_encoder.split(tokens) >>> tokens, tags (['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O']) @@ -25,9 +26,11 @@ class IobEncoder(object): stream and continue the encoding later:: >>> iob_encoder = IobEncoder() - >>> iob_encoder.encode(["__START_PER__", "John"]) + >>> input_tokens_partial = ["__START_PER__", "John"] + >>> encode(iob_encoder, input_tokens_partial) [('John', 'B-PER')] - >>> iob_encoder.encode(["Mayer", "__END_PER__", "said"]) + >>> input_tokens_partial = ["Mayer", "__END_PER__", "said"] + >>> encode(iob_encoder, input_tokens_partial) [('Mayer', 'I-PER'), ('said', 'O')] To reset internal state, use ``reset method``:: @@ -36,7 +39,7 @@ class IobEncoder(object): Group results to entities:: - >>> iob_encoder.group(iob_encoder.encode(input_tokens)) + >>> iob_encoder.group(encode(iob_encoder, input_tokens)) [(['hello'], 'O'), (['John', 'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')] Input token stream is processed by ``InputTokenProcessor()`` by default; @@ -53,7 +56,7 @@ def reset(self): self.tag = 'O' def iter_encode(self, input_tokens): - for token in input_tokens: + for number, token in enumerate(input_tokens): token_type, value = self.token_processor.classify(token) if token_type == 'start': @@ -68,7 +71,7 @@ def iter_encode(self, input_tokens): self.tag = "O" elif token_type == 'token': - yield token, self.tag + yield number, self.tag if self.tag[0] == 'B': self.tag = "I" + self.tag[1:] @@ -81,13 +84,14 @@ def iter_encode(self, input_tokens): def encode(self, input_tokens): return list(self.iter_encode(input_tokens)) - def encode_split(self, input_tokens): - """ The same as ``encode``, but returns ``(tokens, tags)`` tuple """ - res = self.encode(input_tokens) - if not res: - return (), () - tokens, tags = zip(*res) - return list(tokens), list(tags) + def split(self, tokens): + """ split ``[(token, tag)]`` to ``([token], [tags])`` tuple """ + return [t[0] for t in tokens], [t[1] for t in tokens] + + @classmethod + def from_indices(cls, indices, input_tokens): + for idx, tag in indices: + yield input_tokens[idx], tag 
@classmethod def group(cls, data, strict=False): @@ -186,4 +190,3 @@ def classify(self, token): # regular token return 'token', token - diff --git a/webstruct/tests/test_html_tokenizer.py b/webstruct/tests/test_html_tokenizer.py index f386606..44420de 100644 --- a/webstruct/tests/test_html_tokenizer.py +++ b/webstruct/tests/test_html_tokenizer.py @@ -69,8 +69,6 @@ def assertTokenizationWorks(self, tree): [u'B-ORG', u'I-ORG', 'O', 'O', 'O', 'O', u'B-CITY'] ) - tree = html_tokens[0].root - self.assertNotIn(b'__', tostring(tree)) def test_tokenize_single(self): self.assertTokenizationWorks(self._load()) @@ -84,7 +82,7 @@ def test_detokenize_single(self): tokenizer = HtmlTokenizer() html_tokens, tags = tokenizer.tokenize_single(src_tree) - new_tree = html_tokens[0].root + new_tree = tokenizer.cleanup_tree(src_tree) self.assertIn(b'__START_ORG__', tostring(src_tree)) self.assertNotIn(b'__START_ORG__', tostring(new_tree)) @@ -93,6 +91,7 @@ def test_detokenize_single(self): html_document_fromstring(UNANNOTATED_HTML) ) + html_tokens, _ = tokenizer.tokenize_single(new_tree) detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) self.assertIn(b'__START_ORG__', tostring(detokenized_tree)) @@ -137,3 +136,34 @@ def test_tokenize_scripts_and_styles(self): # and restores the tree if needed detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) self.assertHtmlTreeEqual(tree, detokenized_tree) + + def test_detokenize_preserve_commas(self): + annotated_html = b""" + + __START_ORG__ hello __END_ORG__ a, b world + + """ + + annotated_tree = HtmlLoader().loadbytes(annotated_html) + tokenizer = HtmlTokenizer() + html_tokens, tags = tokenizer.tokenize_single(annotated_tree) + clean_tree = tokenizer.cleanup_tree(annotated_tree) + html_tokens, _ = tokenizer.tokenize_single(clean_tree) + detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) + self.assertHtmlTreeEqual(annotated_tree, detokenized_tree) + + def test_detokenize_handle_unicode(self): + annotated_html = bytes(u""" + + Δ __START_ORG__ hello __END_ORG__ a, b world + + """.encode('utf-8')) + + + annotated_tree = HtmlLoader().loadbytes(annotated_html) + tokenizer = HtmlTokenizer() + html_tokens, tags = tokenizer.tokenize_single(annotated_tree) + clean_tree = tokenizer.cleanup_tree(annotated_tree) + html_tokens, _ = tokenizer.tokenize_single(clean_tree) + detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) + self.assertHtmlTreeEqual(annotated_tree, detokenized_tree) diff --git a/webstruct/tests/test_text_tokenizer.py b/webstruct/tests/test_text_tokenizer.py new file mode 100644 index 0000000..7427bfc --- /dev/null +++ b/webstruct/tests/test_text_tokenizer.py @@ -0,0 +1,48 @@ +import unittest +import pytest + +from webstruct.text_tokenizers import TextToken, WordTokenizer + +class TestTokenizerTest(unittest.TestCase): + def do_tokenize(self, text, result): + self.assertEqual(result, WordTokenizer().segment_words(text)) + + @pytest.mark.xfail + def test_phone(self): + return self.do_tokenize( + "Phone:855-349-1914", + [TextToken(chars='Phone', position=0, length=5)] + [TextToken(chars=':', position=5, length=1)] + [TextToken(chars='855-349-1914', position=6, length=12)] + ) + + @pytest.mark.xfail + def test_hyphen_mid(self): + return self.do_tokenize( + "Powai Campus, Mumbai-400077", + [TextToken(chars='Powai', position=0, length=5), + TextToken(chars='Campus', position=6, length=6), + TextToken(chars=',', position=12, length=1), + TextToken(chars='Mumbai', position=14, length=6), + TextToken(chars='-', 
position=20, length=1), + TextToken(chars='400077', position=21, length=6)] + ) + + @pytest.mark.xfail + def test_hyphen_end(self): + return self.do_tokenize( + "Saudi Arabia-", + [TextToken(chars='Saudi', position=0, length=5), + TextToken(chars='Arabia', position=6, length=6), + TextToken(chars='-', position=12, length=1)] + ) + + @pytest.mark.xfail + def test_hyphen_end(self): + return self.do_tokenize( + "1 5858/ 1800", + [TextToken(chars='1', position=0, length=1), + TextToken(chars='5858', position=2, length=4), + TextToken(chars='/', position=6, length=1), + TextToken(chars='1800', position=8, length=4)] + ) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 6cc166a..fd1c72c 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -1,61 +1,121 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, unicode_literals import re +import collections + +TextToken = collections.namedtuple('TextToken', 'chars, position, length') class WordTokenizer(object): r"""This tokenizer is copy-pasted version of TreebankWordTokenizer that doesn't split on @ and ':' symbols and doesn't split contractions:: - >>> from nltk.tokenize.treebank import TreebankWordTokenizer # doctest: +SKIP >>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com''' - >>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com'] - >>> WordTokenizer().tokenize(s) - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@gmail.com'] + >>> WordTokenizer().segment_words(s) + [TextToken(chars='Good', position=0, length=4), + TextToken(chars='muffins', position=5, length=7), + TextToken(chars='cost', position=13, length=4), + TextToken(chars='$', position=18, length=1), + TextToken(chars='3.88', position=19, length=4), + TextToken(chars='in', position=24, length=2), + TextToken(chars='New', position=27, length=3), + TextToken(chars='York.', position=31, length=5), + TextToken(chars='Email:', position=37, length=6), + TextToken(chars='muffins@gmail.com', position=44, length=17)] >>> s = '''Shelbourne Road,''' - >>> WordTokenizer().tokenize(s) - ['Shelbourne', 'Road', ','] + >>> WordTokenizer().segment_words(s) + [TextToken(chars='Shelbourne', position=0, length=10), + TextToken(chars='Road', position=11, length=4), + TextToken(chars=',', position=15, length=1)] >>> s = '''population of 100,000''' - >>> WordTokenizer().tokenize(s) - ['population', 'of', '100,000'] + >>> WordTokenizer().segment_words(s) + [TextToken(chars='population', position=0, length=10), + TextToken(chars='of', position=11, length=2), + TextToken(chars='100,000', position=14, length=7)] >>> s = '''Hello|World''' - >>> WordTokenizer().tokenize(s) - ['Hello', '|', 'World'] + >>> WordTokenizer().segment_words(s) + [TextToken(chars='Hello', position=0, length=5), + TextToken(chars='|', position=5, length=1), + TextToken(chars='World', position=6, length=5)] >>> s2 = '"We beat some pretty good teams to get here," Slocum said.' 
- >>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE - ['``', 'We', 'beat', 'some', 'pretty', 'good', - 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.'] + >>> WordTokenizer().segment_words(s2) # doctest: +NORMALIZE_WHITESPACE + [TextToken(chars='``', position=0, length=1), + TextToken(chars='We', position=1, length=2), + TextToken(chars='beat', position=4, length=4), + TextToken(chars='some', position=9, length=4), + TextToken(chars='pretty', position=14, length=6), + TextToken(chars='good', position=21, length=4), + TextToken(chars='teams', position=26, length=5), + TextToken(chars='to', position=32, length=2), + TextToken(chars='get', position=35, length=3), + TextToken(chars='here', position=39, length=4), + TextToken(chars=',', position=43, length=1), + TextToken(chars="''", position=44, length=1), + TextToken(chars='Slocum', position=46, length=6), + TextToken(chars='said', position=53, length=4), + TextToken(chars='.', position=57, length=1)] >>> s3 = '''Well, we couldn't have this predictable, ... cliche-ridden, \"Touched by an ... Angel\" (a show creator John Masius ... worked on) wanna-be if she didn't.''' - >>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE - ['Well', ',', 'we', "couldn't", 'have', 'this', 'predictable', - ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', - 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', - 'worked', 'on', ')', 'wanna-be', 'if', 'she', "didn't", '.'] + >>> WordTokenizer().segment_words(s3) # doctest: +NORMALIZE_WHITESPACE + [TextToken(chars='Well', position=0, length=4), + TextToken(chars=',', position=4, length=1), + TextToken(chars='we', position=6, length=2), + TextToken(chars="couldn't", position=9, length=8), + TextToken(chars='have', position=18, length=4), + TextToken(chars='this', position=23, length=4), + TextToken(chars='predictable', position=28, length=11), + TextToken(chars=',', position=39, length=1), + TextToken(chars='cliche-ridden', position=41, length=13), + TextToken(chars=',', position=54, length=1), + TextToken(chars='``', position=56, length=1), + TextToken(chars='Touched', position=57, length=7), + TextToken(chars='by', position=65, length=2), + TextToken(chars='an', position=68, length=2), + TextToken(chars='Angel', position=71, length=5), + TextToken(chars="''", position=76, length=1), + TextToken(chars='(', position=78, length=1), + TextToken(chars='a', position=79, length=1), + TextToken(chars='show', position=81, length=4), + TextToken(chars='creator', position=86, length=7), + TextToken(chars='John', position=94, length=4), + TextToken(chars='Masius', position=99, length=6), + TextToken(chars='worked', position=106, length=6), + TextToken(chars='on', position=113, length=2), + TextToken(chars=')', position=115, length=1), + TextToken(chars='wanna-be', position=117, length=8), + TextToken(chars='if', position=126, length=2), + TextToken(chars='she', position=129, length=3), + TextToken(chars="didn't", position=133, length=6), + TextToken(chars='.', position=139, length=1)] + + >>> WordTokenizer().segment_words('"') + [TextToken(chars='``', position=0, length=1)] + + >>> WordTokenizer().segment_words('" a') + [TextToken(chars='``', position=0, length=1), + TextToken(chars='a', position=2, length=1)] Some issues: - >>> WordTokenizer().tokenize("Phone:855-349-1914") # doctest: +SKIP - ['Phone', ':', '855-349-1914'] - - >>> WordTokenizer().tokenize("Copyright © 2014 Foo Bar and Buzz Spam. 
All Rights Reserved.") # doctest: +SKIP - ['Copyright', '\xc2\xa9', '2014', 'Wall', 'Decor', 'and', 'Home', 'Accents', '.', 'All', 'Rights', 'Reserved', '.'] - - >>> WordTokenizer().tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP - ['Powai', 'Campus', ',', 'Mumbai", "-", "400077'] - - >>> WordTokenizer().tokenize("1 5858/ 1800") # doctest: +SKIP - ['1', '5858', '/', '1800'] - - >>> WordTokenizer().tokenize("Saudi Arabia-") # doctest: +SKIP - ['Saudi', 'Arabia', '-'] + >>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") + [TextToken(chars='Copyright', position=0, length=9), + TextToken(chars=u'\xa9', position=10, length=1), + TextToken(chars='2014', position=12, length=4), + TextToken(chars='Foo', position=17, length=3), + TextToken(chars='Bar', position=21, length=3), + TextToken(chars='and', position=25, length=3), + TextToken(chars='Buzz', position=29, length=4), + TextToken(chars='Spam.', position=34, length=5), + TextToken(chars='All', position=40, length=3), + TextToken(chars='Rights', position=44, length=6), + TextToken(chars='Reserved', position=51, length=8), + TextToken(chars='.', position=59, length=1)] """ @@ -76,41 +136,64 @@ class WordTokenizer(object): open_quotes = re.compile(r'(^|[\s(\[{<])"') - def _tokenize(self, text): + def _segment_words(self, text): # this one cannot be placed in the loop because it requires # position check (beginning of the string) or previous char value - text = self.open_quotes.sub(r'\1``', text) + quote = self.open_quotes.search(text) + if quote is not None: + end = quote.end() - 1 + for t in self._segment_words(text[:end]): + yield t + yield TextToken(chars='``', position=end, length=1) + shift = end + 1 + for t in self._segment_words(text[shift:]): + yield TextToken(chars=t.chars, + position=t.position + shift, + length=t.length) + return i = 0 token_start = 0 while 1: if i >= len(text): - yield text[token_start:] + yield TextToken(chars=text[token_start:], + position=token_start, + length=len(text) - token_start) break shift = 1 partial_text = text[i:] for regex, token in self.rules: match = regex.match(partial_text) if match: - yield text[token_start:i] + yield TextToken(chars=text[token_start:i], + position=token_start, + length=i - token_start) shift = match.end() - match.start() token_start = i + shift if token is None: - yield match.group() + yield TextToken(chars=match.group(), + position=i + match.start(), + length=shift) else: - yield token + yield TextToken(chars=token, + position=i + match.start(), + length=shift) break i += shift + def segment_words(self, text): + return [t for t in self._segment_words(text) if t.chars] + def tokenize(self, text): - return [t for t in self._tokenize(text) if t] + return [t.chars for t in self.segment_words(text)] class DefaultTokenizer(WordTokenizer): - def tokenize(self, text): - tokens = super(DefaultTokenizer, self).tokenize(text) + def segment_words(self, text): + tokens = super(DefaultTokenizer, self).segment_words(text) # remove standalone commas and semicolons - # as they broke tag sets, e.g. PERSON->FUNCTION in case "PERSON, FUNCTION" + # as they broke tag sets, + # e.g. PERSON->FUNCTION in case "PERSON, FUNCTION" # but it has negative consequences, e.g. 
# etalon: [PER-B, PER-I, FUNC-B] @@ -118,7 +201,7 @@ def tokenize(self, text): # because we removed punctuation # FIXME: remove as token, but save as feature left/right_punct:"," - return [t for t in tokens if t not in {',', ';'}] + return [t for t in tokens if t.chars not in {',', ';'}] -tokenize = DefaultTokenizer().tokenize +tokenize = DefaultTokenizer().segment_words
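
Usage sketch for the new encoder/tokenizer API: encode() now yields (index, tag) pairs, from_indices() maps those indices back onto the original token stream, split() separates tokens from tags, and WordTokenizer.segment_words() returns TextToken objects carrying source offsets. A minimal sketch of how the pieces compose, mirroring _tokenize_and_split() above; the sample text and variable names are illustrative only:

    from webstruct.text_tokenizers import WordTokenizer
    from webstruct.sequence_encoding import IobEncoder

    text = "__START_PER__ John Doe __END_PER__ said hello"

    # segment_words() keeps character offsets: each TextToken carries
    # chars, position and length relative to the source string.
    tokens = WordTokenizer().segment_words(text)

    encoder = IobEncoder()
    # encode() yields (index, tag) pairs; the index points into the
    # input stream, markers included.
    indices = encoder.encode(t.chars for t in tokens)
    # from_indices() resolves the indices back to the TextToken objects,
    # dropping the __START_PER__/__END_PER__ markers along the way.
    pairs = list(IobEncoder.from_indices(indices, tokens))
    token_objs, tags = encoder.split(pairs)
    # expected: tags == ['B-PER', 'I-PER', 'O', 'O'], and each
    # token_objs[i].position/.length still addresses `text`.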
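
On the HtmlTokenizer side, tokenize_single() no longer strips annotation markers from its input tree; that job moves to cleanup_tree(), which returns a cleaned copy, and detokenize_single() now expects html_tokens produced from such a cleaned tree and writes the markers back using the recorded positions. A sketch of the round trip exercised by the updated tests, assuming the HtmlLoader/HtmlTokenizer imports used there; the HTML snippet is illustrative only:

    from webstruct.loaders import HtmlLoader
    from webstruct.html_tokenizer import HtmlTokenizer

    annotated_html = b"<html><body>__START_ORG__ hello __END_ORG__ a, b world</body></html>"
    tokenizer = HtmlTokenizer()

    # Tags come from the annotated tree, which is now left untouched.
    annotated_tree = HtmlLoader().loadbytes(annotated_html)
    _, tags = tokenizer.tokenize_single(annotated_tree)

    # The tokens handed to detokenize_single() must come from a tree
    # without __START_*__/__END_*__ markers.
    clean_tree = tokenizer.cleanup_tree(annotated_tree)
    html_tokens, _ = tokenizer.tokenize_single(clean_tree)

    # Re-inserts ' __START_ORG__ ' / ' __END_ORG__ ' around the tagged
    # tokens in clean_tree itself, preserving surrounding text (commas,
    # non-ASCII) via each token's position and length.
    detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)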