From 36d56f2c4e659658acb3f01664223b02e9e0125a Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Thu, 21 Sep 2017 16:04:47 +0300 Subject: [PATCH 01/31] text tokenizer return postions of token --- webstruct/html_tokenizer.py | 2 +- webstruct/text_tokenizers.py | 23 +++++++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index 5140093..1facd66 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -266,7 +266,7 @@ def _cleanup_elem(self, elem): elem.tail = self._tag_re.sub("", elem.tail) def _tokenize_and_split(self, text): - input_tokens = self._limit_tags(self.text_tokenize_func(text or '')) + input_tokens = self._limit_tags(t.token for t in self.text_tokenize_func(text or '')) input_tokens = map(six.text_type, input_tokens) return self.sequence_encoder.encode_split(input_tokens) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 6cc166a..b7086ae 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -1,6 +1,9 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import, unicode_literals import re +import collections + +TextToken = collections.namedtuple('TextToken', 'token, position, length') class WordTokenizer(object): @@ -85,25 +88,33 @@ def _tokenize(self, text): token_start = 0 while 1: if i >= len(text): - yield text[token_start:] + yield TextToken(token=text[token_start:], + position=token_start, + length=len(text) - token_start) break shift = 1 partial_text = text[i:] for regex, token in self.rules: match = regex.match(partial_text) if match: - yield text[token_start:i] + yield TextToken(token=text[token_start:i], + position=token_start, + length=i - token_start) shift = match.end() - match.start() token_start = i + shift if token is None: - yield match.group() + yield TextToken(token=match.group(), + position=match.start(), + length=shift) else: - yield token + yield TextToken(token=token, + position=match.start(), + length=shift) break i += shift def tokenize(self, text): - return [t for t in self._tokenize(text) if t] + return [t for t in self._tokenize(text) if t.token] class DefaultTokenizer(WordTokenizer): @@ -118,7 +129,7 @@ def tokenize(self, text): # because we removed punctuation # FIXME: remove as token, but save as feature left/right_punct:"," - return [t for t in tokens if t not in {',', ';'}] + return [t for t in tokens if t.token not in {',', ';'}] tokenize = DefaultTokenizer().tokenize From 2d4d2ef1777b0de401895995972b8fdc56816d5d Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Thu, 21 Sep 2017 16:16:11 +0300 Subject: [PATCH 02/31] update tests --- webstruct/text_tokenizers.py | 74 +++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 10 deletions(-) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index b7086ae..42d178d 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -15,33 +15,87 @@ class WordTokenizer(object): >>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com'] >>> WordTokenizer().tokenize(s) - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@gmail.com'] + [TextToken(token='Good', position=0, length=4), + TextToken(token='muffins', position=5, length=7), + TextToken(token='cost', position=13, length=4), + TextToken(token='$', position=0, length=1), + 
TextToken(token='3.88', position=19, length=4), + TextToken(token='in', position=24, length=2), + TextToken(token='New', position=27, length=3), + TextToken(token='York.', position=31, length=5), + TextToken(token='Email:', position=37, length=6), + TextToken(token='muffins@gmail.com', position=44, length=17)] >>> s = '''Shelbourne Road,''' >>> WordTokenizer().tokenize(s) - ['Shelbourne', 'Road', ','] + [TextToken(token='Shelbourne', position=0, length=10), + TextToken(token='Road', position=11, length=4), + TextToken(token=',', position=0, length=1)] >>> s = '''population of 100,000''' >>> WordTokenizer().tokenize(s) - ['population', 'of', '100,000'] + [TextToken(token='population', position=0, length=10), + TextToken(token='of', position=11, length=2), + TextToken(token='100,000', position=14, length=7)] >>> s = '''Hello|World''' >>> WordTokenizer().tokenize(s) - ['Hello', '|', 'World'] + [TextToken(token='Hello', position=0, length=5), + TextToken(token='|', position=0, length=1), + TextToken(token='World', position=6, length=5)] >>> s2 = '"We beat some pretty good teams to get here," Slocum said.' >>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE - ['``', 'We', 'beat', 'some', 'pretty', 'good', - 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.'] + [TextToken(token='``', position=0, length=2), + TextToken(token='We', position=2, length=2), + TextToken(token='beat', position=5, length=4), + TextToken(token='some', position=10, length=4), + TextToken(token='pretty', position=15, length=6), + TextToken(token='good', position=22, length=4), + TextToken(token='teams', position=27, length=5), + TextToken(token='to', position=33, length=2), + TextToken(token='get', position=36, length=3), + TextToken(token='here', position=40, length=4), + TextToken(token=',', position=0, length=1), + TextToken(token="''", position=0, length=1), + TextToken(token='Slocum', position=47, length=6), + TextToken(token='said', position=54, length=4), + TextToken(token='.', position=0, length=1)] >>> s3 = '''Well, we couldn't have this predictable, ... cliche-ridden, \"Touched by an ... Angel\" (a show creator John Masius ... 
worked on) wanna-be if she didn't.''' >>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE - ['Well', ',', 'we', "couldn't", 'have', 'this', 'predictable', - ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', - 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', - 'worked', 'on', ')', 'wanna-be', 'if', 'she', "didn't", '.'] + [TextToken(token='Well', position=0, length=4), + TextToken(token=',', position=0, length=1), + TextToken(token='we', position=6, length=2), + TextToken(token="couldn't", position=9, length=8), + TextToken(token='have', position=18, length=4), + TextToken(token='this', position=23, length=4), + TextToken(token='predictable', position=28, length=11), + TextToken(token=',', position=0, length=1), + TextToken(token='cliche-ridden', position=41, length=13), + TextToken(token=',', position=0, length=1), + TextToken(token='``', position=0, length=2), + TextToken(token='Touched', position=58, length=7), + TextToken(token='by', position=66, length=2), + TextToken(token='an', position=69, length=2), + TextToken(token='Angel', position=72, length=5), + TextToken(token="''", position=0, length=1), + TextToken(token='(', position=0, length=1), + TextToken(token='a', position=80, length=1), + TextToken(token='show', position=82, length=4), + TextToken(token='creator', position=87, length=7), + TextToken(token='John', position=95, length=4), + TextToken(token='Masius', position=100, length=6), + TextToken(token='worked', position=107, length=6), + TextToken(token='on', position=114, length=2), + TextToken(token=')', position=0, length=1), + TextToken(token='wanna-be', position=118, length=8), + TextToken(token='if', position=127, length=2), + TextToken(token='she', position=130, length=3), + TextToken(token="didn't", position=134, length=6), + TextToken(token='.', position=0, length=1)] Some issues: From 80658cacfabd5e14ba36d0e25c5d04e4c80e3447 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Thu, 21 Sep 2017 17:07:45 +0300 Subject: [PATCH 03/31] separate statement for every action --- webstruct/html_tokenizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index 1facd66..a344433 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -266,7 +266,9 @@ def _cleanup_elem(self, elem): elem.tail = self._tag_re.sub("", elem.tail) def _tokenize_and_split(self, text): - input_tokens = self._limit_tags(t.token for t in self.text_tokenize_func(text or '')) + text = text or '' + input_tokens = [t.token for t in self.text_tokenize_func(text)] + input_tokens = self._limit_tags(input_tokens) input_tokens = map(six.text_type, input_tokens) return self.sequence_encoder.encode_split(input_tokens) From c52e449079cf30a5f8de52ef10451a61e5ba1e64 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Thu, 21 Sep 2017 17:08:04 +0300 Subject: [PATCH 04/31] comma preserving test --- webstruct/tests/test_html_tokenizer.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/webstruct/tests/test_html_tokenizer.py b/webstruct/tests/test_html_tokenizer.py index f386606..9d1c81e 100644 --- a/webstruct/tests/test_html_tokenizer.py +++ b/webstruct/tests/test_html_tokenizer.py @@ -7,6 +7,7 @@ from webstruct.loaders import GateLoader, HtmlLoader from webstruct.utils import html_document_fromstring from .utils import HtmlTest +import pdb GATE_HTML = b""" @@ -137,3 +138,16 @@ def test_tokenize_scripts_and_styles(self): # and restores the tree if needed 
detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) self.assertHtmlTreeEqual(tree, detokenized_tree) + + def test_detokenize_preserve_commas(self): + html = b""" + + __START_TAG_ORG__ hello __END_TAG_ORG__ a, b world + + """ + + tree = HtmlLoader().loadbytes(html) + tokenizer = HtmlTokenizer() + html_tokens, tags = tokenizer.tokenize_single(tree) + detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) + self.assertHtmlTreeEqual(tree, detokenized_tree) From 81787767645ba1464ae1691264997ee6f24e7d0a Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Thu, 21 Sep 2017 17:41:29 +0300 Subject: [PATCH 05/31] too much tokens around --- webstruct/html_tokenizer.py | 2 +- webstruct/text_tokenizers.py | 142 +++++++++++++++++------------------ 2 files changed, 72 insertions(+), 72 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index a344433..9020794 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -267,7 +267,7 @@ def _cleanup_elem(self, elem): def _tokenize_and_split(self, text): text = text or '' - input_tokens = [t.token for t in self.text_tokenize_func(text)] + input_tokens = [t.chars for t in self.text_tokenize_func(text)] input_tokens = self._limit_tags(input_tokens) input_tokens = map(six.text_type, input_tokens) return self.sequence_encoder.encode_split(input_tokens) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 42d178d..be25ba2 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -3,7 +3,7 @@ import re import collections -TextToken = collections.namedtuple('TextToken', 'token, position, length') +TextToken = collections.namedtuple('TextToken', 'chars, position, length') class WordTokenizer(object): @@ -15,87 +15,87 @@ class WordTokenizer(object): >>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com'] >>> WordTokenizer().tokenize(s) - [TextToken(token='Good', position=0, length=4), - TextToken(token='muffins', position=5, length=7), - TextToken(token='cost', position=13, length=4), - TextToken(token='$', position=0, length=1), - TextToken(token='3.88', position=19, length=4), - TextToken(token='in', position=24, length=2), - TextToken(token='New', position=27, length=3), - TextToken(token='York.', position=31, length=5), - TextToken(token='Email:', position=37, length=6), - TextToken(token='muffins@gmail.com', position=44, length=17)] + [TextToken(chars='Good', position=0, length=4), + TextToken(chars='muffins', position=5, length=7), + TextToken(chars='cost', position=13, length=4), + TextToken(chars='$', position=0, length=1), + TextToken(chars='3.88', position=19, length=4), + TextToken(chars='in', position=24, length=2), + TextToken(chars='New', position=27, length=3), + TextToken(chars='York.', position=31, length=5), + TextToken(chars='Email:', position=37, length=6), + TextToken(chars='muffins@gmail.com', position=44, length=17)] >>> s = '''Shelbourne Road,''' >>> WordTokenizer().tokenize(s) - [TextToken(token='Shelbourne', position=0, length=10), - TextToken(token='Road', position=11, length=4), - TextToken(token=',', position=0, length=1)] + [TextToken(chars='Shelbourne', position=0, length=10), + TextToken(chars='Road', position=11, length=4), + TextToken(chars=',', position=0, length=1)] >>> s = '''population of 100,000''' >>> WordTokenizer().tokenize(s) - [TextToken(token='population', position=0, length=10), - 
TextToken(token='of', position=11, length=2), - TextToken(token='100,000', position=14, length=7)] + [TextToken(chars='population', position=0, length=10), + TextToken(chars='of', position=11, length=2), + TextToken(chars='100,000', position=14, length=7)] >>> s = '''Hello|World''' >>> WordTokenizer().tokenize(s) - [TextToken(token='Hello', position=0, length=5), - TextToken(token='|', position=0, length=1), - TextToken(token='World', position=6, length=5)] + [TextToken(chars='Hello', position=0, length=5), + TextToken(chars='|', position=0, length=1), + TextToken(chars='World', position=6, length=5)] >>> s2 = '"We beat some pretty good teams to get here," Slocum said.' >>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE - [TextToken(token='``', position=0, length=2), - TextToken(token='We', position=2, length=2), - TextToken(token='beat', position=5, length=4), - TextToken(token='some', position=10, length=4), - TextToken(token='pretty', position=15, length=6), - TextToken(token='good', position=22, length=4), - TextToken(token='teams', position=27, length=5), - TextToken(token='to', position=33, length=2), - TextToken(token='get', position=36, length=3), - TextToken(token='here', position=40, length=4), - TextToken(token=',', position=0, length=1), - TextToken(token="''", position=0, length=1), - TextToken(token='Slocum', position=47, length=6), - TextToken(token='said', position=54, length=4), - TextToken(token='.', position=0, length=1)] + [TextToken(chars='``', position=0, length=2), + TextToken(chars='We', position=2, length=2), + TextToken(chars='beat', position=5, length=4), + TextToken(chars='some', position=10, length=4), + TextToken(chars='pretty', position=15, length=6), + TextToken(chars='good', position=22, length=4), + TextToken(chars='teams', position=27, length=5), + TextToken(chars='to', position=33, length=2), + TextToken(chars='get', position=36, length=3), + TextToken(chars='here', position=40, length=4), + TextToken(chars=',', position=0, length=1), + TextToken(chars="''", position=0, length=1), + TextToken(chars='Slocum', position=47, length=6), + TextToken(chars='said', position=54, length=4), + TextToken(chars='.', position=0, length=1)] >>> s3 = '''Well, we couldn't have this predictable, ... cliche-ridden, \"Touched by an ... Angel\" (a show creator John Masius ... 
worked on) wanna-be if she didn't.''' >>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE - [TextToken(token='Well', position=0, length=4), - TextToken(token=',', position=0, length=1), - TextToken(token='we', position=6, length=2), - TextToken(token="couldn't", position=9, length=8), - TextToken(token='have', position=18, length=4), - TextToken(token='this', position=23, length=4), - TextToken(token='predictable', position=28, length=11), - TextToken(token=',', position=0, length=1), - TextToken(token='cliche-ridden', position=41, length=13), - TextToken(token=',', position=0, length=1), - TextToken(token='``', position=0, length=2), - TextToken(token='Touched', position=58, length=7), - TextToken(token='by', position=66, length=2), - TextToken(token='an', position=69, length=2), - TextToken(token='Angel', position=72, length=5), - TextToken(token="''", position=0, length=1), - TextToken(token='(', position=0, length=1), - TextToken(token='a', position=80, length=1), - TextToken(token='show', position=82, length=4), - TextToken(token='creator', position=87, length=7), - TextToken(token='John', position=95, length=4), - TextToken(token='Masius', position=100, length=6), - TextToken(token='worked', position=107, length=6), - TextToken(token='on', position=114, length=2), - TextToken(token=')', position=0, length=1), - TextToken(token='wanna-be', position=118, length=8), - TextToken(token='if', position=127, length=2), - TextToken(token='she', position=130, length=3), - TextToken(token="didn't", position=134, length=6), - TextToken(token='.', position=0, length=1)] + [TextToken(chars='Well', position=0, length=4), + TextToken(chars=',', position=0, length=1), + TextToken(chars='we', position=6, length=2), + TextToken(chars="couldn't", position=9, length=8), + TextToken(chars='have', position=18, length=4), + TextToken(chars='this', position=23, length=4), + TextToken(chars='predictable', position=28, length=11), + TextToken(chars=',', position=0, length=1), + TextToken(chars='cliche-ridden', position=41, length=13), + TextToken(chars=',', position=0, length=1), + TextToken(chars='``', position=0, length=2), + TextToken(chars='Touched', position=58, length=7), + TextToken(chars='by', position=66, length=2), + TextToken(chars='an', position=69, length=2), + TextToken(chars='Angel', position=72, length=5), + TextToken(chars="''", position=0, length=1), + TextToken(chars='(', position=0, length=1), + TextToken(chars='a', position=80, length=1), + TextToken(chars='show', position=82, length=4), + TextToken(chars='creator', position=87, length=7), + TextToken(chars='John', position=95, length=4), + TextToken(chars='Masius', position=100, length=6), + TextToken(chars='worked', position=107, length=6), + TextToken(chars='on', position=114, length=2), + TextToken(chars=')', position=0, length=1), + TextToken(chars='wanna-be', position=118, length=8), + TextToken(chars='if', position=127, length=2), + TextToken(chars='she', position=130, length=3), + TextToken(chars="didn't", position=134, length=6), + TextToken(chars='.', position=0, length=1)] Some issues: @@ -142,7 +142,7 @@ def _tokenize(self, text): token_start = 0 while 1: if i >= len(text): - yield TextToken(token=text[token_start:], + yield TextToken(chars=text[token_start:], position=token_start, length=len(text) - token_start) break @@ -151,24 +151,24 @@ def _tokenize(self, text): for regex, token in self.rules: match = regex.match(partial_text) if match: - yield TextToken(token=text[token_start:i], + yield 
TextToken(chars=text[token_start:i], position=token_start, length=i - token_start) shift = match.end() - match.start() token_start = i + shift if token is None: - yield TextToken(token=match.group(), + yield TextToken(chars=match.group(), position=match.start(), length=shift) else: - yield TextToken(token=token, + yield TextToken(chars=token, position=match.start(), length=shift) break i += shift def tokenize(self, text): - return [t for t in self._tokenize(text) if t.token] + return [t for t in self._tokenize(text) if t.chars] class DefaultTokenizer(WordTokenizer): @@ -183,7 +183,7 @@ def tokenize(self, text): # because we removed punctuation # FIXME: remove as token, but save as feature left/right_punct:"," - return [t for t in tokens if t.token not in {',', ';'}] + return [t for t in tokens if t.chars not in {',', ';'}] tokenize = DefaultTokenizer().tokenize From 51c0932c82bc4c4acb743679eefdf76e55f1fead Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Thu, 21 Sep 2017 19:12:05 +0300 Subject: [PATCH 06/31] encode in indices instead of entities --- webstruct/sequence_encoding.py | 37 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/webstruct/sequence_encoding.py b/webstruct/sequence_encoding.py index ba212ef..566bee9 100644 --- a/webstruct/sequence_encoding.py +++ b/webstruct/sequence_encoding.py @@ -11,13 +11,15 @@ class IobEncoder(object): >>> iob_encoder = IobEncoder() >>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"] - >>> iob_encoder.encode(input_tokens) + >>> [p for p in IobEncoder.from_indicies(iob_encoder.encode(input_tokens), input_tokens)] [('John', 'B-PER'), ('said', 'O')] Get the result in another format using ``encode_split`` method:: >>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"] - >>> tokens, tags = iob_encoder.encode_split(input_tokens) + >>> tokens = iob_encoder.encode(input_tokens) + >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens)] + >>> tokens, tags = iob_encoder.split(tokens) >>> tokens, tags (['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O']) @@ -25,9 +27,15 @@ class IobEncoder(object): stream and continue the encoding later:: >>> iob_encoder = IobEncoder() - >>> iob_encoder.encode(["__START_PER__", "John"]) + >>> input_tokens_partial = ["__START_PER__", "John"] + >>> tokens = iob_encoder.encode(input_tokens_partial) + >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens_partial)] + >>> tokens [('John', 'B-PER')] - >>> iob_encoder.encode(["Mayer", "__END_PER__", "said"]) + >>> input_tokens_partial = ["Mayer", "__END_PER__", "said"] + >>> tokens = iob_encoder.encode(input_tokens_partial) + >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens_partial)] + >>> tokens [('Mayer', 'I-PER'), ('said', 'O')] To reset internal state, use ``reset method``:: @@ -36,7 +44,7 @@ class IobEncoder(object): Group results to entities:: - >>> iob_encoder.group(iob_encoder.encode(input_tokens)) + >>> iob_encoder.group([p for p in IobEncoder.from_indicies(iob_encoder.encode(input_tokens), input_tokens)]) [(['hello'], 'O'), (['John', 'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')] Input token stream is processed by ``InputTokenProcessor()`` by default; @@ -53,7 +61,7 @@ def reset(self): self.tag = 'O' def iter_encode(self, input_tokens): - for token in input_tokens: + for number, token in enumerate(input_tokens): token_type, value = 
self.token_processor.classify(token) if token_type == 'start': @@ -68,7 +76,7 @@ def iter_encode(self, input_tokens): self.tag = "O" elif token_type == 'token': - yield token, self.tag + yield number, self.tag if self.tag[0] == 'B': self.tag = "I" + self.tag[1:] @@ -81,14 +89,17 @@ def iter_encode(self, input_tokens): def encode(self, input_tokens): return list(self.iter_encode(input_tokens)) - def encode_split(self, input_tokens): - """ The same as ``encode``, but returns ``(tokens, tags)`` tuple """ - res = self.encode(input_tokens) - if not res: - return (), () - tokens, tags = zip(*res) + def split(self, tokens): + """ split ``[(token, tag)]`` to ``([token], [tags])`` tuple """ + tokens, tags = zip(*tokens) return list(tokens), list(tags) + @classmethod + def from_indicies(Cls, indicies, input_tokens): + for idx, tag in indicies: + yield input_tokens[idx], tag + + @classmethod def group(cls, data, strict=False): """ From 1a667ec72901d47e72a5256b868fb21babaf5dc5 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Thu, 21 Sep 2017 19:20:49 +0300 Subject: [PATCH 07/31] handle empty lists --- webstruct/sequence_encoding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/webstruct/sequence_encoding.py b/webstruct/sequence_encoding.py index 566bee9..d91349d 100644 --- a/webstruct/sequence_encoding.py +++ b/webstruct/sequence_encoding.py @@ -91,8 +91,7 @@ def encode(self, input_tokens): def split(self, tokens): """ split ``[(token, tag)]`` to ``([token], [tags])`` tuple """ - tokens, tags = zip(*tokens) - return list(tokens), list(tags) + return [t[0] for t in tokens], [t[1] for t in tokens] @classmethod def from_indicies(Cls, indicies, input_tokens): From 24465b1ff04940488e9f2a120ae8fa76102d7cde Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Thu, 21 Sep 2017 19:31:02 +0300 Subject: [PATCH 08/31] pass token length and position from TextToken to HtmlToken --- webstruct/html_tokenizer.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index 9020794..95ab044 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -19,7 +19,7 @@ from lxml.etree import XPathEvaluator, Comment from webstruct.sequence_encoding import IobEncoder -from webstruct.text_tokenizers import tokenize +from webstruct.text_tokenizers import tokenize, TextToken from webstruct.utils import ( replace_html_tags, kill_html_tags, @@ -27,7 +27,7 @@ ) -_HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail') +_HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail position length') class HtmlToken(_HtmlToken): @@ -41,6 +41,8 @@ class HtmlToken(_HtmlToken): * :attr:`elem` is the current html block (as lxml's Element) - most likely you want :attr:`parent` instead of it * :attr:`is_tail` flag indicates that token belongs to element tail + * :attr:`position is position of token start in parent text + * :attr:`length is length of token in parent text Computed properties: @@ -64,8 +66,8 @@ def root(self): return self.elem.getroottree() def __repr__(self): - return "HtmlToken(token=%r, parent=%r, index=%s)" % ( - self.token, self.parent, self.index + return "HtmlToken(token=%r, parent=%r, index=%s, position=%d, length=%d)" % ( + self.token, self.parent, self.index, self.position, self.length ) @@ -142,7 +144,7 @@ def tokenize_single(self, tree): >>> tree = loader.loadbytes(b"

<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>
") >>> html_tokens, tags = html_tokenizer.tokenize_single(tree) >>> html_tokens - [HtmlToken(token='hello', parent=, index=0), HtmlToken...] + [HtmlToken(token='hello', parent=, index=0, ...), HtmlToken...] >>> tags ['O', 'B-PER', 'I-PER', 'B-PER', 'O'] >>> for tok, iob_tag in zip(html_tokens, tags): @@ -245,16 +247,18 @@ def _process_tree(self, tree): return head_tokens, head_tags = self._tokenize_and_split(tree.text) + char_tokens = [t.chars for t in head_tokens] for index, (token, tag) in enumerate(zip(head_tokens, head_tags)): - yield HtmlToken(index, head_tokens, tree, False), tag + yield HtmlToken(index, char_tokens, tree, False, token.position, token.length), tag for child in tree: # where is my precious "yield from"? for html_token, tag in self._process_tree(child): yield html_token, tag tail_tokens, tail_tags = self._tokenize_and_split(tree.tail) + char_tokens = [t.chars for t in tail_tokens] for index, (token, tag) in enumerate(zip(tail_tokens, tail_tags)): - yield HtmlToken(index, tail_tokens, tree, True), tag + yield HtmlToken(index, char_tokens, tree, True, token.position, token.length), tag self._cleanup_elem(tree) @@ -267,17 +271,21 @@ def _cleanup_elem(self, elem): def _tokenize_and_split(self, text): text = text or '' - input_tokens = [t.chars for t in self.text_tokenize_func(text)] + input_tokens = [t for t in self.text_tokenize_func(text)] input_tokens = self._limit_tags(input_tokens) - input_tokens = map(six.text_type, input_tokens) - return self.sequence_encoder.encode_split(input_tokens) + input_tokens = [TextToken(chars=six.text_type(t.chars), + position=t.position, + length=t.length) for t in input_tokens] + chains = self.sequence_encoder.encode(t.chars for t in input_tokens) + chains = [l for l in self.sequence_encoder.from_indicies(chains, input_tokens)] + return self.sequence_encoder.split(chains) def _limit_tags(self, input_tokens): if self.tagset is None: return input_tokens proc = self.sequence_encoder.token_processor - token_classes = [proc.classify(tok) for tok in input_tokens] + token_classes = [proc.classify(tok.chars) for tok in input_tokens] return [ tok for (tok, (typ, value)) in zip(input_tokens, token_classes) if not (typ in {'start', 'end'} and value not in self.tagset) From 06befbb73362d4186af69ade2de6f7c2e0790fbe Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 22 Sep 2017 13:48:46 +0300 Subject: [PATCH 09/31] letter perfect detokenization --- webstruct/html_tokenizer.py | 67 ++++++++++++++++---------- webstruct/tests/test_html_tokenizer.py | 14 +++--- 2 files changed, 49 insertions(+), 32 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index 95ab044..07de2a0 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -16,14 +16,13 @@ import six from six.moves import zip -from lxml.etree import XPathEvaluator, Comment +from lxml.etree import Comment from webstruct.sequence_encoding import IobEncoder from webstruct.text_tokenizers import tokenize, TextToken from webstruct.utils import ( replace_html_tags, kill_html_tags, - smart_join, ) @@ -182,6 +181,7 @@ def detokenize_single(self, html_tokens, tags): Build annotated ``lxml.etree.ElementTree`` from ``html_tokens`` (a list of :class:`.HtmlToken` instances) and ``tags`` (a list of their tags). + **ATTENTION**: ``html_tokens`` should be tokenized from tree without tags Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__`` text tokens (this is the format :mod:`webstruct.loaders` use). 
@@ -192,9 +192,7 @@ def detokenize_single(self, html_tokens, tags): if not html_tokens: return None - orig_tree = html_tokens[0].root - tree = copy.deepcopy(orig_tree) - xpatheval = XPathEvaluator(tree) + tree = html_tokens[0].root # find starts/ends of token groups token_groups = self.sequence_encoder.group(zip(html_tokens, tags)) @@ -208,30 +206,47 @@ def detokenize_single(self, html_tokens, tags): pos += n_tokens # mark starts/ends with special tokens - data = zip(html_tokens, tags, range(len(html_tokens))) - keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail) + data = [(s, True) for s in starts] + data.extend((s, False) for s in ends) + keyfunc = lambda rec: (id(html_tokens[rec[0]].elem), html_tokens[rec[0]].is_tail) + data.sort(key = keyfunc) - for (orig_elem, is_tail), g in groupby(data, keyfunc): + for (_, is_tail), g in groupby(data, keyfunc): g = list(g) - fix = False - tokens = g[0][0].tokens[:] - for token, tag, token_idx in g: - if token_idx in starts: - text = ' __START_%s__ %s' % (tag[2:], tokens[token.index]) - tokens[token.index] = text - fix = True - if token_idx in ends: - text = '%s __END_%s__ ' % (tokens[token.index], tag[2:]) - tokens[token.index] = text - fix = True - - if fix: - xpath = orig_tree.getpath(orig_elem) - elem = xpatheval(xpath)[0] - if is_tail: - elem.tail = smart_join(tokens) + g.sort(key = lambda t:(html_tokens[t[0]].position, not t[1])) + + if not g: + continue + + elem = html_tokens[g[0][0]].elem + + pos_in_source = 0 + source = elem.text + if is_tail: + source = elem.tail + + modded = '' + + for idx, is_starts in g: + token = html_tokens[idx] + tag = tags[idx] + modded = modded + source[pos_in_source:token.position] + pos_in_source = token.position + if is_starts: + patch = ' __START_%s__ ' % (tag[2:],) + modded = modded + patch else: - elem.text = smart_join(tokens) + modded = modded + source[pos_in_source:pos_in_source + token.length] + pos_in_source = pos_in_source + token.length + patch = ' __END_%s__ ' % (tag[2:],) + modded = modded + patch + + + modded = modded + source[pos_in_source:] + if is_tail: + elem.tail = modded + else: + elem.text = modded return tree diff --git a/webstruct/tests/test_html_tokenizer.py b/webstruct/tests/test_html_tokenizer.py index 9d1c81e..1ac102d 100644 --- a/webstruct/tests/test_html_tokenizer.py +++ b/webstruct/tests/test_html_tokenizer.py @@ -7,7 +7,6 @@ from webstruct.loaders import GateLoader, HtmlLoader from webstruct.utils import html_document_fromstring from .utils import HtmlTest -import pdb GATE_HTML = b""" @@ -94,6 +93,7 @@ def test_detokenize_single(self): html_document_fromstring(UNANNOTATED_HTML) ) + html_tokens, _ = tokenizer.tokenize_single(new_tree.getroot()) detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) self.assertIn(b'__START_ORG__', tostring(detokenized_tree)) @@ -140,14 +140,16 @@ def test_tokenize_scripts_and_styles(self): self.assertHtmlTreeEqual(tree, detokenized_tree) def test_detokenize_preserve_commas(self): - html = b""" + annotated_html = b""" - __START_TAG_ORG__ hello __END_TAG_ORG__ a, b world + __START_ORG__ hello __END_ORG__ a, b world """ - tree = HtmlLoader().loadbytes(html) + annotated_tree = HtmlLoader().loadbytes(annotated_html) tokenizer = HtmlTokenizer() - html_tokens, tags = tokenizer.tokenize_single(tree) + html_tokens, tags = tokenizer.tokenize_single(annotated_tree) + clean_tree = html_tokens[0].root.getroot() + html_tokens, _ = tokenizer.tokenize_single(clean_tree) detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) - 
self.assertHtmlTreeEqual(tree, detokenized_tree) + self.assertHtmlTreeEqual(annotated_tree, detokenized_tree) From e5730b21cae928af85a90d7fcb643f4d79cf3989 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Mon, 25 Sep 2017 14:33:50 +0000 Subject: [PATCH 10/31] do not cleanup tokenized tree by default, separate method for tree cleanup --- webstruct/html_tokenizer.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index 07de2a0..a98d9a9 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -16,7 +16,7 @@ import six from six.moves import zip -from lxml.etree import Comment +from lxml.etree import Comment, iterwalk from webstruct.sequence_encoding import IobEncoder from webstruct.text_tokenizers import tokenize, TextToken @@ -275,7 +275,12 @@ def _process_tree(self, tree): for index, (token, tag) in enumerate(zip(tail_tokens, tail_tags)): yield HtmlToken(index, char_tokens, tree, True, token.position, token.length), tag - self._cleanup_elem(tree) + def cleanup_tree(self, tree): + cleaned = copy.deepcopy(tree) + for _, elem in iterwalk(cleaned): + self._cleanup_elem(elem) + + return cleaned def _cleanup_elem(self, elem): """ Remove special tokens from elem """ From e34044471496d00384eca78f928e0b5ebbd784e5 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Mon, 25 Sep 2017 14:40:47 +0000 Subject: [PATCH 11/31] update tests for separate tree cleaning --- webstruct/tests/test_html_tokenizer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/webstruct/tests/test_html_tokenizer.py b/webstruct/tests/test_html_tokenizer.py index 1ac102d..fdd0c58 100644 --- a/webstruct/tests/test_html_tokenizer.py +++ b/webstruct/tests/test_html_tokenizer.py @@ -69,8 +69,6 @@ def assertTokenizationWorks(self, tree): [u'B-ORG', u'I-ORG', 'O', 'O', 'O', 'O', u'B-CITY'] ) - tree = html_tokens[0].root - self.assertNotIn(b'__', tostring(tree)) def test_tokenize_single(self): self.assertTokenizationWorks(self._load()) @@ -84,7 +82,7 @@ def test_detokenize_single(self): tokenizer = HtmlTokenizer() html_tokens, tags = tokenizer.tokenize_single(src_tree) - new_tree = html_tokens[0].root + new_tree = tokenizer.cleanup_tree(src_tree) self.assertIn(b'__START_ORG__', tostring(src_tree)) self.assertNotIn(b'__START_ORG__', tostring(new_tree)) @@ -93,7 +91,7 @@ def test_detokenize_single(self): html_document_fromstring(UNANNOTATED_HTML) ) - html_tokens, _ = tokenizer.tokenize_single(new_tree.getroot()) + html_tokens, _ = tokenizer.tokenize_single(new_tree) detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) self.assertIn(b'__START_ORG__', tostring(detokenized_tree)) @@ -149,7 +147,7 @@ def test_detokenize_preserve_commas(self): annotated_tree = HtmlLoader().loadbytes(annotated_html) tokenizer = HtmlTokenizer() html_tokens, tags = tokenizer.tokenize_single(annotated_tree) - clean_tree = html_tokens[0].root.getroot() + clean_tree = tokenizer.cleanup_tree(annotated_tree) html_tokens, _ = tokenizer.tokenize_single(clean_tree) detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) self.assertHtmlTreeEqual(annotated_tree, detokenized_tree) From 89673c114427e292f23a5490f529cae9626a55e3 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Mon, 25 Sep 2017 14:47:27 +0000 Subject: [PATCH 12/31] update tests for correct punctuation positions --- webstruct/text_tokenizers.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git 
a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index be25ba2..e2a0a58 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -18,7 +18,7 @@ class WordTokenizer(object): [TextToken(chars='Good', position=0, length=4), TextToken(chars='muffins', position=5, length=7), TextToken(chars='cost', position=13, length=4), - TextToken(chars='$', position=0, length=1), + TextToken(chars='$', position=18, length=1), TextToken(chars='3.88', position=19, length=4), TextToken(chars='in', position=24, length=2), TextToken(chars='New', position=27, length=3), @@ -30,7 +30,7 @@ class WordTokenizer(object): >>> WordTokenizer().tokenize(s) [TextToken(chars='Shelbourne', position=0, length=10), TextToken(chars='Road', position=11, length=4), - TextToken(chars=',', position=0, length=1)] + TextToken(chars=',', position=15, length=1)] >>> s = '''population of 100,000''' >>> WordTokenizer().tokenize(s) @@ -41,7 +41,7 @@ class WordTokenizer(object): >>> s = '''Hello|World''' >>> WordTokenizer().tokenize(s) [TextToken(chars='Hello', position=0, length=5), - TextToken(chars='|', position=0, length=1), + TextToken(chars='|', position=5, length=1), TextToken(chars='World', position=6, length=5)] >>> s2 = '"We beat some pretty good teams to get here," Slocum said.' @@ -56,33 +56,33 @@ class WordTokenizer(object): TextToken(chars='to', position=33, length=2), TextToken(chars='get', position=36, length=3), TextToken(chars='here', position=40, length=4), - TextToken(chars=',', position=0, length=1), - TextToken(chars="''", position=0, length=1), + TextToken(chars=',', position=44, length=1), + TextToken(chars="''", position=45, length=1), TextToken(chars='Slocum', position=47, length=6), TextToken(chars='said', position=54, length=4), - TextToken(chars='.', position=0, length=1)] + TextToken(chars='.', position=58, length=1)] >>> s3 = '''Well, we couldn't have this predictable, ... cliche-ridden, \"Touched by an ... Angel\" (a show creator John Masius ... 
worked on) wanna-be if she didn't.''' >>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE [TextToken(chars='Well', position=0, length=4), - TextToken(chars=',', position=0, length=1), + TextToken(chars=',', position=4, length=1), TextToken(chars='we', position=6, length=2), TextToken(chars="couldn't", position=9, length=8), TextToken(chars='have', position=18, length=4), TextToken(chars='this', position=23, length=4), TextToken(chars='predictable', position=28, length=11), - TextToken(chars=',', position=0, length=1), + TextToken(chars=',', position=39, length=1), TextToken(chars='cliche-ridden', position=41, length=13), - TextToken(chars=',', position=0, length=1), - TextToken(chars='``', position=0, length=2), + TextToken(chars=',', position=54, length=1), + TextToken(chars='``', position=56, length=2), TextToken(chars='Touched', position=58, length=7), TextToken(chars='by', position=66, length=2), TextToken(chars='an', position=69, length=2), TextToken(chars='Angel', position=72, length=5), - TextToken(chars="''", position=0, length=1), - TextToken(chars='(', position=0, length=1), + TextToken(chars="''", position=77, length=1), + TextToken(chars='(', position=79, length=1), TextToken(chars='a', position=80, length=1), TextToken(chars='show', position=82, length=4), TextToken(chars='creator', position=87, length=7), @@ -90,12 +90,12 @@ class WordTokenizer(object): TextToken(chars='Masius', position=100, length=6), TextToken(chars='worked', position=107, length=6), TextToken(chars='on', position=114, length=2), - TextToken(chars=')', position=0, length=1), + TextToken(chars=')', position=116, length=1), TextToken(chars='wanna-be', position=118, length=8), TextToken(chars='if', position=127, length=2), TextToken(chars='she', position=130, length=3), TextToken(chars="didn't", position=134, length=6), - TextToken(chars='.', position=0, length=1)] + TextToken(chars='.', position=140, length=1)] Some issues: @@ -158,11 +158,11 @@ def _tokenize(self, text): token_start = i + shift if token is None: yield TextToken(chars=match.group(), - position=match.start(), + position=i + match.start(), length=shift) else: yield TextToken(chars=token, - position=match.start(), + position=i + match.start(), length=shift) break i += shift From 7c4598464017c2d4eee078ebbfd30e81be997ec8 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Mon, 25 Sep 2017 14:48:56 +0000 Subject: [PATCH 13/31] correct length for replaced quotes --- webstruct/text_tokenizers.py | 91 ++++++++++++++++++++++-------------- 1 file changed, 55 insertions(+), 36 deletions(-) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index e2a0a58..29cf8aa 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -46,21 +46,21 @@ class WordTokenizer(object): >>> s2 = '"We beat some pretty good teams to get here," Slocum said.' 
>>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE - [TextToken(chars='``', position=0, length=2), - TextToken(chars='We', position=2, length=2), - TextToken(chars='beat', position=5, length=4), - TextToken(chars='some', position=10, length=4), - TextToken(chars='pretty', position=15, length=6), - TextToken(chars='good', position=22, length=4), - TextToken(chars='teams', position=27, length=5), - TextToken(chars='to', position=33, length=2), - TextToken(chars='get', position=36, length=3), - TextToken(chars='here', position=40, length=4), - TextToken(chars=',', position=44, length=1), - TextToken(chars="''", position=45, length=1), - TextToken(chars='Slocum', position=47, length=6), - TextToken(chars='said', position=54, length=4), - TextToken(chars='.', position=58, length=1)] + [TextToken(chars='``', position=0, length=1), + TextToken(chars='We', position=1, length=2), + TextToken(chars='beat', position=4, length=4), + TextToken(chars='some', position=9, length=4), + TextToken(chars='pretty', position=14, length=6), + TextToken(chars='good', position=21, length=4), + TextToken(chars='teams', position=26, length=5), + TextToken(chars='to', position=32, length=2), + TextToken(chars='get', position=35, length=3), + TextToken(chars='here', position=39, length=4), + TextToken(chars=',', position=43, length=1), + TextToken(chars="''", position=44, length=1), + TextToken(chars='Slocum', position=46, length=6), + TextToken(chars='said', position=53, length=4), + TextToken(chars='.', position=57, length=1)] >>> s3 = '''Well, we couldn't have this predictable, ... cliche-ridden, \"Touched by an ... Angel\" (a show creator John Masius @@ -76,26 +76,33 @@ class WordTokenizer(object): TextToken(chars=',', position=39, length=1), TextToken(chars='cliche-ridden', position=41, length=13), TextToken(chars=',', position=54, length=1), - TextToken(chars='``', position=56, length=2), - TextToken(chars='Touched', position=58, length=7), - TextToken(chars='by', position=66, length=2), - TextToken(chars='an', position=69, length=2), - TextToken(chars='Angel', position=72, length=5), - TextToken(chars="''", position=77, length=1), - TextToken(chars='(', position=79, length=1), - TextToken(chars='a', position=80, length=1), - TextToken(chars='show', position=82, length=4), - TextToken(chars='creator', position=87, length=7), - TextToken(chars='John', position=95, length=4), - TextToken(chars='Masius', position=100, length=6), - TextToken(chars='worked', position=107, length=6), - TextToken(chars='on', position=114, length=2), - TextToken(chars=')', position=116, length=1), - TextToken(chars='wanna-be', position=118, length=8), - TextToken(chars='if', position=127, length=2), - TextToken(chars='she', position=130, length=3), - TextToken(chars="didn't", position=134, length=6), - TextToken(chars='.', position=140, length=1)] + TextToken(chars='``', position=56, length=1), + TextToken(chars='Touched', position=57, length=7), + TextToken(chars='by', position=65, length=2), + TextToken(chars='an', position=68, length=2), + TextToken(chars='Angel', position=71, length=5), + TextToken(chars="''", position=76, length=1), + TextToken(chars='(', position=78, length=1), + TextToken(chars='a', position=79, length=1), + TextToken(chars='show', position=81, length=4), + TextToken(chars='creator', position=86, length=7), + TextToken(chars='John', position=94, length=4), + TextToken(chars='Masius', position=99, length=6), + TextToken(chars='worked', position=106, length=6), + TextToken(chars='on', position=113, 
length=2), + TextToken(chars=')', position=115, length=1), + TextToken(chars='wanna-be', position=117, length=8), + TextToken(chars='if', position=126, length=2), + TextToken(chars='she', position=129, length=3), + TextToken(chars="didn't", position=133, length=6), + TextToken(chars='.', position=139, length=1)] + + >>> WordTokenizer().tokenize('"') + [TextToken(chars='``', position=0, length=1)] + + >>> WordTokenizer().tokenize('" a') + [TextToken(chars='``', position=0, length=1), + TextToken(chars='a', position=2, length=1)] Some issues: @@ -136,7 +143,19 @@ class WordTokenizer(object): def _tokenize(self, text): # this one cannot be placed in the loop because it requires # position check (beginning of the string) or previous char value - text = self.open_quotes.sub(r'\1``', text) + quote = self.open_quotes.search(text) + if quote is not None: + end = quote.end() - 1 + for t in self._tokenize(text[:end]): + yield t + yield TextToken(chars='``', position=end, length=1) + shift = end + 1 + for t in self._tokenize(text[shift:]): + yield TextToken(chars=t.chars, + position=t.position + shift, + length=t.length) + return + i = 0 token_start = 0 From 46fc4df8023fa71f43e267dbd8256a0ff9ade2d5 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 08:49:40 +0000 Subject: [PATCH 14/31] pep8 --- webstruct/html_tokenizer.py | 44 ++++++++++++++++++++++++---------- webstruct/sequence_encoding.py | 2 -- webstruct/text_tokenizers.py | 4 ++-- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index a98d9a9..753fd6b 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -16,7 +16,7 @@ import six from six.moves import zip -from lxml.etree import Comment, iterwalk +from lxml.etree import Comment, iterwalk from webstruct.sequence_encoding import IobEncoder from webstruct.text_tokenizers import tokenize, TextToken @@ -26,7 +26,12 @@ ) -_HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail position length') +_HtmlToken = namedtuple('HtmlToken', ['index', + 'tokens', + 'elem', + 'is_tail', + 'position', + 'length']) class HtmlToken(_HtmlToken): @@ -65,7 +70,9 @@ def root(self): return self.elem.getroottree() def __repr__(self): - return "HtmlToken(token=%r, parent=%r, index=%s, position=%d, length=%d)" % ( + return ("HtmlToken(" + "token=%r, parent=%r, index=%s, position=%d, length=%d" + ")") % ( self.token, self.parent, self.index, self.position, self.length ) @@ -86,7 +93,8 @@ class HtmlTokenizer(object): ---------- tagset : set, optional - A set of entity types to keep. If not passed, all entity types are kept. + A set of entity types to keep. + If not passed, all entity types are kept. Use this argument to discard some entity types from training data. sequence_encoder : object, optional Sequence encoder object. If not passed, @@ -181,7 +189,8 @@ def detokenize_single(self, html_tokens, tags): Build annotated ``lxml.etree.ElementTree`` from ``html_tokens`` (a list of :class:`.HtmlToken` instances) and ``tags`` (a list of their tags). - **ATTENTION**: ``html_tokens`` should be tokenized from tree without tags + **ATTENTION**: ``html_tokens`` should be tokenized from tree + without tags Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__`` text tokens (this is the format :mod:`webstruct.loaders` use). 
@@ -209,11 +218,11 @@ def detokenize_single(self, html_tokens, tags): data = [(s, True) for s in starts] data.extend((s, False) for s in ends) keyfunc = lambda rec: (id(html_tokens[rec[0]].elem), html_tokens[rec[0]].is_tail) - data.sort(key = keyfunc) + data.sort(key=keyfunc) for (_, is_tail), g in groupby(data, keyfunc): g = list(g) - g.sort(key = lambda t:(html_tokens[t[0]].position, not t[1])) + g.sort(key=lambda t: (html_tokens[t[0]].position, not t[1])) if not g: continue @@ -236,12 +245,12 @@ def detokenize_single(self, html_tokens, tags): patch = ' __START_%s__ ' % (tag[2:],) modded = modded + patch else: - modded = modded + source[pos_in_source:pos_in_source + token.length] + end_in_source = pos_in_source + token.length + modded = modded + source[pos_in_source:end_in_source] pos_in_source = pos_in_source + token.length patch = ' __END_%s__ ' % (tag[2:],) modded = modded + patch - modded = modded + source[pos_in_source:] if is_tail: elem.tail = modded @@ -264,7 +273,12 @@ def _process_tree(self, tree): head_tokens, head_tags = self._tokenize_and_split(tree.text) char_tokens = [t.chars for t in head_tokens] for index, (token, tag) in enumerate(zip(head_tokens, head_tags)): - yield HtmlToken(index, char_tokens, tree, False, token.position, token.length), tag + yield HtmlToken(index, + char_tokens, + tree, + False, + token.position, + token.length), tag for child in tree: # where is my precious "yield from"? for html_token, tag in self._process_tree(child): @@ -273,7 +287,12 @@ def _process_tree(self, tree): tail_tokens, tail_tags = self._tokenize_and_split(tree.tail) char_tokens = [t.chars for t in tail_tokens] for index, (token, tag) in enumerate(zip(tail_tokens, tail_tags)): - yield HtmlToken(index, char_tokens, tree, True, token.position, token.length), tag + yield HtmlToken(index, + char_tokens, + tree, + True, + token.position, + token.length), tag def cleanup_tree(self, tree): cleaned = copy.deepcopy(tree) @@ -297,7 +316,8 @@ def _tokenize_and_split(self, text): position=t.position, length=t.length) for t in input_tokens] chains = self.sequence_encoder.encode(t.chars for t in input_tokens) - chains = [l for l in self.sequence_encoder.from_indicies(chains, input_tokens)] + chains = self.sequence_encoder.from_indicies(chains, input_tokens) + chains = [l for l in chains] return self.sequence_encoder.split(chains) def _limit_tags(self, input_tokens): diff --git a/webstruct/sequence_encoding.py b/webstruct/sequence_encoding.py index d91349d..40c3ae7 100644 --- a/webstruct/sequence_encoding.py +++ b/webstruct/sequence_encoding.py @@ -98,7 +98,6 @@ def from_indicies(Cls, indicies, input_tokens): for idx, tag in indicies: yield input_tokens[idx], tag - @classmethod def group(cls, data, strict=False): """ @@ -196,4 +195,3 @@ def classify(self, token): # regular token return 'token', token - diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 29cf8aa..9d46527 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -156,7 +156,6 @@ def _tokenize(self, text): length=t.length) return - i = 0 token_start = 0 while 1: @@ -194,7 +193,8 @@ class DefaultTokenizer(WordTokenizer): def tokenize(self, text): tokens = super(DefaultTokenizer, self).tokenize(text) # remove standalone commas and semicolons - # as they broke tag sets, e.g. PERSON->FUNCTION in case "PERSON, FUNCTION" + # as they broke tag sets + # , e.g. PERSON->FUNCTION in case "PERSON, FUNCTION" # but it has negative consequences, e.g. 
# etalon: [PER-B, PER-I, FUNC-B] From 388170ecd7b849f9a821bd013a0c7fc3b6e81b04 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 09:52:16 +0000 Subject: [PATCH 15/31] comma at line end, not start --- webstruct/text_tokenizers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 9d46527..6ad2a47 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -193,8 +193,8 @@ class DefaultTokenizer(WordTokenizer): def tokenize(self, text): tokens = super(DefaultTokenizer, self).tokenize(text) # remove standalone commas and semicolons - # as they broke tag sets - # , e.g. PERSON->FUNCTION in case "PERSON, FUNCTION" + # as they broke tag sets, + # e.g. PERSON->FUNCTION in case "PERSON, FUNCTION" # but it has negative consequences, e.g. # etalon: [PER-B, PER-I, FUNC-B] From 71caf612b7936978beafad225183dde471f6decd Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 09:58:06 +0000 Subject: [PATCH 16/31] one join instead of many additions, dont be Schleimel --- webstruct/html_tokenizer.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index 753fd6b..e423332 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -234,24 +234,26 @@ def detokenize_single(self, html_tokens, tags): if is_tail: source = elem.tail - modded = '' + mods = list() for idx, is_starts in g: token = html_tokens[idx] tag = tags[idx] - modded = modded + source[pos_in_source:token.position] + mods.append(source[pos_in_source:token.position]) pos_in_source = token.position if is_starts: patch = ' __START_%s__ ' % (tag[2:],) - modded = modded + patch + mods.append(patch) else: end_in_source = pos_in_source + token.length - modded = modded + source[pos_in_source:end_in_source] + mods.append(source[pos_in_source:end_in_source]) pos_in_source = pos_in_source + token.length patch = ' __END_%s__ ' % (tag[2:],) - modded = modded + patch + mods.append(patch) + + mods.append(source[pos_in_source:]) + modded = ''.join(mods) - modded = modded + source[pos_in_source:] if is_tail: elem.tail = modded else: From 37d7470837748797a76467299613896e7d4c373b Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 10:02:37 +0000 Subject: [PATCH 17/31] correct formatting --- webstruct/html_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index e423332..c66b5e3 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -45,8 +45,8 @@ class HtmlToken(_HtmlToken): * :attr:`elem` is the current html block (as lxml's Element) - most likely you want :attr:`parent` instead of it * :attr:`is_tail` flag indicates that token belongs to element tail - * :attr:`position is position of token start in parent text - * :attr:`length is length of token in parent text + * :attr:`position` is position of token start in parent text + * :attr:`length` is length of token in parent text Computed properties: From e93c6dc3a9993a9c097002daebc650de937a2eb5 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 10:18:00 +0000 Subject: [PATCH 18/31] add clarification --- webstruct/html_tokenizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index c66b5e3..7e3aebe 100644 --- a/webstruct/html_tokenizer.py 
+++ b/webstruct/html_tokenizer.py @@ -45,8 +45,8 @@ class HtmlToken(_HtmlToken): * :attr:`elem` is the current html block (as lxml's Element) - most likely you want :attr:`parent` instead of it * :attr:`is_tail` flag indicates that token belongs to element tail - * :attr:`position` is position of token start in parent text - * :attr:`length` is length of token in parent text + * :attr:`position` is logical position(in letters or codepoints) of token start in parent text + * :attr:`length` is logical length(in letters or codepoints) of token in parent text Computed properties: From e02c275aab4dfc9c791224d1dec5c4a4a6f25b78 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 10:58:41 +0000 Subject: [PATCH 19/31] fix typo --- webstruct/html_tokenizer.py | 2 +- webstruct/sequence_encoding.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index 7e3aebe..bcb65be 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -318,7 +318,7 @@ def _tokenize_and_split(self, text): position=t.position, length=t.length) for t in input_tokens] chains = self.sequence_encoder.encode(t.chars for t in input_tokens) - chains = self.sequence_encoder.from_indicies(chains, input_tokens) + chains = self.sequence_encoder.from_indices(chains, input_tokens) chains = [l for l in chains] return self.sequence_encoder.split(chains) diff --git a/webstruct/sequence_encoding.py b/webstruct/sequence_encoding.py index 40c3ae7..6152066 100644 --- a/webstruct/sequence_encoding.py +++ b/webstruct/sequence_encoding.py @@ -11,14 +11,13 @@ class IobEncoder(object): >>> iob_encoder = IobEncoder() >>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"] - >>> [p for p in IobEncoder.from_indicies(iob_encoder.encode(input_tokens), input_tokens)] + >>> [p for p in IobEncoder.from_indices(iob_encoder.encode(input_tokens), input_tokens)] [('John', 'B-PER'), ('said', 'O')] - Get the result in another format using ``encode_split`` method:: >>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"] >>> tokens = iob_encoder.encode(input_tokens) - >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens)] + >>> tokens = [p for p in IobEncoder.from_indices(tokens, input_tokens)] >>> tokens, tags = iob_encoder.split(tokens) >>> tokens, tags (['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O']) @@ -29,12 +28,12 @@ class IobEncoder(object): >>> iob_encoder = IobEncoder() >>> input_tokens_partial = ["__START_PER__", "John"] >>> tokens = iob_encoder.encode(input_tokens_partial) - >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens_partial)] + >>> tokens = [p for p in IobEncoder.from_indices(tokens, input_tokens_partial)] >>> tokens [('John', 'B-PER')] >>> input_tokens_partial = ["Mayer", "__END_PER__", "said"] >>> tokens = iob_encoder.encode(input_tokens_partial) - >>> tokens = [p for p in IobEncoder.from_indicies(tokens, input_tokens_partial)] + >>> tokens = [p for p in IobEncoder.from_indices(tokens, input_tokens_partial)] >>> tokens [('Mayer', 'I-PER'), ('said', 'O')] @@ -44,7 +43,7 @@ class IobEncoder(object): Group results to entities:: - >>> iob_encoder.group([p for p in IobEncoder.from_indicies(iob_encoder.encode(input_tokens), input_tokens)]) + >>> iob_encoder.group([p for p in IobEncoder.from_indices(iob_encoder.encode(input_tokens), input_tokens)]) [(['hello'], 'O'), (['John', 
'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')] Input token stream is processed by ``InputTokenProcessor()`` by default; @@ -94,8 +93,8 @@ def split(self, tokens): return [t[0] for t in tokens], [t[1] for t in tokens] @classmethod - def from_indicies(Cls, indicies, input_tokens): - for idx, tag in indicies: + def from_indices(Cls, indices, input_tokens): + for idx, tag in indices: yield input_tokens[idx], tag @classmethod From f26569f3550110b9bceacff3bcf8a6e17ab0c539 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 10:59:04 +0000 Subject: [PATCH 20/31] pep8 --- webstruct/sequence_encoding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webstruct/sequence_encoding.py b/webstruct/sequence_encoding.py index 6152066..ff6d8bf 100644 --- a/webstruct/sequence_encoding.py +++ b/webstruct/sequence_encoding.py @@ -93,7 +93,7 @@ def split(self, tokens): return [t[0] for t in tokens], [t[1] for t in tokens] @classmethod - def from_indices(Cls, indices, input_tokens): + def from_indices(cls, indices, input_tokens): for idx, tag in indices: yield input_tokens[idx], tag From d1aecbb2e3f4fbbc207ac2795081cef7c32baa9f Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 11:21:41 +0000 Subject: [PATCH 21/31] preserve tokenize method for compatibility --- webstruct/text_tokenizers.py | 45 +++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 6ad2a47..a0fa4e1 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -12,9 +12,9 @@ class WordTokenizer(object): >>> from nltk.tokenize.treebank import TreebankWordTokenizer # doctest: +SKIP >>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com''' - >>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP + >>> TreebankWordTokenizer().span_tokenize(s) # doctest: +SKIP ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com'] - >>> WordTokenizer().tokenize(s) + >>> WordTokenizer().span_tokenize(s) [TextToken(chars='Good', position=0, length=4), TextToken(chars='muffins', position=5, length=7), TextToken(chars='cost', position=13, length=4), @@ -27,25 +27,25 @@ class WordTokenizer(object): TextToken(chars='muffins@gmail.com', position=44, length=17)] >>> s = '''Shelbourne Road,''' - >>> WordTokenizer().tokenize(s) + >>> WordTokenizer().span_tokenize(s) [TextToken(chars='Shelbourne', position=0, length=10), TextToken(chars='Road', position=11, length=4), TextToken(chars=',', position=15, length=1)] >>> s = '''population of 100,000''' - >>> WordTokenizer().tokenize(s) + >>> WordTokenizer().span_tokenize(s) [TextToken(chars='population', position=0, length=10), TextToken(chars='of', position=11, length=2), TextToken(chars='100,000', position=14, length=7)] >>> s = '''Hello|World''' - >>> WordTokenizer().tokenize(s) + >>> WordTokenizer().span_tokenize(s) [TextToken(chars='Hello', position=0, length=5), TextToken(chars='|', position=5, length=1), TextToken(chars='World', position=6, length=5)] >>> s2 = '"We beat some pretty good teams to get here," Slocum said.' - >>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE + >>> WordTokenizer().span_tokenize(s2) # doctest: +NORMALIZE_WHITESPACE [TextToken(chars='``', position=0, length=1), TextToken(chars='We', position=1, length=2), TextToken(chars='beat', position=4, length=4), @@ -65,7 +65,7 @@ class WordTokenizer(object): ... 
cliche-ridden, \"Touched by an ... Angel\" (a show creator John Masius ... worked on) wanna-be if she didn't.''' - >>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE + >>> WordTokenizer().span_tokenize(s3) # doctest: +NORMALIZE_WHITESPACE [TextToken(chars='Well', position=0, length=4), TextToken(chars=',', position=4, length=1), TextToken(chars='we', position=6, length=2), @@ -97,28 +97,28 @@ class WordTokenizer(object): TextToken(chars="didn't", position=133, length=6), TextToken(chars='.', position=139, length=1)] - >>> WordTokenizer().tokenize('"') + >>> WordTokenizer().span_tokenize('"') [TextToken(chars='``', position=0, length=1)] - >>> WordTokenizer().tokenize('" a') + >>> WordTokenizer().span_tokenize('" a') [TextToken(chars='``', position=0, length=1), TextToken(chars='a', position=2, length=1)] Some issues: - >>> WordTokenizer().tokenize("Phone:855-349-1914") # doctest: +SKIP + >>> WordTokenizer().span_tokenize("Phone:855-349-1914") # doctest: +SKIP ['Phone', ':', '855-349-1914'] - >>> WordTokenizer().tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP + >>> WordTokenizer().span_tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP ['Copyright', '\xc2\xa9', '2014', 'Wall', 'Decor', 'and', 'Home', 'Accents', '.', 'All', 'Rights', 'Reserved', '.'] - >>> WordTokenizer().tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP + >>> WordTokenizer().span_tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP ['Powai', 'Campus', ',', 'Mumbai", "-", "400077'] - >>> WordTokenizer().tokenize("1 5858/ 1800") # doctest: +SKIP + >>> WordTokenizer().span_tokenize("1 5858/ 1800") # doctest: +SKIP ['1', '5858', '/', '1800'] - >>> WordTokenizer().tokenize("Saudi Arabia-") # doctest: +SKIP + >>> WordTokenizer().span_tokenize("Saudi Arabia-") # doctest: +SKIP ['Saudi', 'Arabia', '-'] """ @@ -140,17 +140,17 @@ class WordTokenizer(object): open_quotes = re.compile(r'(^|[\s(\[{<])"') - def _tokenize(self, text): + def _span_tokenize(self, text): # this one cannot be placed in the loop because it requires # position check (beginning of the string) or previous char value quote = self.open_quotes.search(text) if quote is not None: end = quote.end() - 1 - for t in self._tokenize(text[:end]): + for t in self._span_tokenize(text[:end]): yield t yield TextToken(chars='``', position=end, length=1) shift = end + 1 - for t in self._tokenize(text[shift:]): + for t in self._span_tokenize(text[shift:]): yield TextToken(chars=t.chars, position=t.position + shift, length=t.length) @@ -185,13 +185,16 @@ def _tokenize(self, text): break i += shift + def span_tokenize(self, text): + return [t for t in self._span_tokenize(text) if t.chars] + def tokenize(self, text): - return [t for t in self._tokenize(text) if t.chars] + return [t.chars for t in self.span_tokenize(text)] class DefaultTokenizer(WordTokenizer): - def tokenize(self, text): - tokens = super(DefaultTokenizer, self).tokenize(text) + def span_tokenize(self, text): + tokens = super(DefaultTokenizer, self).span_tokenize(text) # remove standalone commas and semicolons # as they broke tag sets, # e.g. 
PERSON->FUNCTION in case "PERSON, FUNCTION" @@ -205,4 +208,4 @@ def tokenize(self, text): return [t for t in tokens if t.chars not in {',', ';'}] -tokenize = DefaultTokenizer().tokenize +tokenize = DefaultTokenizer().span_tokenize From 35a9d88214796172caf3970bff77a650b2c8f784 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 12:42:14 +0000 Subject: [PATCH 22/31] function to reduce code in tests --- webstruct/sequence_encoding.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/webstruct/sequence_encoding.py b/webstruct/sequence_encoding.py index ff6d8bf..5b55752 100644 --- a/webstruct/sequence_encoding.py +++ b/webstruct/sequence_encoding.py @@ -11,13 +11,13 @@ class IobEncoder(object): >>> iob_encoder = IobEncoder() >>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"] - >>> [p for p in IobEncoder.from_indices(iob_encoder.encode(input_tokens), input_tokens)] + >>> def encode(encoder, tokens): return [p for p in IobEncoder.from_indices(encoder.encode(tokens), tokens)] + >>> encode(iob_encoder, input_tokens) [('John', 'B-PER'), ('said', 'O')] >>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"] - >>> tokens = iob_encoder.encode(input_tokens) - >>> tokens = [p for p in IobEncoder.from_indices(tokens, input_tokens)] + >>> tokens = encode(iob_encoder, input_tokens) >>> tokens, tags = iob_encoder.split(tokens) >>> tokens, tags (['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O']) @@ -27,14 +27,10 @@ class IobEncoder(object): >>> iob_encoder = IobEncoder() >>> input_tokens_partial = ["__START_PER__", "John"] - >>> tokens = iob_encoder.encode(input_tokens_partial) - >>> tokens = [p for p in IobEncoder.from_indices(tokens, input_tokens_partial)] - >>> tokens + >>> encode(iob_encoder, input_tokens_partial) [('John', 'B-PER')] >>> input_tokens_partial = ["Mayer", "__END_PER__", "said"] - >>> tokens = iob_encoder.encode(input_tokens_partial) - >>> tokens = [p for p in IobEncoder.from_indices(tokens, input_tokens_partial)] - >>> tokens + >>> encode(iob_encoder, input_tokens_partial) [('Mayer', 'I-PER'), ('said', 'O')] To reset internal state, use ``reset method``:: @@ -43,7 +39,7 @@ class IobEncoder(object): Group results to entities:: - >>> iob_encoder.group([p for p in IobEncoder.from_indices(iob_encoder.encode(input_tokens), input_tokens)]) + >>> iob_encoder.group(encode(iob_encoder, input_tokens)) [(['hello'], 'O'), (['John', 'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')] Input token stream is processed by ``InputTokenProcessor()`` by default; From 90331883d5d1da6b78db546ced71801bb3be423d Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 14:02:18 +0000 Subject: [PATCH 23/31] remove test for nltk tokenizer --- webstruct/text_tokenizers.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index a0fa4e1..6038614 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -10,10 +10,7 @@ class WordTokenizer(object): r"""This tokenizer is copy-pasted version of TreebankWordTokenizer that doesn't split on @ and ':' symbols and doesn't split contractions:: - >>> from nltk.tokenize.treebank import TreebankWordTokenizer # doctest: +SKIP >>> s = '''Good muffins cost $3.88\nin New York. 
Email: muffins@gmail.com''' - >>> TreebankWordTokenizer().span_tokenize(s) # doctest: +SKIP - ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com'] >>> WordTokenizer().span_tokenize(s) [TextToken(chars='Good', position=0, length=4), TextToken(chars='muffins', position=5, length=7), From c14f363e792ea7aea78407e830fe385f17ffc06c Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 14:09:15 +0000 Subject: [PATCH 24/31] test our behaviour, which difers from original treebank tokenizer --- webstruct/text_tokenizers.py | 45 +++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 6038614..67a5bc3 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -103,20 +103,37 @@ class WordTokenizer(object): Some issues: - >>> WordTokenizer().span_tokenize("Phone:855-349-1914") # doctest: +SKIP - ['Phone', ':', '855-349-1914'] - - >>> WordTokenizer().span_tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP - ['Copyright', '\xc2\xa9', '2014', 'Wall', 'Decor', 'and', 'Home', 'Accents', '.', 'All', 'Rights', 'Reserved', '.'] - - >>> WordTokenizer().span_tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP - ['Powai', 'Campus', ',', 'Mumbai", "-", "400077'] - - >>> WordTokenizer().span_tokenize("1 5858/ 1800") # doctest: +SKIP - ['1', '5858', '/', '1800'] - - >>> WordTokenizer().span_tokenize("Saudi Arabia-") # doctest: +SKIP - ['Saudi', 'Arabia', '-'] + >>> WordTokenizer().span_tokenize("Phone:855-349-1914") + [TextToken(chars='Phone:855-349-1914', position=0, length=18)] + + >>> WordTokenizer().span_tokenize("Copyright © 2014 Foo Bar and Buzz Spam. 
All Rights Reserved.") + [TextToken(chars='Copyright', position=0, length=9), + TextToken(chars=u'\xa9', position=10, length=1), + TextToken(chars='2014', position=12, length=4), + TextToken(chars='Foo', position=17, length=3), + TextToken(chars='Bar', position=21, length=3), + TextToken(chars='and', position=25, length=3), + TextToken(chars='Buzz', position=29, length=4), + TextToken(chars='Spam.', position=34, length=5), + TextToken(chars='All', position=40, length=3), + TextToken(chars='Rights', position=44, length=6), + TextToken(chars='Reserved', position=51, length=8), + TextToken(chars='.', position=59, length=1)] + + >>> WordTokenizer().span_tokenize("Powai Campus, Mumbai-400077") + [TextToken(chars='Powai', position=0, length=5), + TextToken(chars='Campus', position=6, length=6), + TextToken(chars=',', position=12, length=1), + TextToken(chars='Mumbai-400077', position=14, length=13)] + + >>> WordTokenizer().span_tokenize("1 5858/ 1800") + [TextToken(chars='1', position=0, length=1), + TextToken(chars='5858/', position=2, length=5), + TextToken(chars='1800', position=8, length=4)] + + >>> WordTokenizer().span_tokenize("Saudi Arabia-") + [TextToken(chars='Saudi', position=0, length=5), + TextToken(chars='Arabia-', position=6, length=7)] """ From a071cd49d014c57fb83e15c47d71ed3b8eb30040 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 14:10:02 +0000 Subject: [PATCH 25/31] remove useless conversion --- webstruct/html_tokenizer.py | 3 +-- webstruct/tests/test_html_tokenizer.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py index bcb65be..bcd4156 100644 --- a/webstruct/html_tokenizer.py +++ b/webstruct/html_tokenizer.py @@ -13,7 +13,6 @@ import copy from itertools import groupby from collections import namedtuple -import six from six.moves import zip from lxml.etree import Comment, iterwalk @@ -314,7 +313,7 @@ def _tokenize_and_split(self, text): text = text or '' input_tokens = [t for t in self.text_tokenize_func(text)] input_tokens = self._limit_tags(input_tokens) - input_tokens = [TextToken(chars=six.text_type(t.chars), + input_tokens = [TextToken(chars=t.chars, position=t.position, length=t.length) for t in input_tokens] chains = self.sequence_encoder.encode(t.chars for t in input_tokens) diff --git a/webstruct/tests/test_html_tokenizer.py b/webstruct/tests/test_html_tokenizer.py index fdd0c58..44420de 100644 --- a/webstruct/tests/test_html_tokenizer.py +++ b/webstruct/tests/test_html_tokenizer.py @@ -144,6 +144,22 @@ def test_detokenize_preserve_commas(self): """ + annotated_tree = HtmlLoader().loadbytes(annotated_html) + tokenizer = HtmlTokenizer() + html_tokens, tags = tokenizer.tokenize_single(annotated_tree) + clean_tree = tokenizer.cleanup_tree(annotated_tree) + html_tokens, _ = tokenizer.tokenize_single(clean_tree) + detokenized_tree = tokenizer.detokenize_single(html_tokens, tags) + self.assertHtmlTreeEqual(annotated_tree, detokenized_tree) + + def test_detokenize_handle_unicode(self): + annotated_html = bytes(u""" + + Δ __START_ORG__ hello __END_ORG__ a, b world + + """.encode('utf-8')) + + annotated_tree = HtmlLoader().loadbytes(annotated_html) tokenizer = HtmlTokenizer() html_tokens, tags = tokenizer.tokenize_single(annotated_tree) From a33f5649831487266be37f5f75caaf765b597b3f Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 14:20:29 +0000 Subject: [PATCH 26/31] rename method to avoid confusion with nltk tokenize_span method --- 
webstruct/text_tokenizers.py | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 67a5bc3..8325a4f 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -11,7 +11,7 @@ class WordTokenizer(object): that doesn't split on @ and ':' symbols and doesn't split contractions:: >>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com''' - >>> WordTokenizer().span_tokenize(s) + >>> WordTokenizer().segment_words(s) [TextToken(chars='Good', position=0, length=4), TextToken(chars='muffins', position=5, length=7), TextToken(chars='cost', position=13, length=4), @@ -24,25 +24,25 @@ class WordTokenizer(object): TextToken(chars='muffins@gmail.com', position=44, length=17)] >>> s = '''Shelbourne Road,''' - >>> WordTokenizer().span_tokenize(s) + >>> WordTokenizer().segment_words(s) [TextToken(chars='Shelbourne', position=0, length=10), TextToken(chars='Road', position=11, length=4), TextToken(chars=',', position=15, length=1)] >>> s = '''population of 100,000''' - >>> WordTokenizer().span_tokenize(s) + >>> WordTokenizer().segment_words(s) [TextToken(chars='population', position=0, length=10), TextToken(chars='of', position=11, length=2), TextToken(chars='100,000', position=14, length=7)] >>> s = '''Hello|World''' - >>> WordTokenizer().span_tokenize(s) + >>> WordTokenizer().segment_words(s) [TextToken(chars='Hello', position=0, length=5), TextToken(chars='|', position=5, length=1), TextToken(chars='World', position=6, length=5)] >>> s2 = '"We beat some pretty good teams to get here," Slocum said.' - >>> WordTokenizer().span_tokenize(s2) # doctest: +NORMALIZE_WHITESPACE + >>> WordTokenizer().segment_words(s2) # doctest: +NORMALIZE_WHITESPACE [TextToken(chars='``', position=0, length=1), TextToken(chars='We', position=1, length=2), TextToken(chars='beat', position=4, length=4), @@ -62,7 +62,7 @@ class WordTokenizer(object): ... cliche-ridden, \"Touched by an ... Angel\" (a show creator John Masius ... worked on) wanna-be if she didn't.''' - >>> WordTokenizer().span_tokenize(s3) # doctest: +NORMALIZE_WHITESPACE + >>> WordTokenizer().segment_words(s3) # doctest: +NORMALIZE_WHITESPACE [TextToken(chars='Well', position=0, length=4), TextToken(chars=',', position=4, length=1), TextToken(chars='we', position=6, length=2), @@ -94,19 +94,19 @@ class WordTokenizer(object): TextToken(chars="didn't", position=133, length=6), TextToken(chars='.', position=139, length=1)] - >>> WordTokenizer().span_tokenize('"') + >>> WordTokenizer().segment_words('"') [TextToken(chars='``', position=0, length=1)] - >>> WordTokenizer().span_tokenize('" a') + >>> WordTokenizer().segment_words('" a') [TextToken(chars='``', position=0, length=1), TextToken(chars='a', position=2, length=1)] Some issues: - >>> WordTokenizer().span_tokenize("Phone:855-349-1914") + >>> WordTokenizer().segment_words("Phone:855-349-1914") [TextToken(chars='Phone:855-349-1914', position=0, length=18)] - >>> WordTokenizer().span_tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") + >>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. 
All Rights Reserved.") [TextToken(chars='Copyright', position=0, length=9), TextToken(chars=u'\xa9', position=10, length=1), TextToken(chars='2014', position=12, length=4), @@ -120,18 +120,18 @@ class WordTokenizer(object): TextToken(chars='Reserved', position=51, length=8), TextToken(chars='.', position=59, length=1)] - >>> WordTokenizer().span_tokenize("Powai Campus, Mumbai-400077") + >>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077") [TextToken(chars='Powai', position=0, length=5), TextToken(chars='Campus', position=6, length=6), TextToken(chars=',', position=12, length=1), TextToken(chars='Mumbai-400077', position=14, length=13)] - >>> WordTokenizer().span_tokenize("1 5858/ 1800") + >>> WordTokenizer().segment_words("1 5858/ 1800") [TextToken(chars='1', position=0, length=1), TextToken(chars='5858/', position=2, length=5), TextToken(chars='1800', position=8, length=4)] - >>> WordTokenizer().span_tokenize("Saudi Arabia-") + >>> WordTokenizer().segment_words("Saudi Arabia-") [TextToken(chars='Saudi', position=0, length=5), TextToken(chars='Arabia-', position=6, length=7)] @@ -154,17 +154,17 @@ class WordTokenizer(object): open_quotes = re.compile(r'(^|[\s(\[{<])"') - def _span_tokenize(self, text): + def _segment_words(self, text): # this one cannot be placed in the loop because it requires # position check (beginning of the string) or previous char value quote = self.open_quotes.search(text) if quote is not None: end = quote.end() - 1 - for t in self._span_tokenize(text[:end]): + for t in self._segment_words(text[:end]): yield t yield TextToken(chars='``', position=end, length=1) shift = end + 1 - for t in self._span_tokenize(text[shift:]): + for t in self._segment_words(text[shift:]): yield TextToken(chars=t.chars, position=t.position + shift, length=t.length) @@ -199,16 +199,16 @@ def _span_tokenize(self, text): break i += shift - def span_tokenize(self, text): - return [t for t in self._span_tokenize(text) if t.chars] + def segment_words(self, text): + return [t for t in self._segment_words(text) if t.chars] def tokenize(self, text): - return [t.chars for t in self.span_tokenize(text)] + return [t.chars for t in self.segment_words(text)] class DefaultTokenizer(WordTokenizer): - def span_tokenize(self, text): - tokens = super(DefaultTokenizer, self).span_tokenize(text) + def segment_words(self, text): + tokens = super(DefaultTokenizer, self).segment_words(text) # remove standalone commas and semicolons # as they broke tag sets, # e.g. 
PERSON->FUNCTION in case "PERSON, FUNCTION" @@ -222,4 +222,4 @@ def span_tokenize(self, text): return [t for t in tokens if t.chars not in {',', ';'}] -tokenize = DefaultTokenizer().span_tokenize +tokenize = DefaultTokenizer().segment_words From 75a96981826350d34ba4946003889774ef4daff4 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 16:06:11 +0000 Subject: [PATCH 27/31] remove brittle tests --- webstruct/text_tokenizers.py | 35 ----------------------------------- 1 file changed, 35 deletions(-) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 8325a4f..1c8eba0 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -100,41 +100,6 @@ class WordTokenizer(object): >>> WordTokenizer().segment_words('" a') [TextToken(chars='``', position=0, length=1), TextToken(chars='a', position=2, length=1)] - - Some issues: - - >>> WordTokenizer().segment_words("Phone:855-349-1914") - [TextToken(chars='Phone:855-349-1914', position=0, length=18)] - - >>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") - [TextToken(chars='Copyright', position=0, length=9), - TextToken(chars=u'\xa9', position=10, length=1), - TextToken(chars='2014', position=12, length=4), - TextToken(chars='Foo', position=17, length=3), - TextToken(chars='Bar', position=21, length=3), - TextToken(chars='and', position=25, length=3), - TextToken(chars='Buzz', position=29, length=4), - TextToken(chars='Spam.', position=34, length=5), - TextToken(chars='All', position=40, length=3), - TextToken(chars='Rights', position=44, length=6), - TextToken(chars='Reserved', position=51, length=8), - TextToken(chars='.', position=59, length=1)] - - >>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077") - [TextToken(chars='Powai', position=0, length=5), - TextToken(chars='Campus', position=6, length=6), - TextToken(chars=',', position=12, length=1), - TextToken(chars='Mumbai-400077', position=14, length=13)] - - >>> WordTokenizer().segment_words("1 5858/ 1800") - [TextToken(chars='1', position=0, length=1), - TextToken(chars='5858/', position=2, length=5), - TextToken(chars='1800', position=8, length=4)] - - >>> WordTokenizer().segment_words("Saudi Arabia-") - [TextToken(chars='Saudi', position=0, length=5), - TextToken(chars='Arabia-', position=6, length=7)] - """ # regex, token From 47293231ab1a4aa090abfae962456fe9a3df7f2c Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Fri, 29 Sep 2017 16:42:43 +0000 Subject: [PATCH 28/31] small benchmark for html tokenizer --- webstruct/html_tokenizer_benchmark.py | 34 +++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 webstruct/html_tokenizer_benchmark.py diff --git a/webstruct/html_tokenizer_benchmark.py b/webstruct/html_tokenizer_benchmark.py new file mode 100644 index 0000000..c7bd17a --- /dev/null +++ b/webstruct/html_tokenizer_benchmark.py @@ -0,0 +1,34 @@ +import os.path +import glob +import timeit +import functools + +import webstruct.webannotator +import webstruct.html_tokenizer + +def load_trees(tokenizer, trees): + for tree in trees: + tokenizer.tokenize_single(tree) + +def main(): + path = os.path.join(os.path.dirname(__file__) , + ".." 
, + "webstruct_data", + "corpus/business_pages/wa/*.html") + + paths = sorted(glob.glob(path)) + + with open(paths[0], 'rb') as sample_reader: + colors = webstruct.webannotator.EntityColors.from_htmlbytes(sample_reader.read()) + entities = [typ for typ in colors] + + loader = webstruct.WebAnnotatorLoader(known_entities=entities) + + trees = [loader.load(p) for p in paths] + tokenizer = webstruct.html_tokenizer.HtmlTokenizer() + print(timeit.timeit(functools.partial(load_trees, tokenizer, trees), + setup='gc.enable()', + number=3)) + +if __name__ == "__main__": + main() From 943a44ef402cb68ae7d56f7d7b83b619156b9299 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Mon, 2 Oct 2017 12:04:42 +0000 Subject: [PATCH 29/31] Revert "remove brittle tests" This reverts commit 75a96981826350d34ba4946003889774ef4daff4. --- webstruct/text_tokenizers.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py index 1c8eba0..8325a4f 100644 --- a/webstruct/text_tokenizers.py +++ b/webstruct/text_tokenizers.py @@ -100,6 +100,41 @@ class WordTokenizer(object): >>> WordTokenizer().segment_words('" a') [TextToken(chars='``', position=0, length=1), TextToken(chars='a', position=2, length=1)] + + Some issues: + + >>> WordTokenizer().segment_words("Phone:855-349-1914") + [TextToken(chars='Phone:855-349-1914', position=0, length=18)] + + >>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") + [TextToken(chars='Copyright', position=0, length=9), + TextToken(chars=u'\xa9', position=10, length=1), + TextToken(chars='2014', position=12, length=4), + TextToken(chars='Foo', position=17, length=3), + TextToken(chars='Bar', position=21, length=3), + TextToken(chars='and', position=25, length=3), + TextToken(chars='Buzz', position=29, length=4), + TextToken(chars='Spam.', position=34, length=5), + TextToken(chars='All', position=40, length=3), + TextToken(chars='Rights', position=44, length=6), + TextToken(chars='Reserved', position=51, length=8), + TextToken(chars='.', position=59, length=1)] + + >>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077") + [TextToken(chars='Powai', position=0, length=5), + TextToken(chars='Campus', position=6, length=6), + TextToken(chars=',', position=12, length=1), + TextToken(chars='Mumbai-400077', position=14, length=13)] + + >>> WordTokenizer().segment_words("1 5858/ 1800") + [TextToken(chars='1', position=0, length=1), + TextToken(chars='5858/', position=2, length=5), + TextToken(chars='1800', position=8, length=4)] + + >>> WordTokenizer().segment_words("Saudi Arabia-") + [TextToken(chars='Saudi', position=0, length=5), + TextToken(chars='Arabia-', position=6, length=7)] + """ # regex, token From ba7d6fe0b98e5b5d15e1cd2eedafc11f35d389c1 Mon Sep 17 00:00:00 2001 From: Vostretsov Nikita Date: Mon, 2 Oct 2017 13:15:24 +0000 Subject: [PATCH 30/31] move brittle tests to pytest xfail --- webstruct/tests/test_text_tokenizer.py | 42 ++++++++++++++++++++++++++ webstruct/text_tokenizers.py | 18 ----------- 2 files changed, 42 insertions(+), 18 deletions(-) create mode 100644 webstruct/tests/test_text_tokenizer.py diff --git a/webstruct/tests/test_text_tokenizer.py b/webstruct/tests/test_text_tokenizer.py new file mode 100644 index 0000000..f65f036 --- /dev/null +++ b/webstruct/tests/test_text_tokenizer.py @@ -0,0 +1,42 @@ +import unittest +import pytest + +from webstruct.text_tokenizers import TextToken, WordTokenizer + +class 
TestTokenizerTest(unittest.TestCase):
+    def do_tokenize(self, text, result):
+        self.assertEqual(result, WordTokenizer().segment_words(text))
+
+    @pytest.mark.xfail
+    def test_phone(self):
+        return self.do_tokenize(
+            "Phone:855-349-1914",
+            [TextToken(chars='Phone:855-349-1914', position=0, length=18)]
+        )
+
+    @pytest.mark.xfail
+    def test_hyphen_mid(self):
+        return self.do_tokenize(
+            "Powai Campus, Mumbai-400077",
+            [TextToken(chars='Powai', position=0, length=5),
+             TextToken(chars='Campus', position=6, length=6),
+             TextToken(chars=',', position=12, length=1),
+             TextToken(chars='Mumbai-400077', position=14, length=13)]
+        )
+
+    @pytest.mark.xfail
+    def test_hyphen_end(self):
+        return self.do_tokenize(
+            "Saudi Arabia-",
+            [TextToken(chars='Saudi', position=0, length=5),
+             TextToken(chars='Arabia-', position=6, length=7)]
+        )
+
+    @pytest.mark.xfail
+    def test_slash(self):
+        return self.do_tokenize(
+            "1 5858/ 1800",
+            [TextToken(chars='1', position=0, length=1),
+             TextToken(chars='5858/', position=2, length=5),
+             TextToken(chars='1800', position=8, length=4)]
+        )
diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py
index 8325a4f..fd1c72c 100644
--- a/webstruct/text_tokenizers.py
+++ b/webstruct/text_tokenizers.py
@@ -103,9 +103,6 @@ class WordTokenizer(object):
 
     Some issues:
 
-    >>> WordTokenizer().segment_words("Phone:855-349-1914")
-    [TextToken(chars='Phone:855-349-1914', position=0, length=18)]
-
     >>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")
     [TextToken(chars='Copyright', position=0, length=9),
      TextToken(chars=u'\xa9', position=10, length=1),
@@ -120,18 +117,6 @@ class WordTokenizer(object):
      TextToken(chars='Reserved', position=51, length=8),
      TextToken(chars='.', position=59, length=1)]
 
-    >>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077")
-    [TextToken(chars='Powai', position=0, length=5),
-     TextToken(chars='Campus', position=6, length=6),
-     TextToken(chars=',', position=12, length=1),
-     TextToken(chars='Mumbai-400077', position=14, length=13)]
-
-    >>> WordTokenizer().segment_words("1 5858/ 1800")
-    [TextToken(chars='1', position=0, length=1),
-     TextToken(chars='5858/', position=2, length=5),
-     TextToken(chars='1800', position=8, length=4)]
-
-    >>> WordTokenizer().segment_words("Saudi Arabia-")
-    [TextToken(chars='Saudi', position=0, length=5),
-     TextToken(chars='Arabia-', position=6, length=7)]
-
     """
 
     # regex, token

From b72bcc12c402710c6ed2b06951243c0e4aa5e67b Mon Sep 17 00:00:00 2001
From: Vostretsov Nikita
Date: Mon, 2 Oct 2017 14:57:01 +0000
Subject: [PATCH 31/31] expect behaviour of nltk tokenizer

---
 webstruct/tests/test_text_tokenizer.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/webstruct/tests/test_text_tokenizer.py b/webstruct/tests/test_text_tokenizer.py
index f65f036..7427bfc 100644
--- a/webstruct/tests/test_text_tokenizer.py
+++ b/webstruct/tests/test_text_tokenizer.py
@@ -11,7 +11,9 @@ def do_tokenize(self, text, result):
     def test_phone(self):
         return self.do_tokenize(
             "Phone:855-349-1914",
-            [TextToken(chars='Phone:855-349-1914', position=0, length=18)]
+            [TextToken(chars='Phone', position=0, length=5),
+             TextToken(chars=':', position=5, length=1),
+             TextToken(chars='855-349-1914', position=6, length=12)]
         )
 
     @pytest.mark.xfail
@@ -21,7 +23,9 @@ def test_hyphen_mid(self):
             [TextToken(chars='Powai', position=0, length=5),
              TextToken(chars='Campus', position=6, length=6),
              TextToken(chars=',', position=12, length=1),
-             TextToken(chars='Mumbai-400077', 
position=14, length=13)]
+             TextToken(chars='Mumbai', position=14, length=6),
+             TextToken(chars='-', position=20, length=1),
+             TextToken(chars='400077', position=21, length=6)]
         )
 
     @pytest.mark.xfail
@@ -29,7 +33,8 @@ def test_hyphen_end(self):
         return self.do_tokenize(
             "Saudi Arabia-",
             [TextToken(chars='Saudi', position=0, length=5),
-             TextToken(chars='Arabia-', position=6, length=7)]
+             TextToken(chars='Arabia', position=6, length=6),
+             TextToken(chars='-', position=12, length=1)]
         )
 
     @pytest.mark.xfail
@@ -37,6 +42,7 @@ def test_slash(self):
         return self.do_tokenize(
             "1 5858/ 1800",
             [TextToken(chars='1', position=0, length=1),
-             TextToken(chars='5858/', position=2, length=5),
+             TextToken(chars='5858', position=2, length=4),
+             TextToken(chars='/', position=6, length=1),
              TextToken(chars='1800', position=8, length=4)]
         )
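
Taken together, the series makes the text tokenizers return TextToken
namedtuples (chars, position, length) instead of plain strings, so every
token can be traced back to its offset in the source text. A minimal usage
sketch, assuming a webstruct checkout with these patches applied (the sample
text and variable names below are illustrative only):

    from webstruct.text_tokenizers import WordTokenizer

    text = "Good muffins cost $3.88 in New York."
    tokenizer = WordTokenizer()

    for token in tokenizer.segment_words(text):
        # position/length index into the original string, so this slice
        # reproduces token.chars for every token that is not rewritten
        # during tokenization (an opening quote, for example, comes back
        # as '``' and no longer matches the slice).
        span = text[token.position:token.position + token.length]
        print(token.chars, token.position, token.length, span)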