diff --git a/webstruct/html_tokenizer.py b/webstruct/html_tokenizer.py
index 5140093..bcd4156 100644
--- a/webstruct/html_tokenizer.py
+++ b/webstruct/html_tokenizer.py
@@ -13,21 +13,24 @@
import copy
from itertools import groupby
from collections import namedtuple
-import six
from six.moves import zip
-from lxml.etree import XPathEvaluator, Comment
+from lxml.etree import Comment, iterwalk
from webstruct.sequence_encoding import IobEncoder
-from webstruct.text_tokenizers import tokenize
+from webstruct.text_tokenizers import tokenize, TextToken
from webstruct.utils import (
replace_html_tags,
kill_html_tags,
- smart_join,
)
-_HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail')
+_HtmlToken = namedtuple('HtmlToken', ['index',
+ 'tokens',
+ 'elem',
+ 'is_tail',
+ 'position',
+ 'length'])
class HtmlToken(_HtmlToken):
@@ -41,6 +44,8 @@ class HtmlToken(_HtmlToken):
* :attr:`elem` is the current html block (as lxml's Element) - most
likely you want :attr:`parent` instead of it
* :attr:`is_tail` flag indicates that token belongs to element tail
+ * :attr:`position` is the logical position (in characters, i.e. codepoints) of the token start in the parent text
+ * :attr:`length` is the logical length (in characters, i.e. codepoints) of the token in the parent text
Computed properties:
@@ -64,8 +69,10 @@ def root(self):
return self.elem.getroottree()
def __repr__(self):
- return "HtmlToken(token=%r, parent=%r, index=%s)" % (
- self.token, self.parent, self.index
+ return ("HtmlToken("
+ "token=%r, parent=%r, index=%s, position=%d, length=%d"
+ ")") % (
+ self.token, self.parent, self.index, self.position, self.length
)
@@ -85,7 +92,8 @@ class HtmlTokenizer(object):
----------
tagset : set, optional
- A set of entity types to keep. If not passed, all entity types are kept.
+ A set of entity types to keep.
+ If not passed, all entity types are kept.
Use this argument to discard some entity types from training data.
sequence_encoder : object, optional
Sequence encoder object. If not passed,
@@ -142,7 +150,7 @@ def tokenize_single(self, tree):
>>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>")
>>> html_tokens, tags = html_tokenizer.tokenize_single(tree)
>>> html_tokens
- [HtmlToken(token='hello', parent=<Element p at ...>, index=0), HtmlToken...]
+ [HtmlToken(token='hello', parent=<Element p at ...>, index=0, ...), HtmlToken...]
>>> tags
['O', 'B-PER', 'I-PER', 'B-PER', 'O']
>>> for tok, iob_tag in zip(html_tokens, tags):
@@ -180,6 +188,8 @@ def detokenize_single(self, html_tokens, tags):
Build annotated ``lxml.etree.ElementTree`` from
``html_tokens`` (a list of :class:`.HtmlToken` instances)
and ``tags`` (a list of their tags).
+ **ATTENTION**: ``html_tokens`` must be tokenized from a tree
+ without annotation tags (see :meth:`cleanup_tree`).
Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__``
text tokens (this is the format :mod:`webstruct.loaders` use).
@@ -190,9 +200,7 @@ def detokenize_single(self, html_tokens, tags):
if not html_tokens:
return None
- orig_tree = html_tokens[0].root
- tree = copy.deepcopy(orig_tree)
- xpatheval = XPathEvaluator(tree)
+ tree = html_tokens[0].root
# find starts/ends of token groups
token_groups = self.sequence_encoder.group(zip(html_tokens, tags))
@@ -206,30 +214,49 @@ def detokenize_single(self, html_tokens, tags):
pos += n_tokens
# mark starts/ends with special tokens
- data = zip(html_tokens, tags, range(len(html_tokens)))
- keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail)
+ data = [(s, True) for s in starts]
+ data.extend((s, False) for s in ends)
+ keyfunc = lambda rec: (id(html_tokens[rec[0]].elem), html_tokens[rec[0]].is_tail)
+ data.sort(key=keyfunc)
- for (orig_elem, is_tail), g in groupby(data, keyfunc):
+ for (_, is_tail), g in groupby(data, keyfunc):
g = list(g)
- fix = False
- tokens = g[0][0].tokens[:]
- for token, tag, token_idx in g:
- if token_idx in starts:
- text = ' __START_%s__ %s' % (tag[2:], tokens[token.index])
- tokens[token.index] = text
- fix = True
- if token_idx in ends:
- text = '%s __END_%s__ ' % (tokens[token.index], tag[2:])
- tokens[token.index] = text
- fix = True
-
- if fix:
- xpath = orig_tree.getpath(orig_elem)
- elem = xpatheval(xpath)[0]
- if is_tail:
- elem.tail = smart_join(tokens)
+ g.sort(key=lambda t: (html_tokens[t[0]].position, not t[1]))
+
+ if not g:
+ continue
+
+ elem = html_tokens[g[0][0]].elem
+
+ pos_in_source = 0
+ source = elem.text
+ if is_tail:
+ source = elem.tail
+
+ mods = list()
+
+ for idx, is_starts in g:
+ token = html_tokens[idx]
+ tag = tags[idx]
+ mods.append(source[pos_in_source:token.position])
+ pos_in_source = token.position
+ if is_starts:
+ patch = ' __START_%s__ ' % (tag[2:],)
+ mods.append(patch)
else:
- elem.text = smart_join(tokens)
+ end_in_source = pos_in_source + token.length
+ mods.append(source[pos_in_source:end_in_source])
+ pos_in_source = pos_in_source + token.length
+ patch = ' __END_%s__ ' % (tag[2:],)
+ mods.append(patch)
+
+ mods.append(source[pos_in_source:])
+ modded = ''.join(mods)
+
+ if is_tail:
+ elem.tail = modded
+ else:
+ elem.text = modded
return tree
@@ -245,18 +272,35 @@ def _process_tree(self, tree):
return
head_tokens, head_tags = self._tokenize_and_split(tree.text)
+ char_tokens = [t.chars for t in head_tokens]
for index, (token, tag) in enumerate(zip(head_tokens, head_tags)):
- yield HtmlToken(index, head_tokens, tree, False), tag
+ yield HtmlToken(index,
+ char_tokens,
+ tree,
+ False,
+ token.position,
+ token.length), tag
for child in tree: # where is my precious "yield from"?
for html_token, tag in self._process_tree(child):
yield html_token, tag
tail_tokens, tail_tags = self._tokenize_and_split(tree.tail)
+ char_tokens = [t.chars for t in tail_tokens]
for index, (token, tag) in enumerate(zip(tail_tokens, tail_tags)):
- yield HtmlToken(index, tail_tokens, tree, True), tag
+ yield HtmlToken(index,
+ char_tokens,
+ tree,
+ True,
+ token.position,
+ token.length), tag
- self._cleanup_elem(tree)
+
+ def cleanup_tree(self, tree):
+ cleaned = copy.deepcopy(tree)
+ for _, elem in iterwalk(cleaned):
+ self._cleanup_elem(elem)
+ return cleaned
def _cleanup_elem(self, elem):
""" Remove special tokens from elem """
@@ -266,16 +310,23 @@ def _cleanup_elem(self, elem):
elem.tail = self._tag_re.sub("", elem.tail)
def _tokenize_and_split(self, text):
- input_tokens = self._limit_tags(self.text_tokenize_func(text or ''))
- input_tokens = map(six.text_type, input_tokens)
- return self.sequence_encoder.encode_split(input_tokens)
+ text = text or ''
+ input_tokens = [t for t in self.text_tokenize_func(text)]
+ input_tokens = self._limit_tags(input_tokens)
+ input_tokens = [TextToken(chars=t.chars,
+ position=t.position,
+ length=t.length) for t in input_tokens]
+ chains = self.sequence_encoder.encode(t.chars for t in input_tokens)
+ chains = self.sequence_encoder.from_indices(chains, input_tokens)
+ chains = [l for l in chains]
+ return self.sequence_encoder.split(chains)
def _limit_tags(self, input_tokens):
if self.tagset is None:
return input_tokens
proc = self.sequence_encoder.token_processor
- token_classes = [proc.classify(tok) for tok in input_tokens]
+ token_classes = [proc.classify(tok.chars) for tok in input_tokens]
return [
tok for (tok, (typ, value)) in zip(input_tokens, token_classes)
if not (typ in {'start', 'end'} and value not in self.tagset)
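
A minimal sketch of the position-aware round trip introduced above, mirroring the new test_detokenize_preserve_commas test (the sample markup below is illustrative, not taken from the patch): the annotated tree supplies the tags, cleanup_tree() yields an untagged copy, and detokenize_single() writes the markers back at the recorded token offsets.

from webstruct.loaders import HtmlLoader
from webstruct.html_tokenizer import HtmlTokenizer

html = b"<p> __START_ORG__ hello __END_ORG__ a, b world</p>"
tokenizer = HtmlTokenizer()

annotated_tree = HtmlLoader().loadbytes(html)
_, tags = tokenizer.tokenize_single(annotated_tree)     # tags come from the annotated tree

clean_tree = tokenizer.cleanup_tree(annotated_tree)     # deep copy without __START/__END tokens
html_tokens, _ = tokenizer.tokenize_single(clean_tree)  # tokens carry .position and .length

# __START_ORG__/__END_ORG__ are re-inserted into elem.text/.tail at the stored offsets,
# so surrounding punctuation (the comma in "a, b") survives the round trip
detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
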
diff --git a/webstruct/html_tokenizer_benchmark.py b/webstruct/html_tokenizer_benchmark.py
new file mode 100644
index 0000000..c7bd17a
--- /dev/null
+++ b/webstruct/html_tokenizer_benchmark.py
@@ -0,0 +1,34 @@
+import os.path
+import glob
+import timeit
+import functools
+
+import webstruct.webannotator
+import webstruct.html_tokenizer
+
+def load_trees(tokenizer, trees):
+ for tree in trees:
+ tokenizer.tokenize_single(tree)
+
+def main():
+ path = os.path.join(os.path.dirname(__file__),
+ "..",
+ "webstruct_data",
+ "corpus/business_pages/wa/*.html")
+
+ paths = sorted(glob.glob(path))
+
+ with open(paths[0], 'rb') as sample_reader:
+ colors = webstruct.webannotator.EntityColors.from_htmlbytes(sample_reader.read())
+ entities = [typ for typ in colors]
+
+ loader = webstruct.WebAnnotatorLoader(known_entities=entities)
+
+ trees = [loader.load(p) for p in paths]
+ tokenizer = webstruct.html_tokenizer.HtmlTokenizer()
+ print(timeit.timeit(functools.partial(load_trees, tokenizer, trees),
+ setup='gc.enable()',
+ number=3))
+
+if __name__ == "__main__":
+ main()
diff --git a/webstruct/sequence_encoding.py b/webstruct/sequence_encoding.py
index ba212ef..5b55752 100644
--- a/webstruct/sequence_encoding.py
+++ b/webstruct/sequence_encoding.py
@@ -11,13 +11,14 @@ class IobEncoder(object):
>>> iob_encoder = IobEncoder()
>>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"]
- >>> iob_encoder.encode(input_tokens)
+ >>> def encode(encoder, tokens): return [p for p in IobEncoder.from_indices(encoder.encode(tokens), tokens)]
+ >>> encode(iob_encoder, input_tokens)
[('John', 'B-PER'), ('said', 'O')]
- Get the result in another format using ``encode_split`` method::
>>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"]
- >>> tokens, tags = iob_encoder.encode_split(input_tokens)
+ >>> tokens = encode(iob_encoder, input_tokens)
+ >>> tokens, tags = iob_encoder.split(tokens)
>>> tokens, tags
(['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O'])
@@ -25,9 +26,11 @@ class IobEncoder(object):
stream and continue the encoding later::
>>> iob_encoder = IobEncoder()
- >>> iob_encoder.encode(["__START_PER__", "John"])
+ >>> input_tokens_partial = ["__START_PER__", "John"]
+ >>> encode(iob_encoder, input_tokens_partial)
[('John', 'B-PER')]
- >>> iob_encoder.encode(["Mayer", "__END_PER__", "said"])
+ >>> input_tokens_partial = ["Mayer", "__END_PER__", "said"]
+ >>> encode(iob_encoder, input_tokens_partial)
[('Mayer', 'I-PER'), ('said', 'O')]
To reset internal state, use ``reset method``::
@@ -36,7 +39,7 @@ class IobEncoder(object):
Group results to entities::
- >>> iob_encoder.group(iob_encoder.encode(input_tokens))
+ >>> iob_encoder.group(encode(iob_encoder, input_tokens))
[(['hello'], 'O'), (['John', 'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')]
Input token stream is processed by ``InputTokenProcessor()`` by default;
@@ -53,7 +56,7 @@ def reset(self):
self.tag = 'O'
def iter_encode(self, input_tokens):
- for token in input_tokens:
+ for number, token in enumerate(input_tokens):
token_type, value = self.token_processor.classify(token)
if token_type == 'start':
@@ -68,7 +71,7 @@ def iter_encode(self, input_tokens):
self.tag = "O"
elif token_type == 'token':
- yield token, self.tag
+ yield number, self.tag
if self.tag[0] == 'B':
self.tag = "I" + self.tag[1:]
@@ -81,13 +84,14 @@ def iter_encode(self, input_tokens):
def encode(self, input_tokens):
return list(self.iter_encode(input_tokens))
- def encode_split(self, input_tokens):
- """ The same as ``encode``, but returns ``(tokens, tags)`` tuple """
- res = self.encode(input_tokens)
- if not res:
- return (), ()
- tokens, tags = zip(*res)
- return list(tokens), list(tags)
+ def split(self, tokens):
+ """ split ``[(token, tag)]`` to ``([token], [tags])`` tuple """
+ return [t[0] for t in tokens], [t[1] for t in tokens]
+
+ @classmethod
+ def from_indices(cls, indices, input_tokens):
+ for idx, tag in indices:
+ yield input_tokens[idx], tag
@classmethod
def group(cls, data, strict=False):
@@ -186,4 +190,3 @@ def classify(self, token):
# regular token
return 'token', token
-
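
For reference, the reworked IobEncoder flow in one piece, a sketch based on the doctests and methods above: encode() now yields (token_index, tag) pairs, from_indices() maps them back onto the original tokens, and split() separates tokens from tags.

from webstruct.sequence_encoding import IobEncoder

iob_encoder = IobEncoder()
input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "said"]

indexed = iob_encoder.encode(input_tokens)   # [(index into input_tokens, tag), ...]
pairs = list(IobEncoder.from_indices(indexed, input_tokens))
tokens, tags = iob_encoder.split(pairs)
# tokens == ['hello', 'John', 'Doe', 'said']
# tags   == ['O', 'B-PER', 'I-PER', 'O']
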
diff --git a/webstruct/tests/test_html_tokenizer.py b/webstruct/tests/test_html_tokenizer.py
index f386606..44420de 100644
--- a/webstruct/tests/test_html_tokenizer.py
+++ b/webstruct/tests/test_html_tokenizer.py
@@ -69,8 +69,6 @@ def assertTokenizationWorks(self, tree):
[u'B-ORG', u'I-ORG', 'O', 'O', 'O', 'O', u'B-CITY']
)
- tree = html_tokens[0].root
- self.assertNotIn(b'__', tostring(tree))
def test_tokenize_single(self):
self.assertTokenizationWorks(self._load())
@@ -84,7 +82,7 @@ def test_detokenize_single(self):
tokenizer = HtmlTokenizer()
html_tokens, tags = tokenizer.tokenize_single(src_tree)
- new_tree = html_tokens[0].root
+ new_tree = tokenizer.cleanup_tree(src_tree)
self.assertIn(b'__START_ORG__', tostring(src_tree))
self.assertNotIn(b'__START_ORG__', tostring(new_tree))
@@ -93,6 +91,7 @@ def test_detokenize_single(self):
html_document_fromstring(UNANNOTATED_HTML)
)
+ html_tokens, _ = tokenizer.tokenize_single(new_tree)
detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
self.assertIn(b'__START_ORG__', tostring(detokenized_tree))
@@ -137,3 +136,34 @@ def test_tokenize_scripts_and_styles(self):
# and restores the tree if needed
detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
self.assertHtmlTreeEqual(tree, detokenized_tree)
+
+ def test_detokenize_preserve_commas(self):
+ annotated_html = b"""
+
+ __START_ORG__ hello __END_ORG__ a, b world
+
+ """
+
+ annotated_tree = HtmlLoader().loadbytes(annotated_html)
+ tokenizer = HtmlTokenizer()
+ html_tokens, tags = tokenizer.tokenize_single(annotated_tree)
+ clean_tree = tokenizer.cleanup_tree(annotated_tree)
+ html_tokens, _ = tokenizer.tokenize_single(clean_tree)
+ detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
+ self.assertHtmlTreeEqual(annotated_tree, detokenized_tree)
+
+ def test_detokenize_handle_unicode(self):
+ annotated_html = bytes(u"""
+
+ Δ __START_ORG__ hello __END_ORG__ a, b world
+
+ """.encode('utf-8'))
+
+
+ annotated_tree = HtmlLoader().loadbytes(annotated_html)
+ tokenizer = HtmlTokenizer()
+ html_tokens, tags = tokenizer.tokenize_single(annotated_tree)
+ clean_tree = tokenizer.cleanup_tree(annotated_tree)
+ html_tokens, _ = tokenizer.tokenize_single(clean_tree)
+ detokenized_tree = tokenizer.detokenize_single(html_tokens, tags)
+ self.assertHtmlTreeEqual(annotated_tree, detokenized_tree)
diff --git a/webstruct/tests/test_text_tokenizer.py b/webstruct/tests/test_text_tokenizer.py
new file mode 100644
index 0000000..7427bfc
--- /dev/null
+++ b/webstruct/tests/test_text_tokenizer.py
@@ -0,0 +1,48 @@
+import unittest
+import pytest
+
+from webstruct.text_tokenizers import TextToken, WordTokenizer
+
+class TestTokenizerTest(unittest.TestCase):
+ def do_tokenize(self, text, result):
+ self.assertEqual(result, WordTokenizer().segment_words(text))
+
+ @pytest.mark.xfail
+ def test_phone(self):
+ return self.do_tokenize(
+ "Phone:855-349-1914",
+ [TextToken(chars='Phone', position=0, length=5),
+ TextToken(chars=':', position=5, length=1),
+ TextToken(chars='855-349-1914', position=6, length=12)]
+ )
+
+ @pytest.mark.xfail
+ def test_hyphen_mid(self):
+ return self.do_tokenize(
+ "Powai Campus, Mumbai-400077",
+ [TextToken(chars='Powai', position=0, length=5),
+ TextToken(chars='Campus', position=6, length=6),
+ TextToken(chars=',', position=12, length=1),
+ TextToken(chars='Mumbai', position=14, length=6),
+ TextToken(chars='-', position=20, length=1),
+ TextToken(chars='400077', position=21, length=6)]
+ )
+
+ @pytest.mark.xfail
+ def test_hyphen_end(self):
+ return self.do_tokenize(
+ "Saudi Arabia-",
+ [TextToken(chars='Saudi', position=0, length=5),
+ TextToken(chars='Arabia', position=6, length=6),
+ TextToken(chars='-', position=12, length=1)]
+ )
+
+ @pytest.mark.xfail
+ def test_digits_with_slash(self):
+ return self.do_tokenize(
+ "1 5858/ 1800",
+ [TextToken(chars='1', position=0, length=1),
+ TextToken(chars='5858', position=2, length=4),
+ TextToken(chars='/', position=6, length=1),
+ TextToken(chars='1800', position=8, length=4)]
+ )
diff --git a/webstruct/text_tokenizers.py b/webstruct/text_tokenizers.py
index 6cc166a..fd1c72c 100644
--- a/webstruct/text_tokenizers.py
+++ b/webstruct/text_tokenizers.py
@@ -1,61 +1,121 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import re
+import collections
+
+TextToken = collections.namedtuple('TextToken', 'chars, position, length')
class WordTokenizer(object):
r"""This tokenizer is copy-pasted version of TreebankWordTokenizer
that doesn't split on @ and ':' symbols and doesn't split contractions::
- >>> from nltk.tokenize.treebank import TreebankWordTokenizer # doctest: +SKIP
>>> s = '''Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'''
- >>> TreebankWordTokenizer().tokenize(s) # doctest: +SKIP
- ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email', ':', 'muffins', '@', 'gmail.com']
- >>> WordTokenizer().tokenize(s)
- ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Email:', 'muffins@gmail.com']
+ >>> WordTokenizer().segment_words(s)
+ [TextToken(chars='Good', position=0, length=4),
+ TextToken(chars='muffins', position=5, length=7),
+ TextToken(chars='cost', position=13, length=4),
+ TextToken(chars='$', position=18, length=1),
+ TextToken(chars='3.88', position=19, length=4),
+ TextToken(chars='in', position=24, length=2),
+ TextToken(chars='New', position=27, length=3),
+ TextToken(chars='York.', position=31, length=5),
+ TextToken(chars='Email:', position=37, length=6),
+ TextToken(chars='muffins@gmail.com', position=44, length=17)]
>>> s = '''Shelbourne Road,'''
- >>> WordTokenizer().tokenize(s)
- ['Shelbourne', 'Road', ',']
+ >>> WordTokenizer().segment_words(s)
+ [TextToken(chars='Shelbourne', position=0, length=10),
+ TextToken(chars='Road', position=11, length=4),
+ TextToken(chars=',', position=15, length=1)]
>>> s = '''population of 100,000'''
- >>> WordTokenizer().tokenize(s)
- ['population', 'of', '100,000']
+ >>> WordTokenizer().segment_words(s)
+ [TextToken(chars='population', position=0, length=10),
+ TextToken(chars='of', position=11, length=2),
+ TextToken(chars='100,000', position=14, length=7)]
>>> s = '''Hello|World'''
- >>> WordTokenizer().tokenize(s)
- ['Hello', '|', 'World']
+ >>> WordTokenizer().segment_words(s)
+ [TextToken(chars='Hello', position=0, length=5),
+ TextToken(chars='|', position=5, length=1),
+ TextToken(chars='World', position=6, length=5)]
>>> s2 = '"We beat some pretty good teams to get here," Slocum said.'
- >>> WordTokenizer().tokenize(s2) # doctest: +NORMALIZE_WHITESPACE
- ['``', 'We', 'beat', 'some', 'pretty', 'good',
- 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.']
+ >>> WordTokenizer().segment_words(s2) # doctest: +NORMALIZE_WHITESPACE
+ [TextToken(chars='``', position=0, length=1),
+ TextToken(chars='We', position=1, length=2),
+ TextToken(chars='beat', position=4, length=4),
+ TextToken(chars='some', position=9, length=4),
+ TextToken(chars='pretty', position=14, length=6),
+ TextToken(chars='good', position=21, length=4),
+ TextToken(chars='teams', position=26, length=5),
+ TextToken(chars='to', position=32, length=2),
+ TextToken(chars='get', position=35, length=3),
+ TextToken(chars='here', position=39, length=4),
+ TextToken(chars=',', position=43, length=1),
+ TextToken(chars="''", position=44, length=1),
+ TextToken(chars='Slocum', position=46, length=6),
+ TextToken(chars='said', position=53, length=4),
+ TextToken(chars='.', position=57, length=1)]
>>> s3 = '''Well, we couldn't have this predictable,
... cliche-ridden, \"Touched by an
... Angel\" (a show creator John Masius
... worked on) wanna-be if she didn't.'''
- >>> WordTokenizer().tokenize(s3) # doctest: +NORMALIZE_WHITESPACE
- ['Well', ',', 'we', "couldn't", 'have', 'this', 'predictable',
- ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an',
- 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius',
- 'worked', 'on', ')', 'wanna-be', 'if', 'she', "didn't", '.']
+ >>> WordTokenizer().segment_words(s3) # doctest: +NORMALIZE_WHITESPACE
+ [TextToken(chars='Well', position=0, length=4),
+ TextToken(chars=',', position=4, length=1),
+ TextToken(chars='we', position=6, length=2),
+ TextToken(chars="couldn't", position=9, length=8),
+ TextToken(chars='have', position=18, length=4),
+ TextToken(chars='this', position=23, length=4),
+ TextToken(chars='predictable', position=28, length=11),
+ TextToken(chars=',', position=39, length=1),
+ TextToken(chars='cliche-ridden', position=41, length=13),
+ TextToken(chars=',', position=54, length=1),
+ TextToken(chars='``', position=56, length=1),
+ TextToken(chars='Touched', position=57, length=7),
+ TextToken(chars='by', position=65, length=2),
+ TextToken(chars='an', position=68, length=2),
+ TextToken(chars='Angel', position=71, length=5),
+ TextToken(chars="''", position=76, length=1),
+ TextToken(chars='(', position=78, length=1),
+ TextToken(chars='a', position=79, length=1),
+ TextToken(chars='show', position=81, length=4),
+ TextToken(chars='creator', position=86, length=7),
+ TextToken(chars='John', position=94, length=4),
+ TextToken(chars='Masius', position=99, length=6),
+ TextToken(chars='worked', position=106, length=6),
+ TextToken(chars='on', position=113, length=2),
+ TextToken(chars=')', position=115, length=1),
+ TextToken(chars='wanna-be', position=117, length=8),
+ TextToken(chars='if', position=126, length=2),
+ TextToken(chars='she', position=129, length=3),
+ TextToken(chars="didn't", position=133, length=6),
+ TextToken(chars='.', position=139, length=1)]
+
+ >>> WordTokenizer().segment_words('"')
+ [TextToken(chars='``', position=0, length=1)]
+
+ >>> WordTokenizer().segment_words('" a')
+ [TextToken(chars='``', position=0, length=1),
+ TextToken(chars='a', position=2, length=1)]
Some issues:
- >>> WordTokenizer().tokenize("Phone:855-349-1914") # doctest: +SKIP
- ['Phone', ':', '855-349-1914']
-
- >>> WordTokenizer().tokenize("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.") # doctest: +SKIP
- ['Copyright', '\xc2\xa9', '2014', 'Wall', 'Decor', 'and', 'Home', 'Accents', '.', 'All', 'Rights', 'Reserved', '.']
-
- >>> WordTokenizer().tokenize("Powai Campus, Mumbai-400077") # doctest: +SKIP
- ['Powai', 'Campus', ',', 'Mumbai", "-", "400077']
-
- >>> WordTokenizer().tokenize("1 5858/ 1800") # doctest: +SKIP
- ['1', '5858', '/', '1800']
-
- >>> WordTokenizer().tokenize("Saudi Arabia-") # doctest: +SKIP
- ['Saudi', 'Arabia', '-']
+ >>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")
+ [TextToken(chars='Copyright', position=0, length=9),
+ TextToken(chars=u'\xa9', position=10, length=1),
+ TextToken(chars='2014', position=12, length=4),
+ TextToken(chars='Foo', position=17, length=3),
+ TextToken(chars='Bar', position=21, length=3),
+ TextToken(chars='and', position=25, length=3),
+ TextToken(chars='Buzz', position=29, length=4),
+ TextToken(chars='Spam.', position=34, length=5),
+ TextToken(chars='All', position=40, length=3),
+ TextToken(chars='Rights', position=44, length=6),
+ TextToken(chars='Reserved', position=51, length=8),
+ TextToken(chars='.', position=59, length=1)]
"""
@@ -76,41 +136,64 @@ class WordTokenizer(object):
open_quotes = re.compile(r'(^|[\s(\[{<])"')
- def _tokenize(self, text):
+ def _segment_words(self, text):
# this one cannot be placed in the loop because it requires
# position check (beginning of the string) or previous char value
- text = self.open_quotes.sub(r'\1``', text)
+ quote = self.open_quotes.search(text)
+ if quote is not None:
+ end = quote.end() - 1
+ for t in self._segment_words(text[:end]):
+ yield t
+ yield TextToken(chars='``', position=end, length=1)
+ shift = end + 1
+ for t in self._segment_words(text[shift:]):
+ yield TextToken(chars=t.chars,
+ position=t.position + shift,
+ length=t.length)
+ return
i = 0
token_start = 0
while 1:
if i >= len(text):
- yield text[token_start:]
+ yield TextToken(chars=text[token_start:],
+ position=token_start,
+ length=len(text) - token_start)
break
shift = 1
partial_text = text[i:]
for regex, token in self.rules:
match = regex.match(partial_text)
if match:
- yield text[token_start:i]
+ yield TextToken(chars=text[token_start:i],
+ position=token_start,
+ length=i - token_start)
shift = match.end() - match.start()
token_start = i + shift
if token is None:
- yield match.group()
+ yield TextToken(chars=match.group(),
+ position=i + match.start(),
+ length=shift)
else:
- yield token
+ yield TextToken(chars=token,
+ position=i + match.start(),
+ length=shift)
break
i += shift
+ def segment_words(self, text):
+ return [t for t in self._segment_words(text) if t.chars]
+
def tokenize(self, text):
- return [t for t in self._tokenize(text) if t]
+ return [t.chars for t in self.segment_words(text)]
class DefaultTokenizer(WordTokenizer):
- def tokenize(self, text):
- tokens = super(DefaultTokenizer, self).tokenize(text)
+ def segment_words(self, text):
+ tokens = super(DefaultTokenizer, self).segment_words(text)
# remove standalone commas and semicolons
- # as they broke tag sets, e.g. PERSON->FUNCTION in case "PERSON, FUNCTION"
+ # as they broke tag sets,
+ # e.g. PERSON->FUNCTION in case "PERSON, FUNCTION"
# but it has negative consequences, e.g.
# etalon: [PER-B, PER-I, FUNC-B]
@@ -118,7 +201,7 @@ def tokenize(self, text):
# because we removed punctuation
# FIXME: remove as token, but save as feature left/right_punct:","
- return [t for t in tokens if t not in {',', ';'}]
+ return [t for t in tokens if t.chars not in {',', ';'}]
-tokenize = DefaultTokenizer().tokenize
+tokenize = DefaultTokenizer().segment_words
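
A short sketch of how the new position/length fields relate to the source string, based on the segment_words docstring above (for normalized quote tokens, chars ('``' or "''") differs from the underlying character, but position/length still point at it):

from webstruct.text_tokenizers import WordTokenizer

s = 'Good muffins cost $3.88\nin New York. Email: muffins@gmail.com'
for tok in WordTokenizer().segment_words(s):
    # tok.position/tok.length index into the original string;
    # for ordinary tokens the slice equals tok.chars (quote tokens are
    # normalized to ``/'' but keep the offsets of the original character)
    print(tok.chars, s[tok.position:tok.position + tok.length])
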