Letter-precise html tokenization #49

Merged
31 commits, merged Oct 2, 2017
Changes from all commits

Commits (31)
36d56f2
text tokenizer return postions of token
whalebot-helmsman Sep 21, 2017
2d4d2ef
update tests
whalebot-helmsman Sep 21, 2017
80658ca
separate statement for every action
whalebot-helmsman Sep 21, 2017
c52e449
comma preserving test
whalebot-helmsman Sep 21, 2017
8178776
too much tokens around
whalebot-helmsman Sep 21, 2017
51c0932
encode in indices instead of entities
whalebot-helmsman Sep 21, 2017
1a667ec
handle empty lists
whalebot-helmsman Sep 21, 2017
24465b1
pass token length and position from TextToken to HtmlToken
whalebot-helmsman Sep 21, 2017
06befbb
letter perfect detokenization
whalebot-helmsman Sep 22, 2017
e5730b2
do not cleanup tokenized tree by default, separate method for tree cl…
Sep 25, 2017
e340444
update tests for separate tree cleaning
Sep 25, 2017
89673c1
update tests for correct punctuation positions
Sep 25, 2017
7c45984
correct length for replaced quotes
Sep 25, 2017
46fc4df
pep8
Sep 29, 2017
388170e
comma at line end, not start
Sep 29, 2017
71caf61
one join instead of many additions, dont be Schleimel
Sep 29, 2017
37d7470
correct formatting
Sep 29, 2017
e93c6dc
add clarification
Sep 29, 2017
e02c275
fix typo
Sep 29, 2017
f26569f
pep8
Sep 29, 2017
d1aecbb
preserve tokenize method for compatibility
Sep 29, 2017
35a9d88
function to reduce code in tests
Sep 29, 2017
9033188
remove test for nltk tokenizer
Sep 29, 2017
c14f363
test our behaviour, which difers from original treebank tokenizer
Sep 29, 2017
a071cd4
remove useless conversion
Sep 29, 2017
a33f564
rename method to avoid confusion with nltk tokenize_span method
Sep 29, 2017
75a9698
remove brittle tests
Sep 29, 2017
4729323
small benchmark for html tokenizer
Sep 29, 2017
943a44e
Revert "remove brittle tests"
whalebot-helmsman Oct 2, 2017
ba7d6fe
move brittle tests to pytest xfail
whalebot-helmsman Oct 2, 2017
b72bcc1
expect behaviour of nltk tokenizer
whalebot-helmsman Oct 2, 2017

131 changes: 91 additions & 40 deletions webstruct/html_tokenizer.py
@@ -13,21 +13,24 @@
import copy
from itertools import groupby
from collections import namedtuple
import six
from six.moves import zip

from lxml.etree import XPathEvaluator, Comment
from lxml.etree import Comment, iterwalk

from webstruct.sequence_encoding import IobEncoder
from webstruct.text_tokenizers import tokenize
from webstruct.text_tokenizers import tokenize, TextToken
from webstruct.utils import (
replace_html_tags,
kill_html_tags,
smart_join,
)


_HtmlToken = namedtuple('HtmlToken', 'index tokens elem is_tail')
_HtmlToken = namedtuple('HtmlToken', ['index',
'tokens',
'elem',
'is_tail',
'position',
'length'])


class HtmlToken(_HtmlToken):
@@ -41,6 +44,8 @@ class HtmlToken(_HtmlToken):
* :attr:`elem` is the current html block (as lxml's Element) - most
likely you want :attr:`parent` instead of it
* :attr:`is_tail` flag indicates that token belongs to element tail
* :attr:`position` is the logical position (in letters or codepoints) of the token start in the parent text
* :attr:`length` is the logical length (in letters or codepoints) of the token in the parent text

Computed properties:

@@ -64,8 +69,10 @@ def root(self):
return self.elem.getroottree()

def __repr__(self):
return "HtmlToken(token=%r, parent=%r, index=%s)" % (
self.token, self.parent, self.index
return ("HtmlToken("
"token=%r, parent=%r, index=%s, position=%d, length=%d"
")") % (
self.token, self.parent, self.index, self.position, self.length
)


@@ -85,7 +92,8 @@ class HtmlTokenizer(object):
----------

tagset : set, optional
A set of entity types to keep. If not passed, all entity types are kept.
A set of entity types to keep.
If not passed, all entity types are kept.
Use this argument to discard some entity types from training data.
sequence_encoder : object, optional
Sequence encoder object. If not passed,
@@ -142,7 +150,7 @@ def tokenize_single(self, tree):
>>> tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> <br> <PER>Mary</PER> said</p>")
>>> html_tokens, tags = html_tokenizer.tokenize_single(tree)
>>> html_tokens
[HtmlToken(token='hello', parent=<Element p at ...>, index=0), HtmlToken...]
[HtmlToken(token='hello', parent=<Element p at ...>, index=0, ...), HtmlToken...]
>>> tags
['O', 'B-PER', 'I-PER', 'B-PER', 'O']
>>> for tok, iob_tag in zip(html_tokens, tags):
Expand Down Expand Up @@ -180,6 +188,8 @@ def detokenize_single(self, html_tokens, tags):
Build annotated ``lxml.etree.ElementTree`` from
``html_tokens`` (a list of :class:`.HtmlToken` instances)
and ``tags`` (a list of their tags).
**ATTENTION**: ``html_tokens`` must be produced by tokenizing
a tree that contains no annotation tags

Annotations are encoded as ``__START_TAG__`` and ``__END_TAG__``
text tokens (this is the format :mod:`webstruct.loaders` use).
@@ -190,9 +200,7 @@
if not html_tokens:
return None

orig_tree = html_tokens[0].root
tree = copy.deepcopy(orig_tree)
xpatheval = XPathEvaluator(tree)
tree = html_tokens[0].root

# find starts/ends of token groups
token_groups = self.sequence_encoder.group(zip(html_tokens, tags))
@@ -206,30 +214,49 @@
pos += n_tokens

# mark starts/ends with special tokens
data = zip(html_tokens, tags, range(len(html_tokens)))
keyfunc = lambda rec: (rec[0].elem, rec[0].is_tail)
data = [(s, True) for s in starts]
data.extend((s, False) for s in ends)
keyfunc = lambda rec: (id(html_tokens[rec[0]].elem), html_tokens[rec[0]].is_tail)
data.sort(key=keyfunc)

for (orig_elem, is_tail), g in groupby(data, keyfunc):
for (_, is_tail), g in groupby(data, keyfunc):
g = list(g)
fix = False
tokens = g[0][0].tokens[:]
for token, tag, token_idx in g:
if token_idx in starts:
text = ' __START_%s__ %s' % (tag[2:], tokens[token.index])
tokens[token.index] = text
fix = True
if token_idx in ends:
text = '%s __END_%s__ ' % (tokens[token.index], tag[2:])
tokens[token.index] = text
fix = True

if fix:
xpath = orig_tree.getpath(orig_elem)
elem = xpatheval(xpath)[0]
if is_tail:
elem.tail = smart_join(tokens)
g.sort(key=lambda t: (html_tokens[t[0]].position, not t[1]))

if not g:
continue

elem = html_tokens[g[0][0]].elem

pos_in_source = 0
source = elem.text
if is_tail:
source = elem.tail

mods = list()

for idx, is_starts in g:
token = html_tokens[idx]
tag = tags[idx]
mods.append(source[pos_in_source:token.position])
pos_in_source = token.position
if is_starts:
patch = ' __START_%s__ ' % (tag[2:],)
mods.append(patch)
else:
elem.text = smart_join(tokens)
end_in_source = pos_in_source + token.length
mods.append(source[pos_in_source:end_in_source])
pos_in_source = pos_in_source + token.length
patch = ' __END_%s__ ' % (tag[2:],)
mods.append(patch)

mods.append(source[pos_in_source:])
modded = ''.join(mods)

if is_tail:
elem.tail = modded
else:
elem.text = modded

return tree

@@ -245,18 +272,35 @@ def _process_tree(self, tree):
return

head_tokens, head_tags = self._tokenize_and_split(tree.text)
char_tokens = [t.chars for t in head_tokens]
for index, (token, tag) in enumerate(zip(head_tokens, head_tags)):
yield HtmlToken(index, head_tokens, tree, False), tag
yield HtmlToken(index,
char_tokens,
tree,
False,
token.position,
token.length), tag

for child in tree: # where is my precious "yield from"?
for html_token, tag in self._process_tree(child):
yield html_token, tag

tail_tokens, tail_tags = self._tokenize_and_split(tree.tail)
char_tokens = [t.chars for t in tail_tokens]
for index, (token, tag) in enumerate(zip(tail_tokens, tail_tags)):
yield HtmlToken(index, tail_tokens, tree, True), tag
yield HtmlToken(index,
char_tokens,
tree,
True,
token.position,
token.length), tag

def cleanup_tree(self, tree):
cleaned = copy.deepcopy(tree)
for _, elem in iterwalk(cleaned):
self._cleanup_elem(elem)

self._cleanup_elem(tree)
return cleaned

def _cleanup_elem(self, elem):
""" Remove special tokens from elem """
@@ -266,16 +310,23 @@ def _cleanup_elem(self, elem):
elem.tail = self._tag_re.sub("", elem.tail)

def _tokenize_and_split(self, text):
input_tokens = self._limit_tags(self.text_tokenize_func(text or ''))
input_tokens = map(six.text_type, input_tokens)
return self.sequence_encoder.encode_split(input_tokens)
text = text or ''
input_tokens = [t for t in self.text_tokenize_func(text)]
input_tokens = self._limit_tags(input_tokens)
input_tokens = [TextToken(chars=t.chars,
position=t.position,
length=t.length) for t in input_tokens]
chains = self.sequence_encoder.encode(t.chars for t in input_tokens)
chains = self.sequence_encoder.from_indices(chains, input_tokens)
chains = [l for l in chains]
return self.sequence_encoder.split(chains)

def _limit_tags(self, input_tokens):
if self.tagset is None:
return input_tokens

proc = self.sequence_encoder.token_processor
token_classes = [proc.classify(tok) for tok in input_tokens]
token_classes = [proc.classify(tok.chars) for tok in input_tokens]
return [
tok for (tok, (typ, value)) in zip(input_tokens, token_classes)
if not (typ in {'start', 'end'} and value not in self.tagset)
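The key change in this file is that every HtmlToken now carries ``position`` and ``length``, so a token can be mapped back to the exact character span of its parent's ``text`` or ``tail``; that mapping is what lets ``detokenize_single`` splice ``__START_*__``/``__END_*__`` markers into the original strings instead of re-joining tokens. A minimal sketch of reading those spans follows; it is not part of the diff, and the ``GateLoader`` setup is an assumption borrowed from the ``tokenize_single`` doctest.

# Sketch only: loader setup assumed from the tokenize_single doctest,
# not code added by this PR.
from webstruct import GateLoader, HtmlTokenizer

loader = GateLoader(known_entities={'PER'})
tree = loader.loadbytes(b"<p>hello, <PER>John <b>Doe</b></PER> said</p>")
html_tokens, tags = HtmlTokenizer().tokenize_single(tree)

for tok, tag in zip(html_tokens, tags):
    # A token comes either from its element's text or from its tail.
    source = tok.elem.tail if tok.is_tail else tok.elem.text
    # position/length address the token inside that string, letter by letter.
    span = source[tok.position:tok.position + tok.length]
    print("%s %s %r" % (tag, tok.token, span))

For plain tokens the recovered span is identical to ``tok.token``; for tokens whose characters get normalised during tokenization, commit 7c45984 ("correct length for replaced quotes") keeps ``length`` in source units, so the slice still lines up with the source text.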
34 changes: 34 additions & 0 deletions webstruct/html_tokenizer_benchmark.py
@@ -0,0 +1,34 @@
import os.path
import glob
import timeit
import functools

import webstruct.webannotator
import webstruct.html_tokenizer

def load_trees(tokenizer, trees):
for tree in trees:
tokenizer.tokenize_single(tree)

def main():
path = os.path.join(os.path.dirname(__file__) ,
".." ,
"webstruct_data",
"corpus/business_pages/wa/*.html")

paths = sorted(glob.glob(path))

with open(paths[0], 'rb') as sample_reader:
colors = webstruct.webannotator.EntityColors.from_htmlbytes(sample_reader.read())
entities = [typ for typ in colors]

loader = webstruct.WebAnnotatorLoader(known_entities=entities)

trees = [loader.load(p) for p in paths]
tokenizer = webstruct.html_tokenizer.HtmlTokenizer()
print(timeit.timeit(functools.partial(load_trees, tokenizer, trees),
setup='gc.enable()',
number=3))

if __name__ == "__main__":
main()
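For context, the benchmark derives the known entity set from the WebAnnotator colors of the first page, loads every tree under ``webstruct_data/corpus/business_pages/wa/`` up front, and only then times three full ``tokenize_single`` passes with GC enabled (``setup='gc.enable()'``), so the reported number covers tokenization alone, not parsing. The module is runnable directly, e.g. ``python -m webstruct.html_tokenizer_benchmark`` from a source checkout; the exact invocation is an assumption, the PR does not document one.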
35 changes: 19 additions & 16 deletions webstruct/sequence_encoding.py
@@ -11,23 +11,26 @@ class IobEncoder(object):

>>> iob_encoder = IobEncoder()
>>> input_tokens = ["__START_PER__", "John", "__END_PER__", "said"]
>>> iob_encoder.encode(input_tokens)
>>> def encode(encoder, tokens): return [p for p in IobEncoder.from_indices(encoder.encode(tokens), tokens)]
>>> encode(iob_encoder, input_tokens)
[('John', 'B-PER'), ('said', 'O')]

Get the result in another format using the ``split`` method::

>>> input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "__START_PER__", "Mary", "__END_PER__", "said"]
>>> tokens, tags = iob_encoder.encode_split(input_tokens)
>>> tokens = encode(iob_encoder, input_tokens)
>>> tokens, tags = iob_encoder.split(tokens)
>>> tokens, tags
(['hello', 'John', 'Doe', 'Mary', 'said'], ['O', 'B-PER', 'I-PER', 'B-PER', 'O'])

Note that IobEncoder is stateful. This means you can encode an incomplete
stream and continue encoding later::

>>> iob_encoder = IobEncoder()
>>> iob_encoder.encode(["__START_PER__", "John"])
>>> input_tokens_partial = ["__START_PER__", "John"]
>>> encode(iob_encoder, input_tokens_partial)
[('John', 'B-PER')]
>>> iob_encoder.encode(["Mayer", "__END_PER__", "said"])
>>> input_tokens_partial = ["Mayer", "__END_PER__", "said"]
>>> encode(iob_encoder, input_tokens_partial)
[('Mayer', 'I-PER'), ('said', 'O')]

To reset the internal state, use the ``reset`` method::
@@ -36,7 +39,7 @@

Group results to entities::

>>> iob_encoder.group(iob_encoder.encode(input_tokens))
>>> iob_encoder.group(encode(iob_encoder, input_tokens))
[(['hello'], 'O'), (['John', 'Doe'], 'PER'), (['Mary'], 'PER'), (['said'], 'O')]

Input token stream is processed by ``InputTokenProcessor()`` by default;
@@ -53,7 +56,7 @@ def reset(self):
self.tag = 'O'

def iter_encode(self, input_tokens):
for token in input_tokens:
for number, token in enumerate(input_tokens):
token_type, value = self.token_processor.classify(token)

if token_type == 'start':
@@ -68,7 +71,7 @@ def iter_encode(self, input_tokens):
self.tag = "O"

elif token_type == 'token':
yield token, self.tag
yield number, self.tag
if self.tag[0] == 'B':
self.tag = "I" + self.tag[1:]

@@ -81,13 +84,14 @@ def iter_encode(self, input_tokens):
def encode(self, input_tokens):
return list(self.iter_encode(input_tokens))

def encode_split(self, input_tokens):
""" The same as ``encode``, but returns ``(tokens, tags)`` tuple """
res = self.encode(input_tokens)
if not res:
return (), ()
tokens, tags = zip(*res)
return list(tokens), list(tags)
def split(self, tokens):
""" split ``[(token, tag)]`` to ``([token], [tags])`` tuple """
return [t[0] for t in tokens], [t[1] for t in tokens]

@classmethod
def from_indices(cls, indices, input_tokens):
for idx, tag in indices:
yield input_tokens[idx], tag

@classmethod
def group(cls, data, strict=False):
@@ -186,4 +190,3 @@ def classify(self, token):

# regular token
return 'token', token
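Taken together, the sequence_encoding changes replace token-carrying results with index-carrying ones: ``encode()`` now returns ``(token_index, tag)`` pairs, ``from_indices()`` resolves those indices against the original token stream, and ``split()`` takes over the role of the old ``encode_split()``. A small end-to-end sketch, using only the methods visible in this diff:

from webstruct.sequence_encoding import IobEncoder

iob_encoder = IobEncoder()
input_tokens = ["hello", "__START_PER__", "John", "Doe", "__END_PER__", "said"]

# encode() yields indices into input_tokens, e.g. [(0, 'O'), (2, 'B-PER'), ...]
indices = iob_encoder.encode(input_tokens)
# from_indices() turns the indices back into (token, tag) pairs ...
pairs = list(IobEncoder.from_indices(indices, input_tokens))
# ... and split() separates tokens from tags, as encode_split() used to.
tokens, tags = iob_encoder.split(pairs)
print(tokens)  # ['hello', 'John', 'Doe', 'said']
print(tags)    # ['O', 'B-PER', 'I-PER', 'O']

Because only indices flow through the encoder, the ``TextToken`` objects produced by the text tokenizer (with their ``position`` and ``length``) pass through encoding untouched; ``HtmlTokenizer._tokenize_and_split`` relies on exactly this to keep letter-precise offsets.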
