scrapinghub · kmike · Oct 16, 2013 · Oct 16, 2013 · Oct 17, 2013 · Oct 18, 2013
diff --git a/.gitignore b/.gitignore
@@ -24,7 +24,9 @@ pip-log.txt
 # Unit test / coverage reports
 .coverage
 .tox
+cover
 nosetests.xml
+.cache
 
 # Translations
 *.mo
@@ -35,5 +37,9 @@ nosetests.xml
 .pydevproject
 
 # Other
+.idea
 webstruct_data/datastore
-
+.ipynb_checkpoints
+docs/_build
+webstruct_data/todo
+notebooks/old
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,33 @@
+language: python
+python: "3.5"  # quoted so YAML reads a version string, not the float 3.5
+sudo: false
+branches:
+    only:
+        - master
+        - /^\d\.\d+$/  # branch names of the form digit.digits, e.g. 0.4
+env:  # each entry sets TOXENV for one build job
+    - TOXENV=py27
+    - TOXENV=py34
+    - TOXENV=py35
+    - TOXENV=docs
+
+addons:
+    apt:
+        packages:  # NOTE(review): presumably system deps for numpy/scipy; confirm
+            - python-numpy
+            - python-scipy
+            - libatlas-base-dev
+            - liblapack-dev
+            - gfortran
+
+install:
+    - pip install -U pip tox codecov
+
+script: tox  # tox selects its environment from TOXENV
+
+after_success:
+    - codecov
+
+cache:
+    directories:
+        - $HOME/.cache/pip
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -0,0 +1,29 @@
+Changes
+=======
+
+0.4.1 (2016-11-28)
+------------------
+
+* fixed a bug in NER.extract().
+
+0.4 (2016-11-26)
+----------------
+
+* sklearn-crfsuite_ is used as a CRFsuite wrapper, CRFsuiteCRF class
+  is removed;
+* comments are preserved in HTML trees because recent Firefox puts
+  ``<base>`` tags to a comment when saving pages, and this affects
+  WebAnnotator;
+* fixed 'dont_penalize' argument of webstruct.NER.extract_groups_from_url;
+* new webstruct.model.extract_entity_groups utility function;
+* HtmlTokenizer and HtmlToken are moved to their own module
+  (webstruct.html_tokenizer);
+* test improvements.
+
+.. _sklearn-crfsuite: https://github.com/TeamHG-Memex/sklearn-crfsuite
+
+0.3 (2016-09-19)
+----------------
+
+There are many changes from the previous version: the API has changed,
+Python 3 is supported, gazetteer support is improved, CRFsuite is supported, etc.
diff --git a/README.rst b/README.rst
@@ -0,0 +1,35 @@
+Webstruct
+=========
+
+.. image:: https://travis-ci.org/scrapinghub/webstruct.svg?branch=master
+   :target: https://travis-ci.org/scrapinghub/webstruct
+
+.. image:: https://codecov.io/gh/scrapinghub/webstruct/branch/master/graph/badge.svg
+   :target: https://codecov.io/gh/scrapinghub/webstruct
+
+
+Webstruct is a library for creating statistical NER_ systems that work
+on HTML data, i.e. a library for building tools that extract named
+entities (addresses, organization names, open hours, etc.) from webpages.
+
+Unlike most NER systems, webstruct works on HTML data, not only
+on text data. This makes it possible to define features that use HTML
+structure, and also to embed annotation results back into HTML.
+
+Read the docs_ for more info.
+
+License is MIT.
+
+.. _docs: http://webstruct.readthedocs.org/en/latest/
+.. _NER: http://en.wikipedia.org/wiki/Named-entity_recognition
+
+Contributing
+------------
+
+* Source code: https://github.com/scrapinghub/webstruct
+* Bug tracker: https://github.com/scrapinghub/webstruct/issues
+
+To run tests, make sure tox_ is installed, then run
+``tox`` from the source root.
+
+.. _tox: https://tox.readthedocs.io/en/latest/
diff --git a/block_model/README.md b/block_model/README.md
diff --git a/block_model/convert_html.py b/block_model/convert_html.py
diff --git a/block_model/convert_labeled_data.py b/block_model/convert_labeled_data.py
diff --git a/block_model/data/1.html b/block_model/data/1.html
diff --git a/block_model/data/1.txt b/block_model/data/1.txt