HTML2Text loader

langchain-ai · Jul 20, 2023 · ff3837e · ff3837e
1 parent ec4f93b
commit ff3837e
Show file tree

Hide file tree

Showing 3 changed files with 131 additions and 0 deletions.
diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/html2text.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/html2text.ipynb
@@ -0,0 +1,90 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e229e34c",
+   "metadata": {},
+   "source": [
+    "# html2text\n",
+    "\n",
+    "html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. \n",
+    "\n",
+    "Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89708a68",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "! pip install html2text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "4c8e4dab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import HTML2TextLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "e76b5ddc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "urls = [\"https://www.espn.com\"]\n",
+    "loader = HTML2TextLoader(urls)\n",
+    "docs = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "5dca1c0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Skip to main content  Skip to navigation\\n\\n<\\n\\n>\\n\\nMenu\\n\\n## ESPN\\n\\n  * Search\\n\\n  *   * scores\\n\\n  * NFL\\n '"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs[0].page_content[0:100]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
@@ -59,6 +59,7 @@
 from langchain.document_loaders.gutenberg import GutenbergLoader
 from langchain.document_loaders.hn import HNLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
+from langchain.document_loaders.html2text import HTML2TextLoader
 from langchain.document_loaders.html_bs import BSHTMLLoader
 from langchain.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader
 from langchain.document_loaders.ifixit import IFixitLoader
@@ -210,6 +211,7 @@
     "GoogleDriveLoader",
     "GutenbergLoader",
     "HNLoader",
+    "HTML2TextLoader",
     "HuggingFaceDatasetLoader",
     "HuggingFaceDatasetLoader",
     "IFixitLoader",

diff --git a/langchain/document_loaders/html2text.py b/langchain/document_loaders/html2text.py
@@ -0,0 +1,39 @@
+from typing import Iterator, List
+
+import requests
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class HTML2TextLoader(BaseLoader):
+    """Load web pages and convert their HTML to Markdown-formatted text.
+
+    Each URL is fetched with ``requests`` and run through ``html2text`` with
+    links and images suppressed. One ``Document`` is produced per URL, with
+    the URL stored under ``metadata["source"]``.
+    """
+
+    def __init__(self, urls: List[str], timeout: float = 30.0):
+        """Store the URLs to load.
+
+        Args:
+            urls: URLs of the pages to fetch. A single URL string is also
+                accepted and treated as a one-element list.
+            timeout: Seconds to wait on the server before giving up; passed
+                straight to ``requests.get``.
+        """
+        # A bare string would otherwise be iterated character by character.
+        if isinstance(urls, str):
+            urls = [urls]
+        # Copy so later mutation of the caller's list cannot affect loading.
+        self.web_paths = list(urls)
+        self.timeout = timeout
+
+    def lazy_load(self) -> Iterator[Document]:
+        """Fetch each URL in turn and yield it as a Document.
+
+        Yields:
+            A ``Document`` whose ``page_content`` is the html2text rendering
+            of the response body and whose metadata is ``{"source": url}``.
+
+        Raises:
+            ValueError: If the ``html2text`` package is not installed.
+            requests.RequestException: If a request fails or times out.
+        """
+        # Imported lazily so html2text is only required when the loader runs.
+        try:
+            import html2text
+        except ImportError as exc:
+            raise ValueError(
+                "html2text package not found, please "
+                "install it with `pip install html2text`"
+            ) from exc
+
+        # One converter is reused for every page; links and images are
+        # dropped so only the readable text remains.
+        converter = html2text.HTML2Text()
+        converter.ignore_links = True
+        converter.ignore_images = True
+
+        for path in self.web_paths:
+            # Without a timeout, requests waits indefinitely on a server
+            # that never answers, stalling the whole load.
+            # NOTE: HTTP error responses (4xx/5xx) are not raised here; their
+            # body is converted like any other page.
+            response = requests.get(path, timeout=self.timeout)
+            yield Document(
+                page_content=converter.handle(response.text),
+                metadata={"source": path},
+            )
+
+    def load(self) -> List[Document]:
+        """Load every URL eagerly and return the Documents as a list."""
+        return list(self.lazy_load())