-
Notifications
You must be signed in to change notification settings - Fork 14.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ec4f93b
commit ff3837e
Showing
3 changed files
with
131 additions
and
0 deletions.
There are no files selected for viewing
90 changes: 90 additions & 0 deletions
90
docs/extras/modules/data_connection/document_loaders/integrations/html2text.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "e229e34c", | ||
"metadata": {}, | ||
"source": [ | ||
"# html2text\n", | ||
"\n", | ||
"html2text is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. \n", | ||
"\n", | ||
"Better yet, that ASCII also happens to be valid Markdown (a text-to-HTML format)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "89708a68", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"! pip install html2text" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "4c8e4dab", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import HTML2TextLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "e76b5ddc", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"urls = [\"https://www.espn.com\"]\n", | ||
"loader = HTML2TextLoader(urls)\n", | ||
"docs = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "5dca1c0c", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'Skip to main content Skip to navigation\\n\\n<\\n\\n>\\n\\nMenu\\n\\n## ESPN\\n\\n * Search\\n\\n * * scores\\n\\n * NFL\\n '" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"docs[0].page_content[0:100]" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.16" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from typing import Iterator, List, Optional

import requests

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
|
||
|
||
class HTML2TextLoader(BaseLoader):
    """Load web pages and convert their HTML to Markdown-style text.

    Each URL is fetched with ``requests`` and its body is passed through
    ``html2text.HTML2Text`` with links and images ignored. One ``Document``
    is produced per URL, with the URL stored under the ``"source"``
    metadata key.
    """

    def __init__(self, urls: List[str], timeout: Optional[float] = None):
        """Store the URLs to load.

        Args:
            urls: URLs of the pages to fetch, in the order they are loaded.
            timeout: Seconds to wait for each HTTP request, forwarded to
                ``requests.get``. ``None`` (the default) applies no timeout,
                which matches the loader's previous behavior.
        """
        self.web_paths = urls
        self.timeout = timeout

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per URL.

        Pages are fetched one at a time, only as the iterator is consumed.
        The HTTP status code is not checked, so the body of an error
        response is converted like any other page.

        Yields:
            A ``Document`` whose ``page_content`` is the html2text output
            and whose metadata is ``{"source": <url>}``.

        Raises:
            ValueError: If the optional ``html2text`` package is not
                installed.
        """
        # html2text is an optional dependency, so import it only when needed.
        try:
            import html2text
        except ImportError as e:
            # Keep ValueError for existing callers, but chain the cause and
            # use a single-line message (the old triple-quoted literal
            # leaked a newline and indentation into the text).
            raise ValueError(
                "html2text package not found, please "
                "install it with `pip install html2text`"
            ) from e

        # One converter is reused for every URL; links and images are
        # dropped so only the readable text remains.
        converter = html2text.HTML2Text()
        converter.ignore_links = True
        converter.ignore_images = True

        for path in self.web_paths:
            response = requests.get(path, timeout=self.timeout)
            text = converter.handle(response.text)
            metadata = {"source": path}
            yield Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Load Documents from all configured URLs.

        Returns:
            A list with one ``Document`` per URL, in the order given.
        """
        return list(self.lazy_load())