-
Notifications
You must be signed in to change notification settings - Fork 14.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Async HTML loader and HTML2Text transformer (#8036)
New HTML loader that asynchronously loads a list of URLs. New transformer using [HTML2Text](https://github.com/Alir3z4/html2text/) for converting HTML to clean, easy-to-read plain ASCII text (valid Markdown).
- Loading branch information
1 parent
cf60cff
commit 5a084e1
Showing
6 changed files
with
423 additions
and
0 deletions.
There are no files selected for viewing
107 changes: 107 additions & 0 deletions
107
docs/extras/modules/data_connection/document_loaders/integrations/async_html.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "e229e34c", | ||
"metadata": {}, | ||
"source": [ | ||
"# AsyncHtmlLoader\n", | ||
"\n", | ||
"AsyncHtmlLoader loads raw HTML from a list of urls concurrently." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "4c8e4dab", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import AsyncHtmlLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "e76b5ddc", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Fetching pages: 100%|############| 2/2 [00:00<00:00, 9.96it/s]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n", | ||
"loader = AsyncHtmlLoader(urls)\n", | ||
"docs = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "5dca1c0c", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"' news. Stream exclusive games on ESPN+ and play fantasy sports.\" />\\n<meta property=\"og:image\" content=\"https://a1.espncdn.com/combiner/i?img=%2Fi%2Fespn%2Fespn_logos%2Fespn_red.png\"/>\\n<meta property=\"og:image:width\" content=\"1200\" />\\n<meta property=\"og:image:height\" content=\"630\" />\\n<meta property=\"og:type\" content=\"website\" />\\n<meta name=\"twitter:site\" content=\"espn\" />\\n<meta name=\"twitter:url\" content=\"https://www.espn.com\" />\\n<meta name=\"twitter:title\" content=\"ESPN - Serving Sports Fans. Anytime. Anywhere.\"/>\\n<meta name=\"twitter:description\" content=\"Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.\" />\\n<meta name=\"twitter:card\" content=\"summary\">\\n<meta name=\"twitter:app:name:iphone\" content=\"ESPN\"/>\\n<meta name=\"twitter:app:id:iphone\" content=\"317469184\"/>\\n<meta name=\"twitter:app:name:googleplay\" content=\"ESPN\"/>\\n<meta name=\"twitter:app:id:googleplay\" content=\"com.espn.score_center\"/>\\n<meta name=\"title\" content=\"ESPN - '" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"docs[0].page_content[1000:2000]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "4d024f0f", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'al\" href=\"https://lilianweng.github.io/posts/2023-06-23-agent/\" />\\n<link crossorigin=\"anonymous\" href=\"/assets/css/stylesheet.min.67a6fb6e33089cb29e856bcc95d7aa39f70049a42b123105531265a0d9f1258b.css\" integrity=\"sha256-Z6b7bjMInLKehWvMldeqOfcASaQrEjEFUxJloNnxJYs=\" rel=\"preload stylesheet\" as=\"style\">\\n<script defer crossorigin=\"anonymous\" src=\"/assets/js/highlight.min.7680afc38aa6b15ddf158a4f3780b7b1f7dde7e91d26f073e6229bb7a0793c92.js\" integrity=\"sha256-doCvw4qmsV3fFYpPN4C3sffd5+kdJvBz5iKbt6B5PJI=\"\\n onload=\"hljs.initHighlightingOnLoad();\"></script>\\n<link rel=\"icon\" href=\"https://lilianweng.github.io/favicon_peach.ico\">\\n<link rel=\"icon\" type=\"image/png\" sizes=\"16x16\" href=\"https://lilianweng.github.io/favicon-16x16.png\">\\n<link rel=\"icon\" type=\"image/png\" sizes=\"32x32\" href=\"https://lilianweng.github.io/favicon-32x32.png\">\\n<link rel=\"apple-touch-icon\" href=\"https://lilianweng.github.io/apple-touch-icon.png\">\\n<link rel=\"mask-icon\" href=\"https://lilianweng.github.io/safari-pinned-tab.'" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"docs[1].page_content[1000:2000]" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.16" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
133 changes: 133 additions & 0 deletions
133
docs/extras/modules/data_connection/document_transformers/integrations/html2text.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "fe6e5c82", | ||
"metadata": {}, | ||
"source": [ | ||
"# html2text\n", | ||
"\n", | ||
"[html2text](https://github.com/Alir3z4/html2text/) is a Python script that converts a page of HTML into clean, easy-to-read plain ASCII text. \n", | ||
"\n", | ||
"The ASCII also happens to be valid Markdown (a text-to-HTML format)." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ce77e0cb", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"! pip install html2text" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "8ca0974b", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"Fetching pages: 100%|############| 2/2 [00:00<00:00, 10.75it/s]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from langchain.document_loaders import AsyncHtmlLoader\n", | ||
"\n", | ||
"urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n", | ||
"loader = AsyncHtmlLoader(urls)\n", | ||
"docs = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "ddf2be97", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_transformers import Html2TextTransformer" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "a95a928c", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n", | ||
"html2text = Html2TextTransformer()\n", | ||
"docs_transformed = html2text.transform_documents(docs)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"id": "18ef9fe9", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"\" * ESPNFC\\n\\n * X Games\\n\\n * SEC Network\\n\\n## ESPN Apps\\n\\n * ESPN\\n\\n * ESPN Fantasy\\n\\n## Follow ESPN\\n\\n * Facebook\\n\\n * Twitter\\n\\n * Instagram\\n\\n * Snapchat\\n\\n * YouTube\\n\\n * The ESPN Daily Podcast\\n\\n2023 FIFA Women's World Cup\\n\\n## Follow live: Canada takes on Nigeria in group stage of Women's World Cup\\n\\n2m\\n\\nEPA/Morgan Hancock\\n\\n## TOP HEADLINES\\n\\n * Snyder fined $60M over findings in investigation\\n * NFL owners approve $6.05B sale of Commanders\\n * Jags assistant comes out as gay in NFL milestone\\n * O's alone atop East after topping slumping Rays\\n * ACC's Phillips: Never condoned hazing at NU\\n\\n * Vikings WR Addison cited for driving 140 mph\\n * 'Taking his time': Patient QB Rodgers wows Jets\\n * Reyna got U.S. assurances after Berhalter rehire\\n * NFL Future Power Rankings\\n\\n## USWNT AT THE WORLD CUP\\n\\n### USA VS. VIETNAM: 9 P.M. ET FRIDAY\\n\\n## How do you defend against Alex Morgan? Former opponents sound off\\n\\nThe U.S. forward is unstoppable at this level, scoring 121 goals and adding 49\"" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"docs_transformed[0].page_content[1000:2000]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "6045d660", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"\"t's brain,\\ncomplemented by several key components:\\n\\n * **Planning**\\n * Subgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\\n * Reflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final results.\\n * **Memory**\\n * Short-term memory: I would consider all the in-context learning (See Prompt Engineering) as utilizing short-term memory of the model to learn.\\n * Long-term memory: This provides the agent with the capability to retain and recall (infinite) information over extended periods, often by leveraging an external vector store and fast retrieval.\\n * **Tool use**\\n * The agent learns to call external APIs for extra information that is missing from the model weights (often hard to change after pre-training), including current information, code execution c\"" | ||
] | ||
}, | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"docs_transformed[1].page_content[1000:2000]" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.16" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
"""Web base loader class.""" | ||
import asyncio | ||
import logging | ||
import warnings | ||
from typing import Any, Dict, Iterator, List, Optional, Union | ||
|
||
import aiohttp | ||
import requests | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
default_header_template = { | ||
"User-Agent": "", | ||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*" | ||
";q=0.8", | ||
"Accept-Language": "en-US,en;q=0.5", | ||
"Referer": "https://www.google.com/", | ||
"DNT": "1", | ||
"Connection": "keep-alive", | ||
"Upgrade-Insecure-Requests": "1", | ||
} | ||
|
||
|
||
class AsyncHtmlLoader(BaseLoader):
    """Load raw HTML from a list of URLs concurrently.

    URLs are fetched asynchronously with ``aiohttp``; a semaphore caps the
    number of in-flight requests at ``requests_per_second``.  Transient
    connection errors are retried with exponential backoff.
    """

    # URLs to load; normalized to a list in __init__.
    web_paths: List[str]

    # Max number of concurrent requests to make.
    requests_per_second: int = 2

    # kwargs for requests.
    # NOTE(review): currently unused by the aiohttp fetch path — confirm intent.
    requests_kwargs: Dict[str, Any]

    # Raise an exception if the http status code denotes an error.
    # NOTE(review): declared but not enforced anywhere in this class — confirm.
    raise_for_status: bool = False

    def __init__(
        self,
        web_path: Union[str, List[str]],
        header_template: Optional[dict] = None,
        verify_ssl: Optional[bool] = True,
        proxies: Optional[dict] = None,
    ):
        """Initialize with a webpage path or a list of paths.

        Args:
            web_path: A single URL or a list of URLs to load.
            header_template: Optional HTTP headers to use instead of the
                module-level ``default_header_template``.
            verify_ssl: Whether to verify SSL certificates when fetching.
            proxies: Optional mapping of protocol to proxy URL for requests.

        Raises:
            TypeError: If ``web_path`` is neither a ``str`` nor a ``list``.
        """
        # TODO: Deprecate web_path in favor of web_paths, and remove this
        # left like this because there are a number of loaders that expect
        # single urls
        if isinstance(web_path, str):
            self.web_paths = [web_path]
        elif isinstance(web_path, list):
            self.web_paths = list(web_path)
        else:
            # Fix: previously an unsupported type left web_paths unset,
            # deferring failure to an opaque AttributeError in load().
            raise TypeError(
                f"web_path must be str or List[str], got {type(web_path).__name__}"
            )

        # Fix: keep this per-instance; the original class-level `= {}` default
        # was a single mutable dict shared across every instance.
        self.requests_kwargs = {}

        # Fix: copy before mutating — the original assigned the module-level
        # default_header_template by reference and then wrote a User-Agent
        # into it, permanently altering the shared default.
        headers = dict(header_template or default_header_template)
        if not headers.get("User-Agent"):
            try:
                from fake_useragent import UserAgent

                headers["User-Agent"] = UserAgent().random
            except ImportError:
                logger.info(
                    "fake_useragent not found, using default user agent."
                    "To get a realistic header for requests, "
                    "`pip install fake_useragent`."
                )

        # The requests.Session is used only as a container for the headers,
        # SSL flag, and proxies that the aiohttp fetch path reads from.
        self.session = requests.Session()
        self.session.headers = dict(headers)
        self.session.verify = verify_ssl

        if proxies:
            self.session.proxies.update(proxies)

    async def _fetch(
        self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
    ) -> str:
        """Fetch one URL, retrying connection errors with exponential backoff.

        Args:
            url: URL to fetch.
            retries: Maximum number of attempts before giving up.
            cooldown: Base sleep in seconds between attempts.
            backoff: Multiplier applied per attempt (cooldown * backoff**i).

        Returns:
            The response body decoded as text.

        Raises:
            aiohttp.ClientConnectionError: If the final attempt also fails.
        """
        async with aiohttp.ClientSession() as session:
            for attempt in range(retries):
                try:
                    async with session.get(
                        url,
                        headers=self.session.headers,
                        # aiohttp semantics: ssl=None -> default verification,
                        # ssl=False -> verification disabled.
                        ssl=None if self.session.verify else False,
                    ) as response:
                        return await response.text()
                except aiohttp.ClientConnectionError as e:
                    if attempt == retries - 1:
                        raise
                    logger.warning(
                        f"Error fetching {url} with attempt "
                        f"{attempt + 1}/{retries}: {e}. Retrying..."
                    )
                    await asyncio.sleep(cooldown * backoff**attempt)
        # Unreachable for retries >= 1 (each attempt returns or raises);
        # kept as a defensive guard.
        raise ValueError("retry count exceeded")

    async def _fetch_with_rate_limit(
        self, url: str, semaphore: asyncio.Semaphore
    ) -> str:
        """Fetch ``url`` once a slot in ``semaphore`` becomes available."""
        async with semaphore:
            return await self._fetch(url)

    async def fetch_all(self, urls: List[str]) -> Any:
        """Fetch all urls concurrently with rate limiting.

        Args:
            urls: URLs to fetch.

        Returns:
            The fetched page bodies, in the same order as ``urls``.
        """
        semaphore = asyncio.Semaphore(self.requests_per_second)
        tasks = [
            asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
            for url in urls
        ]
        try:
            from tqdm.asyncio import tqdm_asyncio

            # tqdm is optional; it only adds a progress bar around gather().
            return await tqdm_asyncio.gather(
                *tasks, desc="Fetching pages", ascii=True, mininterval=1
            )
        except ImportError:
            warnings.warn("For better logging of progress, `pip install tqdm`")
            return await asyncio.gather(*tasks)

    def lazy_load(self) -> Iterator[Document]:
        """Lazy load text from the url(s) in web_path.

        Note: not truly lazy — all pages are fetched up front by load();
        this merely yields the already-built documents one at a time.
        """
        yield from self.load()

    def load(self) -> List[Document]:
        """Load text from the url(s) in web_path.

        Returns:
            One Document per URL, with the source URL in metadata.
        """
        results = asyncio.run(self.fetch_all(self.web_paths))
        return [
            Document(page_content=text, metadata={"source": url})
            for url, text in zip(self.web_paths, results)
        ]
Oops, something went wrong.