Skip to content

merge pearl notebook #152

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
350 changes: 350 additions & 0 deletions notebooks/25_pearl.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,350 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0ed2304b",
"metadata": {},
"source": [
"# Crawl, load, and split Pearl of Great Price from The Church of Jesus Christ of Latter-day Saints"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7fe5bf12",
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%load_ext dotenv\n",
"%dotenv"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4bca89a2",
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"import os\n",
"\n",
"from models.load_pearl_of_great_price import load_pogp\n",
"from models.load_utils import Loader, save_docs_to_jsonl\n",
"from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl\n",
"from models.split_model import MarkdownSyntacticEmbeddingSplitter\n",
"from urllib.parse import urljoin, urlparse\n",
"from bs4 import BeautifulSoup\n",
"from models.crawl_utils import get_page, save_page"
]
},
{
"cell_type": "markdown",
"id": "91ece895",
"metadata": {},
"source": [
"## Crawl"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e753397e",
"metadata": {},
"outputs": [],
"source": [
"# config\n",
"hosts = ['https://pearlofgreatpricecentral.org/category/book-of-abraham/', 'https://pearlofgreatpricecentral.org/category/joseph-smith-history/']\n",
"base_dir = '../data/raw/pearl_of_great_price'\n",
"bs_parser = 'html.parser'\n",
"delay_seconds = 15\n",
"if not os.path.exists(base_dir):\n",
" os.makedirs(base_dir)\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9225f0f2",
"metadata": {},
"outputs": [],
"source": [
"def extract_next_sibling_href(soup, base_url):\n",
" # Find the span tag with classes 'page-numbers' and 'current'\n",
" span_tag = soup.find('span', class_='page-numbers current') \n",
" if span_tag:\n",
" # Find the next sibling anchor tag\n",
" anchor_tag = span_tag.find_next_sibling('a', href=True)\n",
" if anchor_tag:\n",
" href = anchor_tag['href']\n",
" full_url = urljoin(base_url, href)\n",
" return full_url\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8861659b-9805-45a3-a931-cfb74a293949",
"metadata": {},
"outputs": [],
"source": [
"def fetch_and_extract_hrefs(start_url, base_url, max_pages=10):\n",
" # List to store all extracted hrefs\n",
" all_hrefs = []\n",
" # Loop to fetch pages and extract hrefs\n",
" for _ in range(max_pages):\n",
" # Fetch the current page\n",
" status_code, html = get_page(start_url)\n",
" if status_code == 200:\n",
" # Parse the HTML content using BeautifulSoup\n",
" soup = BeautifulSoup(html, 'html.parser')\n",
" # Extract hrefs from the current page\n",
" hrefs = extract_hrefs_from_elementor_div(soup, base_url)\n",
" # Add the extracted hrefs to the list\n",
" all_hrefs.extend(hrefs)\n",
" # Get the href for the next page\n",
" next_page_href = extract_next_sibling_href(soup, base_url)\n",
" if next_page_href:\n",
" # Update the start_url for the next iteration\n",
" start_url = next_page_href\n",
" else:\n",
" print(\"No next page found. Exiting loop.\")\n",
" break\n",
" else:\n",
" print(f\"Failed to fetch page: {start_url}\")\n",
" break\n",
" return all_hrefs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2d33629-31c0-4968-8346-db60b0549e58",
"metadata": {},
"outputs": [],
"source": [
"all_hrefs = []\n",
"for start_url in hosts:\n",
" hrefs = fetch_and_extract_hrefs(start_url, start_url)\n",
" all_hrefs.extend(hrefs)\n",
"print(all_hrefs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "658047a1",
"metadata": {},
"outputs": [],
"source": [
"len(all_hrefs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c793dba8",
"metadata": {},
"outputs": [],
"source": [
"def get_path(url):\n",
" path_components = urlparse(url).path.split('/')\n",
" return os.path.join(base_dir, f\"{path_components[-2]}.json\") "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fa244c64-5d40-4d93-ac95-e5fce805b9fd",
"metadata": {},
"outputs": [],
"source": [
"for url in all_hrefs:\n",
" path_file = get_path(url)\n",
" print(path_file)\n",
" if os.path.exists(path_file):\n",
" continue\n",
" status_code, html = get_page(url, delay_seconds)\n",
" if status_code != 200:\n",
" print(\"Error!\", status_code , url)\n",
" continue\n",
" save_page(path_file,url,html) \n",
"print(\"End\")"
]
},
{
"cell_type": "markdown",
"id": "90af220c",
"metadata": {},
"source": [
"## Load"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4399a0c9",
"metadata": {},
"outputs": [],
"source": [
"# config\n",
"input_dir = '../data/raw/pearl_of_great_price'\n",
"output_dir = '../data/load/pearl_of_great_price'\n",
"\n",
"if not os.path.exists(output_dir):\n",
" os.makedirs(output_dir)\n",
"\n",
"today = datetime.today().strftime('%Y-%m-%d')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2cc10596",
"metadata": {},
"outputs": [],
"source": [
"loader = Loader(load_pogp, input_dir)\n",
"docs = loader.load(verbose=True)\n",
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8755943",
"metadata": {},
"outputs": [],
"source": [
"print(docs[0].metadata)\n",
"print(docs[0].page_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "486c3f00",
"metadata": {},
"outputs": [],
"source": [
"output_filename = os.path.join(output_dir, f\"{today}.jsonl\")\n",
"\n",
"save_docs_to_jsonl(docs, output_filename)"
]
},
{
"cell_type": "markdown",
"id": "c9c4ec07",
"metadata": {},
"source": [
"## Split"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1d47019",
"metadata": {},
"outputs": [],
"source": [
"# configure\n",
"input_path = '../data/load/pearl_of_great_price/2023-11-27.jsonl'\n",
"output_dir = '../data/split/pearl_of_great_price/'\n",
"if not os.path.exists(output_dir):\n",
" os.makedirs(output_dir)\n",
"today = datetime.today().strftime('%Y-%m-%d')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d40ff1fb",
"metadata": {},
"outputs": [],
"source": [
"docs = load_docs_from_jsonl(input_path)\n",
"len(docs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e40b8c7",
"metadata": {},
"outputs": [],
"source": [
"text_splitter = MarkdownSyntacticEmbeddingSplitter()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d26d2e3",
"metadata": {},
"outputs": [],
"source": [
"splits = text_splitter.split_documents(docs, verbose=True)\n",
"len(splits)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "da9a292c",
"metadata": {},
"outputs": [],
"source": [
"for ix, split in enumerate(splits[:10]):\n",
" print(ix, split.metadata)\n",
" print(split.page_content)\n",
" print(\"\\n!!! SPLIT !!!\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "56bb464e",
"metadata": {},
"outputs": [],
"source": [
"filename = os.path.join(output_dir, f\"{today}.jsonl\")\n",
"save_docs_to_jsonl(splits, filename)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c49eaf4-677e-439c-8d9e-0e494b1d3c4e",
"metadata": {},
"outputs": [],
"source": [
"print(len(docs), len(splits))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "models",
"language": "python",
"name": "models"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}