scripturecentralqa · unclejoefx · Dec 31, 2023
diff --git a/notebooks/25_pearl.ipynb b/notebooks/25_pearl.ipynb
@@ -0,0 +1,350 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0ed2304b",
+   "metadata": {},
+   "source": [
+    "# Crawl, load, and split Pearl of Great Price Central articles (Book of Abraham and Joseph Smith—History categories)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7fe5bf12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# autoreload mode 2: modules are reloaded before each execution, so edits to\n",
+    "# imported project modules take effect without restarting the kernel.\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2\n",
+    "# Load the dotenv extension and run its magic (presumably reads a local .env file - confirm).\n",
+    "%load_ext dotenv\n",
+    "%dotenv"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4bca89a2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "import os\n",
+    "\n",
+    "from models.load_pearl_of_great_price import load_pogp\n",
+    "from models.load_utils import Loader, save_docs_to_jsonl\n",
+    "from models.load_utils import load_docs_from_jsonl, save_docs_to_jsonl\n",
+    "from models.split_model import MarkdownSyntacticEmbeddingSplitter\n",
+    "from urllib.parse import urljoin, urlparse\n",
+    "from bs4 import BeautifulSoup\n",
+    "from models.crawl_utils import get_page, save_page"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "91ece895",
+   "metadata": {},
+   "source": [
+    "## Crawl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e753397e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# config\n",
+    "# Category listing pages that seed the crawl.\n",
+    "hosts = ['https://pearlofgreatpricecentral.org/category/book-of-abraham/', 'https://pearlofgreatpricecentral.org/category/joseph-smith-history/']\n",
+    "# Directory where each downloaded page is stored.\n",
+    "base_dir = '../data/raw/pearl_of_great_price'\n",
+    "bs_parser = 'html.parser'\n",
+    "# Passed to get_page when downloading the individual pages.\n",
+    "delay_seconds = 15\n",
+    "# exist_ok=True replaces the separate exists() check: no check-then-create race,\n",
+    "# and a non-directory sitting at base_dir is reported instead of silently accepted.\n",
+    "os.makedirs(base_dir, exist_ok=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9225f0f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_next_sibling_href(soup, base_url):\n",
+    "    \"\"\"Return the absolute URL of the link that follows the current-page marker.\n",
+    "\n",
+    "    Args:\n",
+    "        soup: parsed listing page (BeautifulSoup document).\n",
+    "        base_url: URL against which a relative href is resolved.\n",
+    "\n",
+    "    Returns:\n",
+    "        The resolved URL of the first `<a href=...>` sibling after the\n",
+    "        `span.page-numbers.current` element, or None when either the span or a\n",
+    "        following link is missing.\n",
+    "    \"\"\"\n",
+    "    # A CSS selector matches both classes regardless of their order or of extra\n",
+    "    # classes; find(class_='page-numbers current') only matches that exact string.\n",
+    "    current = soup.select_one('span.page-numbers.current')\n",
+    "    if current is None:\n",
+    "        return None\n",
+    "    next_link = current.find_next_sibling('a', href=True)\n",
+    "    if next_link is None:\n",
+    "        return None\n",
+    "    return urljoin(base_url, next_link['href'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8861659b-9805-45a3-a931-cfb74a293949",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fetch_and_extract_hrefs(start_url, base_url, max_pages=10, parser='html.parser'):\n",
+    "    \"\"\"Walk a paginated listing and collect the hrefs found on each page.\n",
+    "\n",
+    "    Args:\n",
+    "        start_url: URL of the first listing page.\n",
+    "        base_url: base URL handed to the href extractors.\n",
+    "        max_pages: upper bound on the number of listing pages fetched.\n",
+    "        parser: BeautifulSoup parser name; the default is the value that was\n",
+    "            previously hard-coded in the body.\n",
+    "\n",
+    "    Returns:\n",
+    "        List of every href gathered before the walk stopped (last page reached,\n",
+    "        non-200 response, pagination cycle, or max_pages exhausted).\n",
+    "    \"\"\"\n",
+    "    # NOTE(review): extract_hrefs_from_elementor_div is not defined or imported in the\n",
+    "    # visible notebook cells; it must exist in the kernel namespace before this is called.\n",
+    "    all_hrefs = []\n",
+    "    visited = set()\n",
+    "    page_url = start_url\n",
+    "    for _ in range(max_pages):\n",
+    "        visited.add(page_url)\n",
+    "        status_code, html = get_page(page_url)\n",
+    "        if status_code != 200:\n",
+    "            print(f\"Failed to fetch page: {page_url}\")\n",
+    "            break\n",
+    "        soup = BeautifulSoup(html, parser)\n",
+    "        all_hrefs.extend(extract_hrefs_from_elementor_div(soup, base_url))\n",
+    "        next_page_href = extract_next_sibling_href(soup, base_url)\n",
+    "        if not next_page_href:\n",
+    "            print(\"No next page found. Exiting loop.\")\n",
+    "            break\n",
+    "        if next_page_href in visited:\n",
+    "            # Pagination pointed back to a page already fetched; stop rather than\n",
+    "            # collect the same links again until max_pages runs out.\n",
+    "            print(\"Next page already visited. Exiting loop.\")\n",
+    "            break\n",
+    "        page_url = next_page_href\n",
+    "    return all_hrefs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e2d33629-31c0-4968-8346-db60b0549e58",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Gather the hrefs from every category listing; each listing URL also serves as its own base URL.\n",
+    "all_hrefs = []\n",
+    "for start_url in hosts:\n",
+    "    all_hrefs += fetch_and_extract_hrefs(start_url, start_url)\n",
+    "print(all_hrefs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "658047a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Total number of hrefs collected across all hosts.\n",
+    "len(all_hrefs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c793dba8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_path(url, directory=None):\n",
+    "    \"\"\"Map a page URL to the JSON file path used to store that page.\n",
+    "\n",
+    "    Args:\n",
+    "        url: page URL; its last non-empty path segment becomes the file stem.\n",
+    "        directory: target directory; defaults to the notebook-level `base_dir`.\n",
+    "\n",
+    "    Returns:\n",
+    "        The path `<directory>/<segment>.json`.\n",
+    "    \"\"\"\n",
+    "    if directory is None:\n",
+    "        directory = base_dir\n",
+    "    # Strip trailing slashes before splitting so URLs with and without one map to the\n",
+    "    # same file; indexing [-2] on the raw split is only right when the URL ends in '/'.\n",
+    "    slug = urlparse(url).path.rstrip('/').split('/')[-1]\n",
+    "    return os.path.join(directory, f\"{slug}.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fa244c64-5d40-4d93-ac95-e5fce805b9fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Download each page once; a page whose file is already on disk is not fetched again.\n",
+    "for url in all_hrefs:\n",
+    "    path_file = get_path(url)\n",
+    "    print(path_file)\n",
+    "    if not os.path.exists(path_file):\n",
+    "        status_code, html = get_page(url, delay_seconds)\n",
+    "        if status_code == 200:\n",
+    "            save_page(path_file, url, html)\n",
+    "        else:\n",
+    "            print(\"Error!\", status_code, url)\n",
+    "print(\"End\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "90af220c",
+   "metadata": {},
+   "source": [
+    "## Load"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4399a0c9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# config\n",
+    "input_dir = '../data/raw/pearl_of_great_price'\n",
+    "output_dir = '../data/load/pearl_of_great_price'\n",
+    "\n",
+    "# exist_ok=True replaces the separate exists() check: no check-then-create race,\n",
+    "# and a non-directory sitting at output_dir is reported instead of silently accepted.\n",
+    "os.makedirs(output_dir, exist_ok=True)\n",
+    "\n",
+    "# Date stamp (YYYY-MM-DD) used to name this run's output file.\n",
+    "today = datetime.today().strftime('%Y-%m-%d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2cc10596",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Wrap load_pogp and input_dir in a Loader, run it verbosely, and show how many docs came back.\n",
+    "loader = Loader(load_pogp, input_dir)\n",
+    "docs = loader.load(verbose=True)\n",
+    "len(docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8755943",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Spot-check the first loaded document: its metadata, then its content.\n",
+    "print(docs[0].metadata)\n",
+    "print(docs[0].page_content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "486c3f00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The output file is named after today's date inside output_dir.\n",
+    "output_filename = os.path.join(output_dir, f\"{today}.jsonl\")\n",
+    "\n",
+    "save_docs_to_jsonl(docs, output_filename)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c9c4ec07",
+   "metadata": {},
+   "source": [
+    "## Split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1d47019",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# configure\n",
+    "# configure\n",
+    "# Read the newest export in the load directory instead of a pinned date, so a fresh\n",
+    "# Run All splits what the Load section just wrote. YYYY-MM-DD file names sort\n",
+    "# chronologically, so the last name after sorting is the most recent export.\n",
+    "load_dir = '../data/load/pearl_of_great_price'\n",
+    "load_files = sorted(name for name in os.listdir(load_dir) if name.endswith('.jsonl'))\n",
+    "if not load_files:\n",
+    "    raise FileNotFoundError(f\"No .jsonl files found in {load_dir}; run the Load section first.\")\n",
+    "input_path = os.path.join(load_dir, load_files[-1])\n",
+    "output_dir = '../data/split/pearl_of_great_price/'\n",
+    "# exist_ok=True replaces the separate exists() check and avoids the check-then-create race.\n",
+    "os.makedirs(output_dir, exist_ok=True)\n",
+    "today = datetime.today().strftime('%Y-%m-%d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d40ff1fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the docs back from input_path and show how many there are.\n",
+    "docs = load_docs_from_jsonl(input_path)\n",
+    "len(docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e40b8c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Splitter instance created with its default arguments.\n",
+    "text_splitter = MarkdownSyntacticEmbeddingSplitter()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d26d2e3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Split the docs (verbose) and show the number of resulting splits.\n",
+    "splits = text_splitter.split_documents(docs, verbose=True)\n",
+    "len(splits)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "da9a292c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Preview at most the first ten splits, each followed by a visible separator.\n",
+    "for ix, split in zip(range(10), splits):\n",
+    "    print(ix, split.metadata)\n",
+    "    print(split.page_content, \"\\n!!! SPLIT !!!\\n\", sep=\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56bb464e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the splits to a JSONL file named after today's date inside output_dir.\n",
+    "filename = os.path.join(output_dir, f\"{today}.jsonl\")\n",
+    "save_docs_to_jsonl(splits, filename)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c49eaf4-677e-439c-8d9e-0e494b1d3c4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Document count next to split count.\n",
+    "print(len(docs), len(splits))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "models",
+   "language": "python",
+   "name": "models"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}