"""Graph-driven web-scraping workflow for the ``yosoai`` package.

This module is a readable reconstruction of a git patch whose line structure
was lost.  The patch introduced the following Python files, reproduced below
in dependency order:

    yosoai/nodes/base_node.py              -> BaseNode
    yosoai/nodes/conditional_node.py       -> ConditionalNode
    yosoai/nodes/fetch_html_node.py        -> FetchHTMLNode
    yosoai/nodes/generate_answer_node.py   -> GenerateAnswerNode
    yosoai/nodes/get_probable_tags_node.py -> GetProbableTagsNode
    yosoai/nodes/parse_html_node.py        -> ParseHTMLNode
    yosoai/graphs/base_graph.py            -> BaseGraph
    yosoai/graphs/smart_scraper_graph.py   -> SmartScraper
    examples/graph_example.py              -> main()

The two package ``__init__`` files in the patch only re-export these names
(``yosoai.graphs``: BaseGraph, SmartScraper; ``yosoai.nodes``: the five
concrete node classes).

Packaging changes carried by the same patch (not Python code):

    requirements-dev.txt: add ``setuptools==65.5.1``; add the missing newline
                          at end of file after ``pytest==8.0.0``.
    requirements.txt:     add ``beautifulsoup4==4.12.3``; remove
                          ``pytest==8.0.0`` and ``setuptools==65.5.1``.
    setup.py:             ``packages = ['yosoai']`` becomes
                          ``packages = find_packages()``.

Third-party (langchain) imports are done at function scope so that the graph
primitives can be imported and exercised without langchain installed.
"""

import os
from abc import ABC, abstractmethod


# ---------------------------------------------------------------------------
# yosoai/nodes/base_node.py
# ---------------------------------------------------------------------------

class BaseNode(ABC):
    """Abstract base class for a single step in a graph-based workflow.

    Attributes:
        node_name (str): Identifier used by the graph to reference this node.
        node_type (str): Either "node" (standard step whose successor comes
            from the graph's edges) or "conditional_node" (step whose
            ``execute`` returns the name of the next node).

    Args:
        node_name (str): Identifier for the node.
        node_type (str): "node" or "conditional_node".

    Raises:
        ValueError: If ``node_type`` is not one of the two allowed values.
    """

    def __init__(self, node_name: str, node_type: str):
        """Store the node's name and type after validating the type."""
        # Validate before assigning anything so a failed construction leaves
        # no half-initialised attributes behind.
        if node_type not in ("node", "conditional_node"):
            raise ValueError(f"node_type must be 'node' or 'conditional_node', got '{node_type}'")
        self.node_name = node_name
        self.node_type = node_type

    @abstractmethod
    def execute(self, state):
        """Run the node's logic.

        Args:
            state (dict): The current state of the graph.

        Returns:
            The updated state for standard nodes; conditional nodes return
            the name of the next node instead.
        """


# ---------------------------------------------------------------------------
# yosoai/nodes/conditional_node.py
# ---------------------------------------------------------------------------

class ConditionalNode(BaseNode):
    """Branching node that picks the next node from a key in the state.

    Attributes:
        key_name (str): Key looked up inside ``state["keys"]``.
        next_nodes (list): Exactly two node instances.  The first is chosen
            when the key exists with a non-empty value, the second otherwise.

    Args:
        key_name (str): The key to check in the graph's state.
        next_nodes (list): Two node instances (objects exposing
            ``node_name``), in the order (key present, key missing/empty).
        node_name (str, optional): Identifier for this node.  Defaults to
            "ConditionalNode".

    Raises:
        ValueError: If ``next_nodes`` does not contain exactly two elements.
    """

    def __init__(self, key_name, next_nodes, node_name="ConditionalNode"):
        """Initialise the node with the key to test and its two successors."""
        super().__init__(node_name, "conditional_node")
        self.key_name = key_name
        if len(next_nodes) != 2:
            raise ValueError("next_nodes must contain exactly two elements.")
        self.next_nodes = next_nodes

    def execute(self, state):
        """Return the name of the next node to execute.

        Args:
            state (dict): The current state of the graph.  A missing "keys"
                entry is treated the same as a missing key.

        Returns:
            str: ``next_nodes[0].node_name`` when the key holds a non-empty
            value, ``next_nodes[1].node_name`` otherwise.
        """
        value = state.get("keys", {}).get(self.key_name)
        try:
            has_content = len(value) > 0
        except TypeError:
            # None and other unsized values (e.g. numbers) have no len();
            # fall back to truthiness instead of crashing the whole graph.
            has_content = bool(value)

        chosen = self.next_nodes[0] if has_content else self.next_nodes[1]
        return chosen.node_name


# ---------------------------------------------------------------------------
# yosoai/nodes/fetch_html_node.py
# ---------------------------------------------------------------------------

class FetchHTMLNode(BaseNode):
    """Node that loads the page at ``state["keys"]["url"]``.

    The result of ``AsyncHtmlLoader(url).load()`` is stored under
    ``state["keys"]["document"]`` for the nodes that follow.

    Args:
        node_name (str): Identifier for the node.
        node_type (str, optional): "node" or "conditional_node".  Defaults to
            "node".
    """

    def __init__(self, node_name, node_type="node"):
        """Initialise the node with a name and a node type."""
        super().__init__(node_name, node_type)

    def execute(self, state):
        """Fetch the URL found in the state and store the loaded document.

        Args:
            state (dict): Current graph state; must contain
                ``state["keys"]["url"]``.

        Returns:
            dict: The same state object with ``state["keys"]["document"]``
            set to whatever the loader returned.

        Raises:
            KeyError: If "keys" or "url" is missing from the state.
        """
        try:
            url = state["keys"]["url"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

        # Imported lazily: only this method needs langchain_community.
        from langchain_community.document_loaders import AsyncHtmlLoader

        loader = AsyncHtmlLoader(url)
        state["keys"]["document"] = loader.load()
        return state


# ---------------------------------------------------------------------------
# yosoai/nodes/generate_answer_node.py
# ---------------------------------------------------------------------------

class GenerateAnswerNode(BaseNode):
    """Node that asks the language model to answer the user's question.

    Attributes:
        llm: Language model client placed in the middle of the
            prompt | llm | parser chain (e.g. a ChatOpenAI instance).

    Args:
        llm: The language model client.
        node_name (str, optional): Identifier for the node.  Defaults to
            "GenerateAnswerNode".
    """

    def __init__(self, llm, node_name="GenerateAnswerNode"):
        """Initialise the node with a language model client and a name."""
        super().__init__(node_name, "node")
        self.llm = llm

    def execute(self, state):
        """Build the prompt, query the model and store the parsed answer.

        The context given to the model is, in order of preference,
        ``relevant_chunks``, ``parsed_document`` or the raw ``document``.

        Args:
            state (dict): Current graph state; ``state["keys"]`` must contain
                "user_input" and "document", and may contain
                "parsed_document" and "relevant_chunks".

        Returns:
            dict: The same state object with ``state["keys"]["answer"]`` set
            to the output of the JSON output parser.

        Raises:
            KeyError: If "user_input" or "document" is missing.
        """
        print("---GENERATE ANSWER---")
        try:
            user_input = state["keys"]["user_input"]
            document = state["keys"]["document"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

        parsed_document = state["keys"].get("parsed_document", None)
        relevant_chunks = state["keys"].get("relevant_chunks", None)

        # Most specific context wins; the raw document is the last resort.
        if relevant_chunks:
            context = relevant_chunks
        elif parsed_document:
            context = parsed_document
        else:
            context = document

        # Imported lazily: only this method needs langchain.
        from langchain.prompts import PromptTemplate
        from langchain_core.output_parsers import JsonOutputParser

        output_parser = JsonOutputParser()
        format_instructions = output_parser.get_format_instructions()

        template = """You are a website scraper and you have just scraped the following content from a website. You are now asked to answer a question about the content you have scraped.\n {format_instructions} \n The content is as follows:
        {context}

        Question: {question}
        """

        schema_prompt = PromptTemplate(
            template=template,
            input_variables=["context", "question"],
            partial_variables={"format_instructions": format_instructions},
        )

        schema_chain = schema_prompt | self.llm | output_parser
        answer = schema_chain.invoke({"context": context, "question": user_input})

        state["keys"].update({"answer": answer})
        return state


# ---------------------------------------------------------------------------
# yosoai/nodes/get_probable_tags_node.py
# ---------------------------------------------------------------------------

class GetProbableTagsNode(BaseNode):
    """Node that asks the language model which HTML tags to look at.

    Attributes:
        llm: Language model client placed in the middle of the
            prompt | llm | parser chain.

    Args:
        llm: The language model client.
        node_name (str, optional): Identifier for the node.  Defaults to
            "GetProbableTagsNode".
    """

    def __init__(self, llm, node_name="GetProbableTagsNode"):
        """Initialise the node with a language model client and a name."""
        super().__init__(node_name, "node")
        self.llm = llm

    def execute(self, state):
        """Query the model for probable tags and store them in the state.

        Args:
            state (dict): Current graph state; ``state["keys"]`` must contain
                "user_input" and "url".

        Returns:
            dict: The same state object with ``state["keys"]["tags"]`` set to
            the output of the comma-separated-list parser.

        Raises:
            KeyError: If "user_input" or "url" is missing.
        """
        print("---GET PROBABLE TAGS---")
        try:
            user_input = state["keys"]["user_input"]
            url = state["keys"]["url"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

        # Imported lazily: only this method needs langchain.
        from langchain.prompts import PromptTemplate
        from langchain.output_parsers import CommaSeparatedListOutputParser

        output_parser = CommaSeparatedListOutputParser()
        format_instructions = output_parser.get_format_instructions()

        template = """You are a website scraper that knows all the types of html tags. You are now asked to list all the html tags where you think you can find the information of the asked question.\n {format_instructions} \n The webpage is: {webpage} \n The asked question is the following:
        {question}
        """

        tag_prompt = PromptTemplate(
            template=template,
            input_variables=["question"],
            partial_variables={"format_instructions": format_instructions, "webpage": url},
        )

        tag_answer = tag_prompt | self.llm | output_parser
        probable_tags = tag_answer.invoke({"question": user_input})

        print("Possible tags: ", *probable_tags)

        state["keys"].update({"tags": probable_tags})
        return state


# ---------------------------------------------------------------------------
# yosoai/nodes/parse_html_node.py
# ---------------------------------------------------------------------------

class ParseHTMLNode(BaseNode):
    """Node that narrows the fetched document down to the probable tags.

    Args:
        node_name (str, optional): Identifier for the node.  Defaults to
            "ParseHTMLNode".
    """

    def __init__(self, node_name="ParseHTMLNode"):
        """Initialise the node with a name."""
        super().__init__(node_name, "node")

    def execute(self, state):
        """Extract the tagged parts of the document when tags are available.

        Args:
            state (dict): Current graph state; ``state["keys"]`` must contain
                "document" and may contain "tags".

        Returns:
            dict: The same state object.  When tags are present,
            ``state["keys"]["parsed_document"]`` holds the result of
            ``BeautifulSoupTransformer.transform_documents``.  When they are
            absent or empty the state is returned untouched (no
            "parsed_document" key is written).

        Raises:
            KeyError: If "document" is missing.
        """
        print("---PARSE HTML DOCUMENT---")
        try:
            document = state["keys"]["document"]
        except KeyError as e:
            print(f"Error: {e} not found in state.")
            raise

        tags = state["keys"].get("tags", None)
        if not tags:
            print("No specific tags provided; returning document as is.")
            return state

        # Imported lazily: only this branch needs langchain_community.
        from langchain_community.document_transformers import BeautifulSoupTransformer

        bs_transformer = BeautifulSoupTransformer()
        parsed_document = bs_transformer.transform_documents(document, tags_to_extract=tags)
        print("Document parsed with specified tags.")

        state["keys"].update({"parsed_document": parsed_document})
        return state


# ---------------------------------------------------------------------------
# yosoai/graphs/base_graph.py
# ---------------------------------------------------------------------------

class BaseGraph:
    """Runs a graph of nodes, following edges from an entry point.

    Attributes:
        nodes (dict): Maps each node's ``node_name`` to the node instance.
        edges (dict): Maps a from-node name to its single to-node name.
        entry_point (str): Name of the node where execution starts.

    Args:
        nodes (iterable): Node instances that make up the graph.
        edges (iterable): ``(from_node, to_node)`` pairs of node instances.
        entry_point (BaseNode): The node instance where execution starts.
    """

    def __init__(self, nodes, edges, entry_point):
        """Index the nodes by name and record the edges and entry point."""
        self.nodes = {node.node_name: node for node in nodes}
        self.edges = self._create_edges(edges)
        self.entry_point = entry_point.node_name

    def _create_edges(self, edges):
        """Turn ``(from_node, to_node)`` pairs into a name-to-name mapping.

        Each node keeps one outgoing edge: if the same from-node appears in
        several pairs, the pair iterated last wins.

        Args:
            edges (iterable): Pairs of node instances.

        Returns:
            dict: From-node names mapped to to-node names.
        """
        return {from_node.node_name: to_node.node_name for from_node, to_node in edges}

    def execute(self, initial_state):
        """Walk the graph from the entry point until a node has no successor.

        A conditional node's return value is taken as the name of the next
        node.  For every other node the next node comes from ``edges``, and a
        dict returned by the node becomes the current state (so nodes may
        either mutate the state in place or return a fresh one).

        Args:
            initial_state (dict): State handed to the entry point node.

        Returns:
            dict: The state after the last node has run.

        Raises:
            KeyError: If execution reaches a node name that is not part of
                ``nodes``.
        """
        state = initial_state
        current_node_name = self.entry_point

        while current_node_name is not None:
            try:
                current_node = self.nodes[current_node_name]
            except KeyError:
                raise KeyError(
                    f"Node '{current_node_name}' is referenced by the graph but was never registered."
                ) from None

            result = current_node.execute(state)

            if current_node.node_type == "conditional_node":
                # The conditional node itself decides where to go next.
                current_node_name = result
                continue

            if isinstance(result, dict):
                # Honour nodes that return a new state instead of mutating.
                state = result
            # No outgoing edge -> None -> the loop ends.
            current_node_name = self.edges.get(current_node_name)

        return state


# ---------------------------------------------------------------------------
# yosoai/graphs/smart_scraper_graph.py
# ---------------------------------------------------------------------------

class SmartScraper:
    """Answers a natural-language prompt about a web page.

    Attributes:
        prompt (str): The user's question.
        url (str): The page to scrape.
        llm_config (dict): Language model options; must contain 'api_key'.
        llm: The ChatOpenAI client built from ``llm_config``.
        graph (BaseGraph): The scraping workflow.

    Args:
        prompt (str): The user's question.
        url (str): The page to scrape.
        llm_config (dict): Must include 'api_key'; any other entry (for
            example 'model_name', 'temperature', 'streaming') overrides the
            corresponding default.
    """

    def __init__(self, prompt, url, llm_config):
        """Store the inputs, then build the language model and the graph."""
        self.prompt = prompt
        self.url = url
        self.llm_config = llm_config
        self.llm = self._create_llm()
        self.graph = self._create_graph()

    def _create_llm(self):
        """Build the ChatOpenAI client from defaults plus ``llm_config``.

        Returns:
            ChatOpenAI: The configured client.

        Raises:
            ValueError: If 'api_key' is not provided in ``llm_config``.
        """
        llm_defaults = {
            "model_name": "gpt-3.5-turbo",
            "temperature": 0,
            "streaming": True,
        }
        # Caller-supplied values take precedence over the defaults.
        llm_params = {**llm_defaults, **self.llm_config}
        if "api_key" not in llm_params:
            raise ValueError("LLM configuration must include an 'api_key'.")

        # Imported lazily: only this method needs langchain_openai.
        from langchain_openai import ChatOpenAI

        return ChatOpenAI(**llm_params)

    def _create_graph(self):
        """Assemble the scraping workflow.

        Flow: fetch_html -> get_probable_tags -> conditional, which goes to
        parse_document (then generate_answer) when the state holds non-empty
        "tags", and straight to generate_answer otherwise.

        Returns:
            BaseGraph: The assembled graph, entered at fetch_html.
        """
        fetch_html_node = FetchHTMLNode("fetch_html")
        get_probable_tags_node = GetProbableTagsNode(self.llm, "get_probable_tags")
        parse_document_node = ParseHTMLNode("parse_document")
        generate_answer_node = GenerateAnswerNode(self.llm, "generate_answer")
        # ConditionalNode takes (key_name, next_nodes, node_name): branch on
        # the "tags" key written by get_probable_tags and register the node
        # under the name "conditional".
        conditional_node = ConditionalNode(
            "tags",
            [parse_document_node, generate_answer_node],
            "conditional",
        )

        return BaseGraph(
            nodes=[
                fetch_html_node,
                get_probable_tags_node,
                conditional_node,
                parse_document_node,
                generate_answer_node,
            ],
            edges=[
                (fetch_html_node, get_probable_tags_node),
                (get_probable_tags_node, conditional_node),
                (parse_document_node, generate_answer_node),
            ],
            entry_point=fetch_html_node,
        )

    def run(self):
        """Execute the graph and return the answer.

        Returns:
            The value stored under ``state["keys"]["answer"]`` by the graph,
            or the string "No answer found." when no answer was produced.
        """
        inputs = {"keys": {"user_input": self.prompt, "url": self.url}}
        final_state = self.graph.execute(inputs)
        return final_state["keys"].get("answer", "No answer found.")


# ---------------------------------------------------------------------------
# examples/graph_example.py
# ---------------------------------------------------------------------------

# Taken from the environment so the key never has to be written into the
# source; falls back to the empty placeholder the example shipped with.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")


def main():
    """Run the example: scrape a projects page and print the answer."""
    llm_config = {
        "api_key": OPENAI_API_KEY,
        "model_name": "gpt-3.5-turbo",
    }

    url = "https://perinim.github.io/projects/"
    prompt = "List me all the titles and project descriptions"

    smart_scraper = SmartScraper(prompt, url, llm_config)

    answer = smart_scraper.run()
    print(answer)


if __name__ == "__main__":
    main()