Merge pull request #1 from NREL/gb/tests

Gb/tests
grantbuster authored Sep 13, 2023
2 parents ed55963 + 724d45a commit 715cdde
Showing 13 changed files with 476 additions and 37 deletions.
47 changes: 47 additions & 0 deletions .github/workflows/pytest.yml
@@ -0,0 +1,47 @@
name: pytests

on: pull_request

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: ['3.10']
        include:
          - os: ubuntu-latest
            python-version: 3.9
          - os: ubuntu-latest
            python-version: 3.8

    steps:
      - uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          fetch-depth: 1
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install pytest
          python -m pip install pytest-mock
          python -m pip install pytest-cov
          python -m pip install .
      - name: Run pytest and Generate coverage report
        run: |
          python -m pytest -v --disable-warnings --cov=./ --cov-report=xml:coverage.xml
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml
          flags: unittests
          env_vars: OS,PYTHON
          name: codecov-umbrella
          fail_ci_if_error: false
          verbose: true
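
For contributors who want to reproduce the CI test step locally, a rough Python equivalent of the pytest invocation above (a minimal sketch; it assumes pytest, pytest-mock, and pytest-cov are installed in the active environment, as in the workflow):

    import pytest

    # Mirrors the workflow's test step: verbose output, warnings suppressed,
    # coverage measured over the repo and written to coverage.xml.
    exit_code = pytest.main([
        "-v", "--disable-warnings",
        "--cov=./", "--cov-report=xml:coverage.xml",
    ])
    raise SystemExit(exit_code)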
4 changes: 4 additions & 0 deletions elm/__init__.py
@@ -3,6 +3,7 @@
Energy Language Model
"""

+import os
from elm.base import ApiBase
from elm.chunk import Chunker
from elm.embed import ChunkAndEmbed
@@ -13,3 +14,6 @@

__author__ = """Grant Buster"""
__email__ = "Grant.Buster@nrel.gov"
+
+ELM_DIR = os.path.dirname(os.path.realpath(__file__))
+TEST_DATA_DIR = os.path.join(os.path.dirname(ELM_DIR), 'tests', 'data')
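
A sketch of how a test might use these new module-level paths (hypothetical usage; GPT-4.pdf is the binary test file added under tests/data in this PR):

    import os
    from elm import TEST_DATA_DIR

    # Resolve shared test assets relative to the repo layout rather than
    # hard-coding paths in each test module.
    pdf_path = os.path.join(TEST_DATA_DIR, 'GPT-4.pdf')
    assert os.path.exists(pdf_path)  # holds in a source checkout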
68 changes: 48 additions & 20 deletions elm/embed.py
@@ -20,29 +20,23 @@ class ChunkAndEmbed(ApiBase):
    DEFAULT_MODEL = 'text-embedding-ada-002'
    """Default model to do embeddings."""

-    def __init__(self, text, tag=None, model=None, tokens_per_chunk=500,
-                 overlap=1):
+    def __init__(self, text, model=None, **chunk_kwargs):
        """
        Parameters
        ----------
        text : str
            Single continuous piece of text to chunk up by paragraph and embed
            or filepath to .txt file containing one piece of text.
-        tag : None | str
-            Optional reference tag to include at the beginning of each text
-            chunk
        model : None | str
            Optional specification of OpenAI model to use. Default is
            cls.DEFAULT_MODEL
-        tokens_per_chunk : float
-            Nominal token count per text chunk
-        overlap : int
-            Number of paragraphs to overlap between chunks
+        chunk_kwargs : dict | None
+            kwargs for initialization of :class:`elm.chunk.Chunker`
        """

        super().__init__(model)

        self.text = text
-        self.tag = tag

        if os.path.isfile(text):
            logger.info('Loading text file: {}'.format(text))
@@ -52,9 +46,7 @@ def __init__(self, text, tag=None, model=None, tokens_per_chunk=500,
        assert isinstance(self.text, str)
        self.text = self.clean_tables(self.text)

-        self.text_chunks = Chunker(self.text, tag=tag,
-                                   tokens_per_chunk=tokens_per_chunk,
-                                   overlap=overlap)
+        self.text_chunks = Chunker(self.text, **chunk_kwargs)

    @staticmethod
    def clean_tables(text):
@@ -81,8 +73,50 @@ def clean_tables(text):

        return '\n'.join(lines)

+    def run(self, rate_limit=175e3):
+        """Run text embedding in serial
+
+        Parameters
+        ----------
+        rate_limit : float
+            OpenAI API rate limit (tokens / minute). Note that the
+            embedding limit is 350k as of 4/2023, but we're using a large
+            factor of safety (~1/2) because we can only count the tokens on the
+            input side and assume the output is about the same count.
+
+        Returns
+        -------
+        embedding : list
+            List of 1D arrays representing the embeddings for all text chunks
+        """
+
+        logger.info('Embedding {} text chunks...'
+                    .format(len(self.text_chunks)))
+
+        embeddings = []
+        for i, chunk in enumerate(self.text_chunks):
+            req = {"input": chunk, "model": self.model}
+
+            if 'azure' in str(openai.api_type).lower():
+                req['engine'] = self.model
+
+            out = self.call_api(self.EMBEDDING_URL, self.HEADERS, req)
+
+            try:
+                out = out['data'][0]['embedding']
+                embeddings.append(out)
+            except Exception:
+                msg = ('Could not get embeddings for chunk {}, '
+                       'received API response: {}'.format(i + 1, out))
+                logger.error(msg)
+                embeddings.append(None)
+
+        logger.info('Finished all embeddings.')
+
+        return embeddings
+
    async def run_async(self, rate_limit=175e3):
-        """Run text embedding
+        """Run text embedding on chunks asynchronously

        NOTE: you need to call this using the await command in ipython or
        jupyter, e.g.: `out = await ChunkAndEmbed.run_async()`
@@ -101,12 +135,6 @@ async def run_async(self, rate_limit=175e3):
            List of 1D arrays representing the embeddings for all text chunks
        """

-        if not isinstance(self.text_chunks, Chunker):
-            msg = ('You must init a Chunker obj with the text before '
-                   'running async embeddings!')
-            logger.error(msg)
-            raise RuntimeError(msg)
-
        logger.info('Embedding {} text chunks...'
                    .format(len(self.text_chunks)))
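
The net effect of the embed.py changes: chunking options now flow through **chunk_kwargs, and a serial run() complements the existing async path. A minimal usage sketch (the file name is hypothetical; tag, tokens_per_chunk, and overlap are Chunker options carried over from the old signature):

    from elm.embed import ChunkAndEmbed

    # The old keyword arguments still work -- they are simply forwarded
    # to elm.chunk.Chunker instead of being named in __init__.
    ce = ChunkAndEmbed('report.txt', tag='report',
                       tokens_per_chunk=500, overlap=1)
    embeddings = ce.run()  # serial; or `out = await ce.run_async()` in jupyter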
16 changes: 4 additions & 12 deletions elm/summary.py
@@ -24,8 +24,7 @@ class Summary(ApiBase):
"""Prefix to the engineered prompt. That `n_words` is an initialization
argument for the Summary class."""

def __init__(self, text, model=None, tokens_per_chunk=500, overlap=1,
split_on='\n\n', n_words=500):
def __init__(self, text, model=None, n_words=500, **chunk_kwargs):
"""
Parameters
----------
@@ -34,18 +33,13 @@ def __init__(self, text, model=None, tokens_per_chunk=500, overlap=1,
            document with empty lines between paragraphs.
        model : str
            GPT model name, default is the DEFAULT_MODEL global var
-        tokens_per_chunk : float
-            Nominal token count per text chunk. Overlap paragraphs will exceed
-            this.
-        overlap : int
-            Number of paragraphs to overlap between chunks
-        split_on : str
-            Sub string to split text into paragraphs.
        n_words : int
            Desired length of the output text. Note that this is never perfect
            but helps guide the LLM to an approximate desired output length.
            400-600 words seems to work quite well with GPT-4. This gets
            formatted into the MODEL_INSTRUCTION attribute.
+        chunk_kwargs : dict | None
+            kwargs for initialization of :class:`elm.chunk.Chunker`
        """

        super().__init__(model)
@@ -60,9 +54,7 @@

        assert isinstance(self.text, str)

-        self.text_chunks = Chunker(self.text,
-                                   tokens_per_chunk=tokens_per_chunk,
-                                   overlap=overlap, split_on=split_on)
+        self.text_chunks = Chunker(self.text, **chunk_kwargs)

    def combine(self, text_summary):
        """Combine separate chunk summaries into one more comprehensive
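
Summary gets the same chunk_kwargs treatment; a sketch (long_text is a placeholder for any document string):

    from elm.summary import Summary

    # n_words remains a named argument; tokens_per_chunk, overlap, and
    # split_on now pass through **chunk_kwargs to elm.chunk.Chunker.
    summ = Summary(long_text, n_words=500,
                   tokens_per_chunk=500, overlap=1, split_on='\n\n')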
15 changes: 15 additions & 0 deletions elm/tree.py
@@ -57,6 +57,7 @@ def __init__(self, graph):
            transition.
        """
        self._g = graph
+        self._history = []
        assert isinstance(self.graph, nx.DiGraph)
        assert 'api' in self.graph.graph

@@ -93,6 +94,16 @@ def all_messages_txt(self):
        messages = '\n\n'.join(messages)
        return messages

+    @property
+    def history(self):
+        """Get a record of the nodes traversed in the tree
+
+        Returns
+        -------
+        list
+        """
+        return self._history
+
    @property
    def graph(self):
        """Get the networkx graph object
@@ -122,6 +133,7 @@ def call_node(self, node0):
        txt_fmt = {k: v for k, v in self.graph.graph.items() if k != 'api'}
        prompt = prompt.format(**txt_fmt)

+        self._history.append(node0)
        out = self.api.chat(prompt)

        successors = list(self.graph.successors(node0))
@@ -168,6 +180,9 @@ def run(self, node0='init'):
        out : str
            Final response from LLM at the leaf node.
        """
+
+        self._history = []
+
        while True:
            try:
                out = self.call_node(node0)
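
With history tracking in place, callers can inspect the traversal after a run; a sketch (assuming the decision-tree class in elm/tree.py is DecisionTree, and `graph` is a networkx DiGraph carrying an 'api' attribute, per the asserts above):

    from elm.tree import DecisionTree

    tree = DecisionTree(graph)
    out = tree.run(node0='init')   # run() now clears self._history first
    print(tree.history)            # nodes visited in order, e.g. ['init', ...]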
6 changes: 1 addition & 5 deletions setup.py
@@ -4,10 +4,6 @@
import os
from codecs import open
from setuptools import setup, find_packages
-from setuptools.command.develop import develop
-from subprocess import check_call
-import shlex
-from warnings import warn

here = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -24,7 +20,7 @@
install_requires = f.readlines()


test_requires = ["pytest>=5.2", ]
test_requires = ["pytest>=5.2", "pytest-mock"]
description = "Energy Language Model"

setup(
Binary file added tests/data/GPT-4.pdf
(6 more changed files not shown)
