Merge pull request #1 from NREL/gb/tests

Gb/tests
grantbuster authored Sep 13, 2023
2 parents ed55963 + 724d45a commit 715cdde
Showing 13 changed files with 476 additions and 37 deletions.
47 changes: 47 additions & 0 deletions .github/workflows/pytest.yml
@@ -0,0 +1,47 @@
name: pytests

on: pull_request

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: ['3.10']
        include:
          - os: ubuntu-latest
            python-version: 3.9
          - os: ubuntu-latest
            python-version: 3.8

    steps:
      - uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          fetch-depth: 1
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install pytest
          python -m pip install pytest-mock
          python -m pip install pytest-cov
          python -m pip install .
      - name: Run pytest and Generate coverage report
        run: |
          python -m pytest -v --disable-warnings --cov=./ --cov-report=xml:coverage.xml
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: ./coverage.xml
          flags: unittests
          env_vars: OS,PYTHON
          name: codecov-umbrella
          fail_ci_if_error: false
          verbose: true
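
For contributors who want to reproduce the CI test step locally, a rough Python equivalent of the pytest invocation above (a minimal sketch; it assumes pytest, pytest-mock, and pytest-cov are installed in the active environment, as in the workflow):

    import pytest

    # Mirrors the workflow's test step: verbose output, warnings suppressed,
    # coverage measured over the repo and written to coverage.xml.
    exit_code = pytest.main([
        "-v", "--disable-warnings",
        "--cov=./", "--cov-report=xml:coverage.xml",
    ])
    raise SystemExit(exit_code)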
4 changes: 4 additions & 0 deletions elm/__init__.py
@@ -3,6 +3,7 @@
Energy Language Model
"""

+import os
from elm.base import ApiBase
from elm.chunk import Chunker
from elm.embed import ChunkAndEmbed
@@ -13,3 +14,6 @@

__author__ = """Grant Buster"""
__email__ = "Grant.Buster@nrel.gov"
+
+ELM_DIR = os.path.dirname(os.path.realpath(__file__))
+TEST_DATA_DIR = os.path.join(os.path.dirname(ELM_DIR), 'tests', 'data')
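
A sketch of how a test might use these new module-level paths (hypothetical usage; GPT-4.pdf is the binary test file added under tests/data in this PR):

    import os
    from elm import TEST_DATA_DIR

    # Resolve shared test assets relative to the repo layout rather than
    # hard-coding paths in each test module.
    pdf_path = os.path.join(TEST_DATA_DIR, 'GPT-4.pdf')
    assert os.path.exists(pdf_path)  # holds in a source checkout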
68 changes: 48 additions & 20 deletions elm/embed.py
@@ -20,29 +20,23 @@ class ChunkAndEmbed(ApiBase):
    DEFAULT_MODEL = 'text-embedding-ada-002'
    """Default model to do embeddings."""

-    def __init__(self, text, tag=None, model=None, tokens_per_chunk=500,
-                 overlap=1):
+    def __init__(self, text, model=None, **chunk_kwargs):
        """
        Parameters
        ----------
        text : str
            Single continuous piece of text to chunk up by paragraph and embed
            or filepath to .txt file containing one piece of text.
-        tag : None | str
-            Optional reference tag to include at the beginning of each text
-            chunk
        model : None | str
            Optional specification of OpenAI model to use. Default is
            cls.DEFAULT_MODEL
-        tokens_per_chunk : float
-            Nominal token count per text chunk
-        overlap : int
-            Number of paragraphs to overlap between chunks
+        chunk_kwargs : dict | None
+            kwargs for initialization of :class:`elm.chunk.Chunker`
        """

        super().__init__(model)

        self.text = text
-        self.tag = tag

        if os.path.isfile(text):
            logger.info('Loading text file: {}'.format(text))
@@ -52,9 +46,7 @@ def __init__(self, text, tag=None, model=None, tokens_per_chunk=500,
        assert isinstance(self.text, str)
        self.text = self.clean_tables(self.text)

-        self.text_chunks = Chunker(self.text, tag=tag,
-                                   tokens_per_chunk=tokens_per_chunk,
-                                   overlap=overlap)
+        self.text_chunks = Chunker(self.text, **chunk_kwargs)

    @staticmethod
    def clean_tables(text):
@@ -81,8 +73,50 @@ def clean_tables(text):

        return '\n'.join(lines)

+    def run(self, rate_limit=175e3):
+        """Run text embedding in serial
+
+        Parameters
+        ----------
+        rate_limit : float
+            OpenAI API rate limit (tokens / minute). Note that the
+            embedding limit is 350k as of 4/2023, but we're using a large
+            factor of safety (~1/2) because we can only count the tokens on the
+            input side and assume the output is about the same count.
+
+        Returns
+        -------
+        embedding : list
+            List of 1D arrays representing the embeddings for all text chunks
+        """
+
+        logger.info('Embedding {} text chunks...'
+                    .format(len(self.text_chunks)))
+
+        embeddings = []
+        for i, chunk in enumerate(self.text_chunks):
+            req = {"input": chunk, "model": self.model}
+
+            if 'azure' in str(openai.api_type).lower():
+                req['engine'] = self.model
+
+            out = self.call_api(self.EMBEDDING_URL, self.HEADERS, req)
+
+            try:
+                out = out['data'][0]['embedding']
+                embeddings.append(out)
+            except Exception:
+                msg = ('Could not get embeddings for chunk {}, '
+                       'received API response: {}'.format(i + 1, out))
+                logger.error(msg)
+                embeddings.append(None)
+
+        logger.info('Finished all embeddings.')
+
+        return embeddings
+
    async def run_async(self, rate_limit=175e3):
-        """Run text embedding
+        """Run text embedding on chunks asynchronously

        NOTE: you need to call this using the await command in ipython or
        jupyter, e.g.: `out = await ChunkAndEmbed.run_async()`
@@ -101,12 +135,6 @@ async def run_async(self, rate_limit=175e3):
            List of 1D arrays representing the embeddings for all text chunks
        """

-        if not isinstance(self.text_chunks, Chunker):
-            msg = ('You must init a Chunker obj with the text before '
-                   'running async embeddings!')
-            logger.error(msg)
-            raise RuntimeError(msg)
-
        logger.info('Embedding {} text chunks...'
                    .format(len(self.text_chunks)))
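
The net effect of the embed.py changes: chunking options now flow through **chunk_kwargs, and a serial run() complements the existing async path. A minimal usage sketch (the file name is hypothetical; tag, tokens_per_chunk, and overlap are Chunker options carried over from the old signature):

    from elm.embed import ChunkAndEmbed

    # The old keyword arguments still work -- they are simply forwarded
    # to elm.chunk.Chunker instead of being named in __init__.
    ce = ChunkAndEmbed('report.txt', tag='report',
                       tokens_per_chunk=500, overlap=1)
    embeddings = ce.run()  # serial; or `out = await ce.run_async()` in jupyter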
16 changes: 4 additions & 12 deletions elm/summary.py
@@ -24,8 +24,7 @@ class Summary(ApiBase):
"""Prefix to the engineered prompt. That `n_words` is an initialization
argument for the Summary class."""

def __init__(self, text, model=None, tokens_per_chunk=500, overlap=1,
split_on='\n\n', n_words=500):
def __init__(self, text, model=None, n_words=500, **chunk_kwargs):
"""
Parameters
----------
@@ -34,18 +33,13 @@ def __init__(self, text, model=None, tokens_per_chunk=500, overlap=1,
            document with empty lines between paragraphs.
        model : str
            GPT model name, default is the DEFAULT_MODEL global var
-        tokens_per_chunk : float
-            Nominal token count per text chunk. Overlap paragraphs will exceed
-            this.
-        overlap : int
-            Number of paragraphs to overlap between chunks
-        split_on : str
-            Sub string to split text into paragraphs.
        n_words : int
            Desired length of the output text. Note that this is never perfect
            but helps guide the LLM to an approximate desired output length.
            400-600 words seems to work quite well with GPT-4. This gets
            formatted into the MODEL_INSTRUCTION attribute.
+        chunk_kwargs : dict | None
+            kwargs for initialization of :class:`elm.chunk.Chunker`
        """

        super().__init__(model)
@@ -60,9 +54,7 @@

        assert isinstance(self.text, str)

-        self.text_chunks = Chunker(self.text,
-                                   tokens_per_chunk=tokens_per_chunk,
-                                   overlap=overlap, split_on=split_on)
+        self.text_chunks = Chunker(self.text, **chunk_kwargs)

    def combine(self, text_summary):
        """Combine separate chunk summaries into one more comprehensive
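
Summary gets the same chunk_kwargs treatment; a sketch (long_text is a placeholder for any document string):

    from elm.summary import Summary

    # n_words remains a named argument; tokens_per_chunk, overlap, and
    # split_on now pass through **chunk_kwargs to elm.chunk.Chunker.
    summ = Summary(long_text, n_words=500,
                   tokens_per_chunk=500, overlap=1, split_on='\n\n')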
15 changes: 15 additions & 0 deletions elm/tree.py
@@ -57,6 +57,7 @@ def __init__(self, graph):
            transition.
        """
        self._g = graph
+        self._history = []
        assert isinstance(self.graph, nx.DiGraph)
        assert 'api' in self.graph.graph

@@ -93,6 +94,16 @@ def all_messages_txt(self):
        messages = '\n\n'.join(messages)
        return messages

+    @property
+    def history(self):
+        """Get a record of the nodes traversed in the tree
+
+        Returns
+        -------
+        list
+        """
+        return self._history
+
    @property
    def graph(self):
        """Get the networkx graph object
@@ -122,6 +133,7 @@ def call_node(self, node0):
        txt_fmt = {k: v for k, v in self.graph.graph.items() if k != 'api'}
        prompt = prompt.format(**txt_fmt)

+        self._history.append(node0)
        out = self.api.chat(prompt)

        successors = list(self.graph.successors(node0))
@@ -168,6 +180,9 @@ def run(self, node0='init'):
        out : str
            Final response from LLM at the leaf node.
        """
+
+        self._history = []
+
        while True:
            try:
                out = self.call_node(node0)
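
With history tracking in place, callers can inspect the traversal after a run; a sketch (assuming the decision-tree class in elm/tree.py is DecisionTree, and `graph` is a networkx DiGraph carrying an 'api' attribute, per the asserts above):

    from elm.tree import DecisionTree

    tree = DecisionTree(graph)
    out = tree.run(node0='init')   # run() now clears self._history first
    print(tree.history)            # nodes visited in order, e.g. ['init', ...]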
6 changes: 1 addition & 5 deletions setup.py
@@ -4,10 +4,6 @@
import os
from codecs import open
from setuptools import setup, find_packages
-from setuptools.command.develop import develop
-from subprocess import check_call
-import shlex
-from warnings import warn

here = os.path.abspath(os.path.dirname(__file__))

Expand All @@ -24,7 +20,7 @@
install_requires = f.readlines()


test_requires = ["pytest>=5.2", ]
test_requires = ["pytest>=5.2", "pytest-mock"]
description = "Energy Language Model"

setup(
Binary file added tests/data/GPT-4.pdf
(6 more changed files not shown)
