Improve summary chunking #397

Merged · 1 commit · Apr 30, 2024
37 changes: 24 additions & 13 deletions recipes/natural_language_processing/summarizer/app/summarizer.py
@@ -3,6 +3,7 @@
 from langchain.prompts import PromptTemplate
 from langchain_community.callbacks import StreamlitCallbackHandler
 from langchain_community.document_loaders import PyMuPDFLoader
+from langchain_text_splitters import RecursiveCharacterTextSplitter
 from rouge_score import rouge_scorer
 import streamlit as st
 import tempfile
@@ -34,22 +35,32 @@ def checking_model_service():
 with st.spinner("Checking Model Service Availability..."):
     checking_model_service()

+def split_append_chunk(chunk, list):
+    chunk_length = len(chunk)
+    chunk1 = " ".join(chunk.split()[:chunk_length])
+    chunk2 = " ".join(chunk.split()[chunk_length:])
+    list.extend([chunk1, chunk2])
+
 def chunk_text(text):
     chunks = []
-    chunk_size = 1024
-    tokens = requests.post(f"{model_service[:-2]}extras/tokenize/",
-                           json={"input":text}).content
-    tokens = json.loads(tokens)["tokens"]
-    num_tokens = len(tokens)
-    num_chunks = (num_tokens//chunk_size)+1
-    for i in range(num_chunks):
-        chunk = tokens[:chunk_size]
-        chunk = requests.post(f"{model_service[:-2]}extras/detokenize/",
-                              json={"tokens":chunk}).content
-        chunk = json.loads(chunk)["text"]
-        chunks.append(chunk)
-        tokens = tokens[chunk_size:]
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=3048,
+        chunk_overlap=0,
+        length_function=len,
+        is_separator_regex=False
+    )
+
+    text_chunks = text_splitter.create_documents([text])
+    for chunk in text_chunks:
+        chunk = chunk.page_content
+        count = requests.post(f"{model_service[:-2]}extras/tokenize/count",
+                              json={"input":chunk}).content
+        count = json.loads(count)["count"]
+        if count >= 2048:
+            split_append_chunk(chunk, chunks)
+        else:
+            chunks.append(chunk)

     return chunks

 def read_file(file):
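
For context, here is a minimal standalone sketch of the chunking flow this change introduces, outside the Streamlit app: split the input with RecursiveCharacterTextSplitter, ask the model service for a token count per chunk, and fall back to splitting any chunk that is still too large. The MODEL_SERVICE_BASE URL is a placeholder, and the fallback here halves the chunk's word list, which is one reading of split_append_chunk's intent; treat this as an illustration under those assumptions, not the merged implementation.

# Sketch of the chunking flow from this PR (not the merged code verbatim).
# Assumptions: MODEL_SERVICE_BASE is a placeholder for the model service root,
# and the service exposes POST {base}extras/tokenize/count returning
# {"count": <int>} (the same endpoint the diff calls).
import json
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter

MODEL_SERVICE_BASE = "http://localhost:8001/"  # placeholder, not from the PR

def token_count(text: str) -> int:
    # Ask the model service how many tokens a chunk occupies.
    resp = requests.post(f"{MODEL_SERVICE_BASE}extras/tokenize/count",
                         json={"input": text})
    return json.loads(resp.content)["count"]

def chunk_text(text: str, char_chunk_size: int = 3048,
               token_limit: int = 2048) -> list[str]:
    # First pass: character-based splitting, mirroring the PR's splitter settings.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=char_chunk_size,
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = []
    for doc in splitter.create_documents([text]):
        chunk = doc.page_content
        if token_count(chunk) >= token_limit:
            # Fallback for chunks still oversized in tokens: split the word
            # list in half (an assumption about split_append_chunk's intent).
            words = chunk.split()
            mid = len(words) // 2
            chunks.extend([" ".join(words[:mid]), " ".join(words[mid:])])
        else:
            chunks.append(chunk)
    return chunks

if __name__ == "__main__":
    # Requires a running model service; otherwise token_count will raise.
    print(chunk_text("Some long document text. " * 500))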