-
-
Notifications
You must be signed in to change notification settings - Fork 602
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
10 additions
and
306 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,135 +1,7 @@ | ||
--- | ||
description: "Self-Refine involves getting LLMs to iteratively generate new responses based on feedback until a stopping condition is met" | ||
--- | ||
|
||
Self Refine <sup><a href="https://arxiv.org/pdf/2303.17651">1</a></sup> involves prompting an LLM to provide feedback on an answer. This iterative process continues until a stopping condition is met. | ||
|
||
We can implement this using `instructor` with a validation context, as seen below. | ||
|
||
```python hl_lines="25-27 56-65" | ||
import instructor | ||
from openai import OpenAI | ||
from pydantic import BaseModel, field_validator, ValidationInfo | ||
from typing import Literal | ||
|
||
client = instructor.from_openai(OpenAI()) | ||
|
||
|
||
class Sentiment(BaseModel):
    """A rewritten statement whose sentiment is checked against a target.

    The validator asks a second LLM to compare the candidate text with a
    reference statement; any failure raises so `instructor` can retry.
    """

    text: str

    @field_validator("text")
    @classmethod
    def validate_text(cls, v: str, info: ValidationInfo) -> str:
        # Reject empty/whitespace-only text before spending an LLM call.
        if not v or not v.strip():
            raise ValueError("Text must not be empty or whitespace")

        comparison = validate_sentiment(
            v,
            info.context["reference_statement"],  # type: ignore
            info.context["sentiment"],  # type: ignore
        )

        # The reference statement is passed as "Review B"; if it wins the
        # comparison, the candidate is not aligned well enough.
        if comparison.alignment_result == "Review B":
            raise ValueError(
                f"""{comparison.feedback}. Please modify
                your statement to be more aligned with the target sentiment
                and do not copy the statement provided for reference"""
            )

        # Copying the reference verbatim is not an acceptable answer.
        if v == info.context["reference_statement"]:  # type: ignore
            raise ValueError(
                """Your statement is the same as the reference statement.
                It should be a separate statement from the reference
                statement."""
            )

        return v
title: "" | ||
description: "" | ||
keywords: "" | ||
|
||
--- | ||
|
||
class PairwiseEvaluation(BaseModel):
    """Verdict from comparing two reviews against a target sentiment."""

    # Short explanation generated before the verdict (chain of thought).
    feedback: str
    # Which review better matches the target sentiment.
    alignment_result: Literal[
        "Review A",
        "Review B",
        "Both",
    ]
|
||
|
||
def validate_sentiment(review_a: str, review_b: str, target_sentiment: str):
    """Ask the model which review better matches ``target_sentiment``.

    Returns a ``PairwiseEvaluation``. The answer set offered in the prompt
    now matches the ``alignment_result`` Literal exactly — the original
    prompt offered ['Review A', 'Review B', 'both', 'neither'], but the
    response schema only accepts 'Review A', 'Review B' or 'Both', so the
    model could be steered toward values the schema rejects.
    """
    return client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": f"""
                Which review is aligned with the sentiment
                {target_sentiment}?
                Review A: {review_a}
                Review B: {review_b}.
                Pick your answer from ['Review A', 'Review B',
                'Both']. Generate a short explanation for your choice
                first. Then generate your response on which review is more
                aligned
                """,
            }
        ],
        response_model=PairwiseEvaluation,
    )
|
||
|
||
def generate_sentiment_analysis(
    initial_statement: str, target_sentiment: str, reference_statement: str
) -> Sentiment:
    """Rewrite ``initial_statement`` toward ``target_sentiment``.

    The validation context feeds ``Sentiment``'s validator, which compares
    each attempt against ``reference_statement``; up to 5 retries are made
    before giving up.
    """
    system_message = {
        "role": "system",
        "content": """
        You are an expert at sentiment analysis.
        Rewrite a statement so that it is more
        closely aligned with the target sentiment.
        """,
    }
    user_message = {
        "role": "user",
        "content": f"""
        The statement is {initial_statement} and
        the desired target sentiment is
        {target_sentiment}
        """,
    }
    return client.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        response_model=Sentiment,
        validation_context={
            "sentiment": target_sentiment,
            "reference_statement": reference_statement,
        },
        max_retries=5,
    )
|
||
|
||
if __name__ == "__main__":
    # Positive review to be rewritten with the opposite sentiment.
    positive_review = """The food was fantastic, with every dish
    surpassing our expectations in terms of flavor,
    presentation, and overall dining experience."""
    # Known-negative phrasing, used only as a reference during validation.
    negative_reference = """The food was awful, with each dish failing to
    meet even the most basic standards of taste,
    quality, and presentation, resulting in a highly
    disappointing dining experience."""

    aligned_sentiment = generate_sentiment_analysis(
        positive_review,
        "Negative",
        negative_reference,
    )
    print(aligned_sentiment)
    """
    text = 'The food was terrible, with every dish failing to meet our
    expectations in terms of flavor, presentation, and overall dining
    experience.'
    """
``` | ||
|
||
### References | ||
|
||
<sup id="ref-1">1</sup>: [Self-Refine: Iterative Refinement with Self-Feedback](https://arxiv.org/pdf/2303.17651) | ||
[wip] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,175 +1,7 @@ | ||
--- | ||
description: "Self Verification involves getting language models to generate a candidate response before evaluating each individual intermediate reasoning step to verify if its logical entailment holds" | ||
--- | ||
|
||
We can verify the correctness of the reasoning steps taken by our Large Language Model by rewriting them as logical entailments. This enables us to use an LLM to check if the original statement can be derived from the new logical entailment. | ||
|
||
By doing this, we can score each reasoning step and obtain a metric for the quality of the response. This process is known as Self Verification <sup><a href="https://arxiv.org/pdf/2212.09561">1</a></sup> | ||
|
||
We can scale this out to multiple candidate solutions to choose the best solution. | ||
|
||
```python hl_lines="21-24 34-38 56-83 95-98" | ||
import instructor | ||
from openai import AsyncOpenAI | ||
from pydantic import BaseModel, Field | ||
import asyncio | ||
|
||
client = instructor.from_openai(AsyncOpenAI()) | ||
|
||
|
||
class SelfVerification(BaseModel):
    """Verdict on whether a rewritten reasoning step holds."""

    # Explanation produced before committing to the verdict.
    chain_of_thought: str
    # True when the step is judged to be valid reasoning.
    is_valid_reasoning_step: bool
|
||
|
||
class RewrittenValidationStep(BaseModel):
    """A reasoning step rewritten as a verifiable logical entailment."""

    # Explanation of how the rewrite was derived.
    chain_of_thought: str
    # The entailment form that a verifier model will check.
    rewritten_reasoning_step: str
|
||
|
||
class Response(BaseModel):
    """Final answer together with the reasoning that produced it."""

    # Ordered steps leading to the answer; the description steers the
    # model toward concrete figures and calculations.
    reasoning_steps: list[str] = Field(
        description="""Logic reasoning steps that allow
        us to arrive at the final answer. Make sure to
        include specific figures and calculations that
        allow you to derive the final correct answer""",
    )
    correct_answer: str
|
||
|
||
def generate_reasoning_and_response(query: str):
    """Request a ``Response`` (reasoning steps + answer) for ``query``."""
    system_prompt = """You are an expert AI question
    answering system. Make sure to generate a
    list of reasoning steps that are consistent
    and logical before generating your final
    response."""
    return client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        response_model=Response,
        model="gpt-4o",
    )
title: "" | ||
description: "" | ||
keywords: "" | ||
|
||
--- | ||
|
||
async def evaluate_reasoning_step(reasoning_step: str):
    """Verify one reasoning step via self-verification.

    Two LLM calls: first rewrite the step as a logical entailment (the
    original figure becomes the unknown X to be re-derived from the step's
    conclusion), then ask a verifier whether the rewritten step holds.

    Fixes two prompt defects from the original: the "Rewritter" typo, and
    the self-referential garble "verify if the final conclusion can be
    obtained from the conclusion".
    """
    # Step 1: rewrite so the original figure is the unknown X that must
    # follow from the step's own conclusion.
    rewritten_reasoning_step = await client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": """
                You are an expert AI Rewriter. You are
                about to be passed a reasoning step and
                your goal is to rewrite it so that we can
                verify if the original figure can be
                derived from the step's conclusion.
                Here are some examples
                Example 1
                Reasoning Step: Jackie has 10 apples so
                Jackie has 10-8=2 more apples than Adam.
                So the answer is 2.
                Rewritten Reasoning Step: Jackie has X
                apples. Adam has 8 apples. Jackie has 2
                more apples than Adam. Therefore Jackie
                must have had 10 apples at the start.
                Therefore X must be 10.
                Example 2
                Reasoning Step: John reads 4 books a day.
                He reads every Monday and Tuesday which
                are 2 days per week. Therefore he reads 8
                books a week.
                Rewritten Reasoning Step: John reads X
                books a day. He reads every Monday and
                Tuesday which are 2 days per week.
                Therefore he reads 8 books a week.
                Therefore X must be 4.
                """,
            },
            {"role": "user", "content": reasoning_step},
        ],
        response_model=RewrittenValidationStep,
        model="gpt-4o",
    )
    # Step 2: ask a verifier model whether the rewritten entailment holds.
    return await client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": """You are an expert AI Statement
                Verification tool. You are about to be
                passed a logical step, and asked to verify
                if it's a valid reasoning step or not.""",
            },
            {
                "role": "user",
                "content": rewritten_reasoning_step.rewritten_reasoning_step,
            },
        ],
        response_model=SelfVerification,
        model="gpt-4o",
    )
|
||
|
||
async def evaluate_model_reasoning(reasoning: list[str]):
    """Fan out verification of each step concurrently; order preserved."""
    return await asyncio.gather(
        *(evaluate_reasoning_step(step) for step in reasoning)
    )
|
||
|
||
if __name__ == "__main__":
    query = """
    Tim wanted to make lemonade for a pool party. For a
    gallon of lemonade, his recipe called for 1 cup of
    fresh lemon juice. He found that 6 lemons would
    yield 1 cup of juice. He figured he would need to
    make 4 gallons of lemonade for the party. His best
    friend Allen asked if Tim could make an extra
    gallon for him that was twice as tart as the other
    gallons. How many lemons will Tim need?
    """
    response = asyncio.run(generate_reasoning_and_response(query))

    for step in response.reasoning_steps:
        print(step)
    """
    To calculate the number of lemons needed, first
    determine the lemons required for the primary
    lemonade. For 4 gallons, 4 cups of lemon juice
    are needed, equating to 4 * 6 = 24 lemons.
    """
    """
    For the extra gallon of tart lemonade, which is
    twice as tart, it requires 2 cups of lemon
    juice. This equates to 2 * 6 = 12 lemons.
    """
    """
    Summing these amounts, Tim needs 24 + 12 = 36
    lemons in total.
    """

    reasoning_evaluation = asyncio.run(
        evaluate_model_reasoning(response.reasoning_steps)
    )

    # Fraction of steps the verifier judged to be valid reasoning.
    valid_reasoning_count = sum(
        1 for result in reasoning_evaluation if result.is_valid_reasoning_step
    )
    print(valid_reasoning_count / len(reasoning_evaluation))
    #> 1
    print(response.correct_answer)
    #> 36
``` | ||
|
||
### References | ||
|
||
<sup id="ref-1">1</sup>: [Large Language Models are Better Reasoners with Self-Verification](https://arxiv.org/pdf/2212.09561) | ||
[wip] |