Skip to content

Commit

Permalink
Reverted the two
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanleomk committed Jul 9, 2024
1 parent 1074e0a commit b4cef66
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 306 deletions.
138 changes: 5 additions & 133 deletions docs/prompting/self_criticism/self_refine.md
Original file line number Diff line number Diff line change
@@ -1,135 +1,7 @@
---
description: "Self-Refine involves getting LLMs to iteratively generate new responses based on feedback until a stopping condition is met"
---

Self Refine <sup><a href="https://arxiv.org/pdf/2303.17651">1</a></sup> involves prompting a LLM to provide feedback on an answer. This iterative process continues until a stopping condition is met.

We can implement this using `instructor` with a validation context, as seen below.

```python hl_lines="25-27 56-65"
import instructor
from openai import OpenAI
from pydantic import BaseModel, field_validator, ValidationInfo
from typing import Literal

client = instructor.from_openai(OpenAI())


class Sentiment(BaseModel):
    """A rewritten statement whose `text` must align with a target sentiment.

    Validation delegates to an LLM judge (`validate_sentiment`) via the
    validation context, so failed checks trigger `instructor` retries.
    """

    text: str

    @field_validator("text")
    @classmethod
    def validate_text(cls, v: str, info: ValidationInfo) -> str:
        # Cheap structural check first: reject empty / whitespace-only text.
        if not v or not v.strip():
            raise ValueError("Text must not be empty or whitespace")

        comparison = validate_sentiment(
            v,
            info.context["reference_statement"],  # type: ignore
            info.context["sentiment"],  # type: ignore
        )

        # "Review B" means the reference statement beat the candidate,
        # so surface the judge's feedback to drive the retry.
        if comparison.alignment_result == "Review B":
            raise ValueError(
                f"""{comparison.feedback}. Please modify
                your statement to be more aligned with the target sentiment
                and do not copy the statement provided for reference"""
            )

        # The model must produce an original statement, not echo the reference.
        if v == info.context["reference_statement"]:  # type: ignore
            raise ValueError(
                """Your statement is the same as the reference statement.
                It should be a separate statement from the reference
                statement."""
            )

        return v

class PairwiseEvaluation(BaseModel):
    """Structured verdict from the LLM judge.

    `feedback` carries a short explanation; `alignment_result` names which
    review better matches the target sentiment.
    """

    # Free-form explanation generated before the verdict.
    feedback: str
    # Which review the judge picked as better aligned.
    alignment_result: Literal["Review A", "Review B", "Both"]


def validate_sentiment(review_a: str, review_b: str, target_sentiment: str):
    """Ask the judge model which of two reviews better matches the sentiment.

    Returns a `PairwiseEvaluation` produced by the structured-output client.
    """
    judge_prompt = f"""
            Which review is aligned with the sentiment
            {target_sentiment}?
            Review A: {review_a}
            Review B: {review_b}.
            Pick your answer from ['Review A', 'Review B', 'both',
            'neither']. Generate a short explanation for your choice
            first. Then generate your response on which review is more
            aligned
            """
    return client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": judge_prompt}],
        response_model=PairwiseEvaluation,
    )


def generate_sentiment_analysis(
    initial_statement: str, target_sentiment: str, reference_statement: str
) -> Sentiment:
    """Rewrite `initial_statement` toward `target_sentiment`.

    The target sentiment and reference statement are passed through the
    validation context so `Sentiment`'s validator can judge each attempt;
    up to 5 retries are driven by validation failures.
    """
    conversation = [
        {
            "role": "system",
            "content": """
                You are an expert at sentiment analysis.
                Rewrite a statement so that it is more
                closely aligned with the target sentiment.
                """,
        },
        {
            "role": "user",
            "content": f"""
                The statement is {initial_statement} and
                the desired target sentiment is
                {target_sentiment}
                """,
        },
    ]
    return client.chat.completions.create(
        model="gpt-4o",
        messages=conversation,
        response_model=Sentiment,
        validation_context={
            "sentiment": target_sentiment,
            "reference_statement": reference_statement,
        },
        max_retries=5,
    )


if __name__ == "__main__":
    # A positive statement to be rewritten, plus a negative reference the
    # rewrite must align with (but not copy verbatim).
    initial_statement = """The food was fantastic, with every dish
        surpassing our expectations in terms of flavor,
        presentation, and overall dining experience."""
    reference_statement = """The food was awful, with each dish failing to
        meet even the most basic standards of taste,
        quality, and presentation, resulting in a highly
        disappointing dining experience."""

    aligned_sentiment = generate_sentiment_analysis(
        initial_statement,
        "Negative",
        reference_statement,
    )
    print(aligned_sentiment)
    """
    text = 'The food was terrible, with every dish failing to meet our
    expectations in terms of flavor, presentation, and overall dining
    experience.'
    """
```

### References

<sup id="ref-1">1</sup>: [Self-Refine: Iterative Refinement with Self-Feedback](https://arxiv.org/pdf/2303.17651)
[wip]
178 changes: 5 additions & 173 deletions docs/prompting/self_criticism/self_verification.md
Original file line number Diff line number Diff line change
@@ -1,175 +1,7 @@
---
description: "Self Verification involves getting language models to generate a candidate response before evaluating each individual intermediate reasoning step to verify that its logical entailment holds"
---

We can verify the correctness of the reasoning steps taken by our Large Language Model by rewriting them as logical entailments. This enables us to use an LLM to check if the original statement can be derived from the new logical entailment.

By doing this, we can score each reasoning step and obtain a metric for the quality of the response. This process is known as Self Verification <sup><a href="https://arxiv.org/pdf/2212.09561">1</a></sup>

We can scale this out to multiple candidate solutions to choose the best solution.

```python hl_lines="21-24 34-38 56-83 95-98"
import instructor
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
import asyncio

client = instructor.from_openai(AsyncOpenAI())


class SelfVerification(BaseModel):
    """Verdict on whether a single (rewritten) reasoning step is valid."""

    # Model's reasoning about the step, generated before the boolean verdict.
    chain_of_thought: str
    # Final judgement: does the step's entailment hold?
    is_valid_reasoning_step: bool


class RewrittenValidationStep(BaseModel):
    """A reasoning step rewritten as a logical entailment to be verified."""

    # Explanation produced while rewriting the step.
    chain_of_thought: str
    # The step restated so its conclusion can be checked (e.g. "X must be 10").
    rewritten_reasoning_step: str


class Response(BaseModel):
    """Candidate answer to a query together with its reasoning chain."""

    # NOTE: the Field description is part of the prompt sent to the model.
    reasoning_steps: list[str] = Field(
        description="""Logic reasoning steps that allow
        us to arrive at the final answer. Make sure to
        include specific figures and calculations that
        allow you to derive the final correct answer""",
    )
    # The final answer derived from the steps above.
    correct_answer: str


def generate_reasoning_and_response(query: str):
    """Produce a `Response` (reasoning steps + answer) for `query`.

    Uses the async client, so the return value is a coroutine that the
    caller awaits (or runs via `asyncio.run`).
    """
    system_prompt = """You are an expert AI question
            answering system. Make sure to generate a
            list of reasoning steps that are consistent
            and logical before generating your final
            response."""
    return client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ],
        response_model=Response,
        model="gpt-4o",
    )

async def evaluate_reasoning_step(reasoning_step: str):
    """Verify one reasoning step in two LLM calls.

    First the step is rewritten as a logical entailment; then the rewritten
    form is judged, returning a `SelfVerification` verdict.
    """
    # Step 1: rewrite the step so its conclusion can be checked.
    rewritten = await client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": """
                You are an expert AI Rewritter. You are
                about to be passed a reasoning step and
                your goal is to rewrite it so that we can
                verify if the final conclusion can be
                obtained from the conclusion.
                Here are some examples
                Example 1
                Reasoning Step: Jackie has 10 apples so
                Jackie has 10-8=2 more apples than Adam.
                So the answer is 2.
                Rewritten Reasoning Step: Jackie has X
                apples. Adam has 8 apples. Jackie has 2
                more apples than Adam. Therefore Jackie
                must have had 10 apples at the start.
                Therefore X must be 10.
                Example 2
                Reasoning Step: John reads 4 books a day.
                He reads every Monday and Tuesday which
                are 2 days per week. Therefore he reads 8
                books a week.
                Rewritten Reasoning Step: John reads X
                books a day. He reads every Monday and
                Tuesday which are 2 days per week.
                Therefore he reads 8 books a week.
                Therefore X must be 4.
                """,
            },
            {"role": "user", "content": reasoning_step},
        ],
        response_model=RewrittenValidationStep,
        model="gpt-4o",
    )

    # Step 2: ask the verifier model to judge the rewritten entailment.
    return await client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": """You are an expert AI Statement
                Verification tool. You are about to be
                passed a logical step, and asked to verify
                if it's a valid reasoning step or not.""",
            },
            {
                "role": "user",
                "content": rewritten.rewritten_reasoning_step,
            },
        ],
        response_model=SelfVerification,
        model="gpt-4o",
    )


async def evaluate_model_reasoning(reasoning: list[str]):
    """Verify every reasoning step concurrently, preserving input order."""
    return await asyncio.gather(
        *(evaluate_reasoning_step(step) for step in reasoning)
    )


if __name__ == "__main__":
    query = """
    Tim wanted to make lemonade for a pool party. For a
    gallon of lemonade, his recipe called for 1 cup of
    fresh lemon juice. He found that 6 lemons would
    yield 1 cup of juice. He figured he would need to
    make 4 gallons of lemonade for the party. His best
    friend Allen asked if Tim could make an extra
    gallon for him that was twice as tart as the other
    gallons. How many lemons will Tim need?
    """
    # Generate a candidate answer together with its reasoning chain.
    response = asyncio.run(generate_reasoning_and_response(query))

    for step in response.reasoning_steps:
        print(step)
    """
    To calculate the number of lemons needed, first
    determine the lemons required for the primary
    lemonade. For 4 gallons, 4 cups of lemon juice
    are needed, equating to 4 * 6 = 24 lemons.
    """
    """
    For the extra gallon of tart lemonade, which is
    twice as tart, it requires 2 cups of lemon
    juice. This equates to 2 * 6 = 12 lemons.
    """
    """
    Summing these amounts, Tim needs 24 + 12 = 36
    lemons in total.
    """

    # Self-verify each reasoning step (runs concurrently).
    reasoning_evaluation = asyncio.run(
        evaluate_model_reasoning(response.reasoning_steps)
    )

    # Fraction of steps judged valid. sum() over a generator replaces the
    # original manual counter loop — same result, idiomatic form.
    valid_reasoning_count = sum(
        1 for verdict in reasoning_evaluation if verdict.is_valid_reasoning_step
    )
    print(valid_reasoning_count / len(reasoning_evaluation))
    #> 1
    print(response.correct_answer)
    #> 36
```

### References

<sup id="ref-1">1</sup>: [Large Language Models are Better Reasoners with Self-Verification](https://arxiv.org/pdf/2212.09561)
[wip]

0 comments on commit b4cef66

Please sign in to comment.