
Commit 762d945

Author: Preetam Joshi (committed)

Adding tests for evaluate

1 parent a88d0af, commit 762d945

File tree

1 file changed: +111 additions, -8 deletions


tests/test_detect.py

Lines changed: 111 additions & 8 deletions
@@ -96,7 +96,7 @@ def test_detect_with_multiple_detectors(self):
         # Create the decorator with multiple detectors
         config = {
             'hallucination': {'detector_name': 'default'},
-            'instruction_adherence': {'detector_name': 'v1'},
+            'instruction_adherence': {'detector_name': 'default'},
             'toxicity': {'detector_name': 'default'}
         }
         values_returned = ["context", "generated_text", "user_query", "instructions"]
@@ -120,7 +120,7 @@ def generate_response(context, query, instructions):
         # Call the decorated function
         context = "AI systems should be developed responsibly with proper oversight."
         query = "What does the text say about AI?"
-        instructions = "Provide a concise response with at most two sentences."
+        instructions = ["Provide a concise response with at most two sentences."]

         self.log_info("Input - Context", context)
         self.log_info("Input - Query", query)
@@ -143,7 +143,7 @@ def generate_response(context, query, instructions):

         # Check key fields without verifying values
         assert "score" in result.detect_response.hallucination
-        assert "results" in result.detect_response.instruction_adherence
+        assert "instructions_list" in result.detect_response.instruction_adherence
         assert "score" in result.detect_response.toxicity

     def test_detect_with_different_iterables(self):
@@ -482,7 +482,7 @@ def generate_summary(context, query):

     def test_instruction_adherence_v1(self):
         """Test the Detect decorator with instruction adherence detector using v1."""
-        config = {'instruction_adherence': {'detector_name': 'v1'}}
+        config = {'instruction_adherence': {'detector_name': 'default'}}
         values_returned = ["context", "generated_text", "instructions"]

         self.log_info("Test", "Instruction Adherence with detector_name=v1")
@@ -501,7 +501,7 @@ def generate_with_instructions(context, instructions):
             return context, generated_text, instructions

         context = "Climate change and its effects on our planet."
-        instructions = "Provide a short response in one sentence."
+        instructions = ["Provide a short response in one sentence."]

         self.log_info("Input - Context", context)
         self.log_info("Input - Instructions", instructions)
@@ -519,7 +519,6 @@ def generate_with_instructions(context, instructions):
         assert isinstance(result, DetectResult)
         assert result.status == 200
         assert hasattr(result.detect_response, 'instruction_adherence')
-        assert "results" in result.detect_response.instruction_adherence

     def test_instruction_adherence_default(self):
         """Test the Detect decorator with instruction adherence detector using default."""
@@ -596,7 +595,7 @@ def test_all_detectors_combination(self):
         config = {
             'hallucination': {'detector_name': 'default'},
             'toxicity': {'detector_name': 'default'},
-            'instruction_adherence': {'detector_name': 'v1'},  # Using v1 format which expects a string
+            'instruction_adherence': {'detector_name': 'default'},
             'retrieval_relevance': {'detector_name': 'default'},
             'conciseness': {'detector_name': 'default'},
             'completeness': {'detector_name': 'default'}
@@ -626,7 +625,7 @@ def comprehensive_response(context, query, instructions):

         context = "Renewable energy sources like solar and wind are becoming increasingly cost-effective alternatives to fossil fuels."
         query = "What are the trends in renewable energy?"
-        instructions = "Provide a factual response based only on the given context."
+        instructions = ["Provide a factual response based only on the given context."]

         self.log_info("Input - Context", context)
         self.log_info("Input - Query", query)
@@ -722,3 +721,107 @@ def generate_with_multiple_instructions(context, instructions, query):
             self.log_info("Error occurred during test", str(e))
             # Log the error but don't fail the test
             pytest.skip(f"Test skipped due to error: {str(e)}")
+
+    def test_evaluate_with_new_model(self):
+        """Test the evaluate function with a new model name that should be auto-created."""
+        import uuid
+        from aimon import evaluate, Client
+
+        # Generate a unique model name to ensure it doesn't exist
+        unique_model_name = f"test_model_{uuid.uuid4().hex[:8]}"
+        application_name = "test_application"
+        evaluation_name = f"test_eval_{uuid.uuid4().hex[:8]}"
+
+        self.log_info("Test", "Evaluate with new model auto-creation")
+        self.log_info("Model Name", unique_model_name)
+        self.log_info("Application Name", application_name)
+
+        # Create client
+        aimon_client = Client(auth_header=f"Bearer {self.api_key}")
+
+        # Create a test dataset CSV in memory or file
+        import tempfile
+        import csv
+
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp:
+            writer = csv.writer(tmp)
+            writer.writerow(["context_docs", "user_query", "output"])
+            writer.writerow([
+                "AI systems should be developed responsibly with proper oversight.",
+                "What does the text say about AI?",
+                "The text states that AI systems should be developed responsibly with proper oversight."
+            ])
+            dataset_path = tmp.name
+
+        try:
+            # Upload the dataset
+            import json  # Add import at top of function if not already there
+            dataset_args = json.dumps({"name": "test_dataset.csv", "description": "Test dataset for evaluation"})
+            with open(dataset_path, 'rb') as file:
+                dataset = aimon_client.datasets.create(
+                    file=file,
+                    json_data=dataset_args
+                )
+
+            # Create dataset collection
+            collection_name = f"test_collection_{uuid.uuid4().hex[:8]}"
+            collection = aimon_client.datasets.collection.create(
+                name=collection_name,
+                dataset_ids=[dataset.sha],
+                description="Test collection for evaluation"
+            )
+
+            # Configure evaluation
+            eval_config = {
+                'hallucination': {'detector_name': 'default'},
+                'toxicity': {'detector_name': 'default'}
+            }
+
+            # Run evaluation
+            results = evaluate(
+                dataset_collection_name=collection_name,
+                headers=["context_docs", "user_query", "output"],
+                application_name=application_name,
+                model_name=unique_model_name,
+                evaluation_name=evaluation_name,
+                api_key=self.api_key,
+                aimon_client=aimon_client,
+                config=eval_config
+            )
+
+            self.log_info("Evaluation Results", results)
+
+            # Based on EvaluateResponse structure in aimon/decorators/evaluate.py
+            assert results is not None
+
+            # EvaluateResponse likely contains 'evaluation_id' or other identifying information
+            # Just verify it's not empty and log its structure for debugging
+            self.log_info("Results type", type(results))
+
+            # Log attributes if we can
+            try:
+                if hasattr(results, "__dict__"):
+                    self.log_info("Results attributes", results.__dict__)
+                else:
+                    self.log_info("Results dir", dir(results))
+            except:
+                self.log_info("Could not log results attributes")

+            # Check for common attributes in evaluation responses
+            if hasattr(results, "evaluation_id"):
+                self.log_info("Evaluation ID", results.evaluation_id)
+
+            if hasattr(results, "task_id"):
+                self.log_info("Task ID", results.task_id)
+
+            self.log_info("Result", f"Successfully created and evaluated with new model: {unique_model_name}")
+
+        except Exception as e:
+            self.log_info("Error occurred during test", str(e))
+            raise
+
+        finally:
+            # Cleanup
+            import os
+            if os.path.exists(dataset_path):
+                os.remove(dataset_path)
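
For context, the new test_evaluate_with_new_model test walks through the full evaluation flow in the aimon SDK: upload a CSV dataset, group it into a dataset collection, then run evaluate against that collection with a model name that does not exist yet. A minimal standalone sketch of the same flow, using only the calls that appear in the diff above, might look like this; the AIMON_API_KEY environment variable and the demo names/filename are illustrative assumptions, not part of the commit:

import csv
import json
import os
import uuid

from aimon import Client, evaluate

# Assumption: the API key comes from an environment variable; the test above
# reads it from self.api_key instead.
api_key = os.environ["AIMON_API_KEY"]
client = Client(auth_header=f"Bearer {api_key}")

# Build a one-row CSV with the columns the evaluation reads.
csv_path = "aimon_eval_demo.csv"  # illustrative filename
with open(csv_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["context_docs", "user_query", "output"])
    writer.writerow(["Some source document.", "A question about it.", "A candidate answer."])

# Upload the dataset and group it into a collection, mirroring the test.
with open(csv_path, "rb") as f:
    dataset = client.datasets.create(
        file=f,
        json_data=json.dumps({"name": csv_path, "description": "Demo dataset"}),
    )
collection_name = f"demo_collection_{uuid.uuid4().hex[:8]}"
client.datasets.collection.create(
    name=collection_name,
    dataset_ids=[dataset.sha],
    description="Demo collection",
)

# Run the evaluation; per the test, a new model_name is expected to be auto-created.
results = evaluate(
    dataset_collection_name=collection_name,
    headers=["context_docs", "user_query", "output"],
    application_name="demo_application",
    model_name=f"demo_model_{uuid.uuid4().hex[:8]}",
    evaluation_name=f"demo_eval_{uuid.uuid4().hex[:8]}",
    api_key=api_key,
    aimon_client=client,
    config={"hallucination": {"detector_name": "default"}},
)
print(results)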
