@@ -96,7 +96,7 @@ def test_detect_with_multiple_detectors(self):
        # Create the decorator with multiple detectors
        config = {
            'hallucination': {'detector_name': 'default'},
-            'instruction_adherence': {'detector_name': 'v1'},
+            'instruction_adherence': {'detector_name': 'default'},
            'toxicity': {'detector_name': 'default'}
        }
        values_returned = ["context", "generated_text", "user_query", "instructions"]
@@ -120,7 +120,7 @@ def generate_response(context, query, instructions):
        # Call the decorated function
        context = "AI systems should be developed responsibly with proper oversight."
        query = "What does the text say about AI?"
-        instructions = "Provide a concise response with at most two sentences."
+        instructions = ["Provide a concise response with at most two sentences."]

        self.log_info("Input - Context", context)
        self.log_info("Input - Query", query)
@@ -143,7 +143,7 @@ def generate_response(context, query, instructions):

        # Check key fields without verifying values
        assert "score" in result.detect_response.hallucination
-        assert "results" in result.detect_response.instruction_adherence
+        assert "instructions_list" in result.detect_response.instruction_adherence
        assert "score" in result.detect_response.toxicity

    def test_detect_with_different_iterables(self):
@@ -482,7 +482,7 @@ def generate_summary(context, query):

    def test_instruction_adherence_v1(self):
        """Test the Detect decorator with instruction adherence detector using v1."""
-        config = {'instruction_adherence': {'detector_name': 'v1'}}
+        config = {'instruction_adherence': {'detector_name': 'default'}}
        values_returned = ["context", "generated_text", "instructions"]

        self.log_info("Test", "Instruction Adherence with detector_name=v1")
@@ -501,7 +501,7 @@ def generate_with_instructions(context, instructions):
            return context, generated_text, instructions

        context = "Climate change and its effects on our planet."
-        instructions = "Provide a short response in one sentence."
+        instructions = ["Provide a short response in one sentence."]

        self.log_info("Input - Context", context)
        self.log_info("Input - Instructions", instructions)
@@ -519,7 +519,6 @@ def generate_with_instructions(context, instructions):
        assert isinstance(result, DetectResult)
        assert result.status == 200
        assert hasattr(result.detect_response, 'instruction_adherence')
-        assert "results" in result.detect_response.instruction_adherence

    def test_instruction_adherence_default(self):
        """Test the Detect decorator with instruction adherence detector using default."""
@@ -596,7 +595,7 @@ def test_all_detectors_combination(self):
        config = {
            'hallucination': {'detector_name': 'default'},
            'toxicity': {'detector_name': 'default'},
-            'instruction_adherence': {'detector_name': 'v1'},  # Using v1 format which expects a string
+            'instruction_adherence': {'detector_name': 'default'},
            'retrieval_relevance': {'detector_name': 'default'},
            'conciseness': {'detector_name': 'default'},
            'completeness': {'detector_name': 'default'}
@@ -626,7 +625,7 @@ def comprehensive_response(context, query, instructions):

        context = "Renewable energy sources like solar and wind are becoming increasingly cost-effective alternatives to fossil fuels."
        query = "What are the trends in renewable energy?"
-        instructions = "Provide a factual response based only on the given context."
+        instructions = ["Provide a factual response based only on the given context."]

        self.log_info("Input - Context", context)
        self.log_info("Input - Query", query)
@@ -722,3 +721,107 @@ def generate_with_multiple_instructions(context, instructions, query):
            self.log_info("Error occurred during test", str(e))
            # Log the error but don't fail the test
            pytest.skip(f"Test skipped due to error: {str(e)}")
+
+    def test_evaluate_with_new_model(self):
+        """Test the evaluate function with a new model name that should be auto-created."""
+        import uuid
+        from aimon import evaluate, Client
+
+        # Generate a unique model name to ensure it doesn't exist
+        unique_model_name = f"test_model_{uuid.uuid4().hex[:8]}"
+        application_name = "test_application"
+        evaluation_name = f"test_eval_{uuid.uuid4().hex[:8]}"
+
+        self.log_info("Test", "Evaluate with new model auto-creation")
+        self.log_info("Model Name", unique_model_name)
+        self.log_info("Application Name", application_name)
+
+        # Create client
+        aimon_client = Client(auth_header=f"Bearer {self.api_key}")
+
+        # Create a test dataset CSV in memory or file
+        import tempfile
+        import csv
+
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as tmp:
+            writer = csv.writer(tmp)
+            writer.writerow(["context_docs", "user_query", "output"])
+            writer.writerow([
+                "AI systems should be developed responsibly with proper oversight.",
+                "What does the text say about AI?",
+                "The text states that AI systems should be developed responsibly with proper oversight."
+            ])
+            dataset_path = tmp.name
+
+        try:
+            # Upload the dataset
+            import json  # Add import at top of function if not already there
+            dataset_args = json.dumps({"name": "test_dataset.csv", "description": "Test dataset for evaluation"})
+            with open(dataset_path, 'rb') as file:
+                dataset = aimon_client.datasets.create(
+                    file=file,
+                    json_data=dataset_args
+                )
+
+            # Create dataset collection
+            collection_name = f"test_collection_{uuid.uuid4().hex[:8]}"
+            collection = aimon_client.datasets.collection.create(
+                name=collection_name,
+                dataset_ids=[dataset.sha],
+                description="Test collection for evaluation"
+            )
+
+            # Configure evaluation
+            eval_config = {
+                'hallucination': {'detector_name': 'default'},
+                'toxicity': {'detector_name': 'default'}
+            }
+
+            # Run evaluation
+            results = evaluate(
+                dataset_collection_name=collection_name,
+                headers=["context_docs", "user_query", "output"],
+                application_name=application_name,
+                model_name=unique_model_name,
+                evaluation_name=evaluation_name,
+                api_key=self.api_key,
+                aimon_client=aimon_client,
+                config=eval_config
+            )
+
+            self.log_info("Evaluation Results", results)
+
+            # Based on EvaluateResponse structure in aimon/decorators/evaluate.py
+            assert results is not None
+
+            # EvaluateResponse likely contains 'evaluation_id' or other identifying information
+            # Just verify it's not empty and log its structure for debugging
+            self.log_info("Results type", type(results))
+
+            # Log attributes if we can
+            try:
+                if hasattr(results, "__dict__"):
+                    self.log_info("Results attributes", results.__dict__)
+                else:
+                    self.log_info("Results dir", dir(results))
+            except:
+                self.log_info("Could not log results attributes")
+
+            # Check for common attributes in evaluation responses
+            if hasattr(results, "evaluation_id"):
+                self.log_info("Evaluation ID", results.evaluation_id)
+
+            if hasattr(results, "task_id"):
+                self.log_info("Task ID", results.task_id)
+
+            self.log_info("Result", f"Successfully created and evaluated with new model: {unique_model_name}")
+
+        except Exception as e:
+            self.log_info("Error occurred during test", str(e))
+            raise
+
+        finally:
+            # Cleanup
+            import os
+            if os.path.exists(dataset_path):
+                os.remove(dataset_path)
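
For reference, a minimal standalone sketch of the call pattern these updated tests converge on: the 'default' instruction adherence detector, list-form instructions, and the 'instructions_list' response key. The Detect constructor arguments and the DetectResult appended to the decorated function's return value are assumptions drawn from the surrounding test file and the aimon SDK, not shown in this diff, so treat the sketch as illustrative rather than the canonical API.

# Hedged sketch (assumed aimon API; placeholders marked below)
from aimon import Detect

detect = Detect(
    values_returned=["context", "generated_text", "instructions"],
    config={'instruction_adherence': {'detector_name': 'default'}},
    api_key="YOUR_AIMON_API_KEY",  # placeholder, not a real key
)

@detect
def generate_with_instructions(context, instructions):
    # Stand-in for a real LLM call
    generated_text = "A one-sentence summary of the given context."
    return context, generated_text, instructions

context = "Climate change and its effects on our planet."
instructions = ["Provide a short response in one sentence."]  # list form, per this change

# Assumption: the decorator appends a DetectResult to the returned tuple
context, generated_text, instructions, result = generate_with_instructions(context, instructions)
assert result.status == 200
assert "instructions_list" in result.detect_response.instruction_adherence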