From 4fd660042e61c18091bd98ccf6fd7b2ce1ac61a6 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Fri, 20 Sep 2024 17:30:53 +0200 Subject: [PATCH 1/2] enable mini probe by default --- garak/probes/latentinjection.py | 1 + 1 file changed, 1 insertion(+) diff --git a/garak/probes/latentinjection.py b/garak/probes/latentinjection.py index 9392b991..0e82a3a3 100644 --- a/garak/probes/latentinjection.py +++ b/garak/probes/latentinjection.py @@ -547,6 +547,7 @@ class LatentJailbreakMini(LatentJailbreak): "instruction_count": 3, "trigger_count": 4, } + active = True # has a lot of prompts - default to mini version def __init__(self, config_root=_config): super().__init__(config_root) From 951832fd79137dcd56ecba5a6fac1fa27e58abe7 Mon Sep 17 00:00:00 2001 From: Leon Derczynski Date: Mon, 23 Sep 2024 10:24:14 +0200 Subject: [PATCH 2/2] amend inheritance order to avoid clobbering mixin vars --- garak/probes/latentinjection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/garak/probes/latentinjection.py b/garak/probes/latentinjection.py index 0e82a3a3..faa0dd7d 100644 --- a/garak/probes/latentinjection.py +++ b/garak/probes/latentinjection.py @@ -463,7 +463,7 @@ class LatentInjectionFactSnippetLegal(LatentInjectionFactSnippetEiffel): ] -class LatentJailbreak(Probe, LatentInjectionMixin): +class LatentJailbreak(LatentInjectionMixin, Probe): """Implementation of Latent Jailbreak, using latent prompt injection- style distraction to get model to write hateful/harmful text