diff --git a/wtpsplit/evaluation/intrinsic_pairwise.py b/wtpsplit/evaluation/intrinsic_pairwise.py
index 457f5033..bf3a7fba 100644
--- a/wtpsplit/evaluation/intrinsic_pairwise.py
+++ b/wtpsplit/evaluation/intrinsic_pairwise.py
@@ -19,6 +19,7 @@
 
 import wtpsplit.models  # noqa: F401
 from wtpsplit.evaluation import evaluate_mixture, get_labels, train_mixture, token_to_char_probs
+from wtpsplit.evaluation.intrinsic_baselines import split_language_data
 from wtpsplit.extract import PyTorchWrapper
 from wtpsplit.extract_batched import extract_batched
 from wtpsplit.utils import Constants
@@ -241,11 +242,10 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st
     total_test_time = 0  # Initialize total test processing time
 
     start_time = time.time()
-    with h5py.File(logits_path, "a") as f, torch.no_grad():
+    with h5py.File(logits_path, "w") as f, torch.no_grad():
         for lang_code in Constants.LANGINFO.index:
             if args.include_langs is not None and lang_code not in args.include_langs:
                 continue
-
             print(f"Processing {lang_code}...")
             if lang_code not in f:
                 lang_group = f.create_group(lang_code)
@@ -254,8 +254,11 @@ def load_or_compute_logits(args, model, eval_data, valid_data=None, save_str: st
 
             # eval data
             for dataset_name, dataset in eval_data[lang_code]["sentence"].items():
-                if args.skip_corrupted and "corrupted" in dataset_name and"ted2020" not in dataset_name:
+                if args.skip_corrupted and "corrupted" in dataset_name and "ted2020" not in dataset_name:
                     continue
+                if "-" in lang_code and "canine" in args.model_path and "no-adapters" not in args.model_path:
+                    # code-switched data: eval 2x
+                    lang_code = lang_code.split("_")[1].lower()
                 try:
                     if args.adapter_path:
                         model.model.load_adapter(
@@ -377,6 +380,8 @@ def main(args):
 
     print(save_str)
     eval_data = torch.load(args.eval_data_path)
+    if "canine" in args.model_path and not "no-adapters" in args.model_path:
+        eval_data = split_language_data(eval_data)
     if args.valid_text_path is not None:
         valid_data = load_dataset("parquet", data_files=args.valid_text_path, split="train")
     else:
@@ -530,7 +535,9 @@ def main(args):
             acc_t = np.mean(acc_t) if score_t else None
             acc_punct = np.mean(acc_punct) if score_punct else None
             threshold = np.mean(thresholds)
-
+            u_indices.append(cur_u_indices["pred_indices"] if cur_u_indices["pred_indices"] else [])
+            true_indices.append(cur_u_indices["true_indices"] if cur_u_indices["true_indices"] else [])
+            length.append(cur_u_indices["length"])
 
             results[lang_code][dataset_name] = {
                 "u": score_u,
@@ -596,7 +603,7 @@ def main(args):
         ),
         indent=4,
     )
-    
+
     if args.return_indices:
         json.dump(
             indices,