diff --git a/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py b/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py
index 85fc0da40d..e91ae2030c 100644
--- a/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py
+++ b/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py
@@ -6,25 +6,33 @@
 from helm.benchmark.annotation.image2structure.image_compiler_annotator import ImageCompilerAnnotator, CompilationError
 from helm.benchmark.adaptation.request_state import RequestState
 from helm.common.cache import CacheConfig
-from helm.common.optional_dependencies import handle_module_not_found_error
+from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled
 
 try:
-    from PIL import Image
+    from PIL import Image, ImageOps
 except ModuleNotFoundError as ex:
     handle_module_not_found_error(ex, suggestions=["images"])
 
 
-class LilyPondAnnotator(ImageCompilerAnnotator):
+class LilypondCompilerAnnotator(ImageCompilerAnnotator):
     """Annotator that compiles the text completions into a music sheet with LilyPond."""
 
     name: str = "lilypond_compiler"
+    base_path = "/home/josselin/installs/lilypond-2.24.3/bin"  # NOTE(review): machine-specific path
 
-    def __int__(self, cache_config: CacheConfig, file_storage_path: str):
+    def __init__(self, cache_config: CacheConfig, file_storage_path: str):
         super().__init__(cache_config, file_storage_path)
-        result = subprocess.run(["lilypond", "--version"], capture_output=True, text=True)
-        assert (
-            result.returncode == 0
-        ), "LilyPond is not installed. Download and install it from https://lilypond.org/download.html"
+        try:  # fail at construction time if the LilyPond binary cannot be run
+            result = subprocess.run([f"{self.base_path}/lilypond", "--version"], capture_output=True, text=True)
+            if result.returncode != 0:  # executable was launched but "--version" exited non-zero
+                raise OptionalDependencyNotInstalled(
+                    "LilyPond is not installed. Download and install it from https://lilypond.org/download.html"
+                )
+        except FileNotFoundError as e:  # no executable found at base_path
+            raise OptionalDependencyNotInstalled(
+                "LilyPond is not installed. Download and install it from https://lilypond.org/download.html.\n"
+                f"Original error: {e}"
+            ) from e
 
     def compile_completion_into_image(
         self, request_state: RequestState, completion_text: str
@@ -43,18 +51,27 @@ def compile_completion_into_image(
 
         try:
             # Edits the LilyPond file to be compatible with the current version
-            result = subprocess.run(["convert-ly", "-e", ly_file_path], capture_output=True, text=True)
+            result = subprocess.run(
+                [f"{self.base_path}/convert-ly", "-e", ly_file_path], capture_output=True, text=True
+            )
             assert result.returncode == 0, f"convert-ly failed: {result.stderr}"
 
             # Generate PNG image from the LilyPond file
             # LilyPond supports partial compilation, which means it attempts to produce an image
             # for the correct portions of the code, even if there are errors elsewhere
-            subprocess.run(["lilypond", "--png", "-o", output_path, ly_file_path], capture_output=True, text=True)
+            subprocess.run(
+                [f"{self.base_path}/lilypond", "--png", "-o", output_path, ly_file_path], capture_output=True, text=True
+            )
             # If an image file is not generated, we consider it an absolute compilation failure
             assert os.path.exists(sheet_music_path), "lilypond did not generate the image"
 
             # Load the image as a PIL Image object
             image = Image.open(sheet_music_path)
+
+            # Crop the image to remove the white space around the music sheet
+            (w, h) = image.size
+            image = image.crop((0, 0, w, h - int(h * 0.2)))  # Remove pagination
+            image = image.crop(ImageOps.invert(image).getbbox())  # Remove white border
         except (AssertionError, RuntimeError) as e:
             raise CompilationError(str(e)) from e
         finally:
diff --git a/src/helm/benchmark/presentation/run_specs_image2structure.conf b/src/helm/benchmark/presentation/run_specs_image2structure.conf
index ce01914b4a..f53124c3f9 100644
--- a/src/helm/benchmark/presentation/run_specs_image2structure.conf
+++ b/src/helm/benchmark/presentation/run_specs_image2structure.conf
@@ -8,7 +8,7 @@ entries: [
     {description: "image2latex:subset=algorithm,model=vlm", priority: 1, groups: ["image2latex"]}
 
     # sheetmusic2lilypond
-    {description: "sheetmusic2lilypond:model=vlm", priority: 1}
+    {description: "image2musicsheet:model=vlm", priority: 1, groups: ["image2musicsheet"]}
 
     # webpages
     {description: "image2webpage:subset=css,model=vlm", priority: 1, groups: ["image2webpage"]}
diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py
index 8ed082cd5c..6e1a06b38a 100644
--- a/src/helm/benchmark/run_specs/vlm_run_specs.py
+++ b/src/helm/benchmark/run_specs/vlm_run_specs.py
@@ -96,7 +96,6 @@ def get_image2structure_metric_specs(
         metric_names = [
             AnnotatedImageMetrics.PIXEL_SIMILARITY,
             AnnotatedImageMetrics.FID_SIMILARITY,
-            AnnotatedImageMetrics.EDIT_SIMILARITY,
             AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY,
         ]
     if include_edit_similarity:
@@ -268,6 +267,39 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op
     )
 
 
+@run_spec_function("image2musicsheet")  # name used by run entries such as "image2musicsheet:model=vlm"
+def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec:
+    scenario_spec = ScenarioSpec(
+        class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario",
+        args={"subset": "music", "recompile_prompt": False},  # There is only one subset for music sheets
+    )
+    adapter_spec: AdapterSpec = get_generation_adapter_spec(
+        instructions="Just give a short answer without answering in a complete sentence.",
+        max_tokens=2000,
+    )
+    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
+        generation_type="musicsheet",
+        args=args,  # forwarded unchanged to the metric-spec helper
+        include_edit_similarity=False,  # No ground truth for music sheets
+        size_handling_method="padding",
+    )
+    annotator_specs: List[AnnotatorSpec] = [  # annotator that compiles completions with LilyPond
+        AnnotatorSpec(
+            class_name="helm.benchmark.annotation.image2structure.lilypond_compiler_annotator.LilypondCompilerAnnotator",  # noqa: E501
+        )
+    ]
+
+    run_spec_name: str = "image2musicsheet"
+    return RunSpec(
+        name=f"{run_spec_name}",
+        scenario_spec=scenario_spec,
+        adapter_spec=adapter_spec,
+        metric_specs=metric_specs,
+        groups=[run_spec_name],  # same name as the run group declared in schema_vlm.yaml
+        annotators=annotator_specs,
+    )
+
+
 @run_spec_function("mmmu")
 def get_mmmu_spec(subject: str, question_type: str) -> RunSpec:
     scenario_spec = ScenarioSpec(
@@ -318,37 +350,3 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec:
         metric_specs=metric_specs,
         groups=[run_spec_name],
     )
-
-
-@run_spec_function("sheetmusic2lilypond")
-def get_sheetmusic2lilypond_spec() -> RunSpec:
-    scenario_spec = ScenarioSpec(
-        class_name="helm.benchmark.scenarios.vision_language.image2structure.sheetmusic2lilypond_scenario."
-        "SheetMusic2LilyPondScenario",
-        args={},
-    )
-    adapter_spec: AdapterSpec = get_generation_adapter_spec(
-        instructions="Generate the LilyPond code for the following sheet music. "
-        "Just give the LilyPond code without any explanation.",
-        max_tokens=1500,
-    )
-
-    metric_specs: List[MetricSpec] = get_image2structure_metric_specs(
-        generation_type="lilypond",
-        include_edit_similarity=False,
-    )
-    annotator_specs: List[AnnotatorSpec] = [
-        AnnotatorSpec(
-            class_name="helm.benchmark.annotation.image2structure.lilypond_compiler_annotator.LilyPondAnnotator",
-        )
-    ]
-
-    run_spec_name: str = "sheetmusic2lilypond"
-    return RunSpec(
-        name=run_spec_name,
-        scenario_spec=scenario_spec,
-        adapter_spec=adapter_spec,
-        metric_specs=metric_specs,
-        annotators=annotator_specs,
-        groups=[run_spec_name],
-    )
diff --git a/src/helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py b/src/helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py
index 7ee5b01fee..77a18ddd1f 100644
--- a/src/helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py
+++ b/src/helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py
@@ -102,10 +102,10 @@ def get_instances(self, output_path: str) -> List[Instance]:
                 cache_dir=output_path,
             )
         ):
-            question_id: str = row["num_id"]
+            question_uuid: str = row["uuid"]
             if row["category"][1:-1] != self._subset:
                 hlog(
-                    f"Skipping instance {question_id} as it belong in category"
+                    f"Skipping instance {question_uuid} as it belong in category"
                     f" {row['category']} and not {self._subset}"
                 )
                 continue
@@ -114,11 +114,13 @@ def get_instances(self, output_path: str) -> List[Instance]:
             row = self.preprocess_row(row, assets_path)
 
             # Step 2: Save the image locally
-            image_path: str = os.path.join(images_path, f"{question_id}.png")
+            image_path: str = os.path.join(images_path, f"{question_uuid}.png")
             if not os.path.exists(image_path):
                 if not self._recompile_prompt:  # 2.a
                     row["image"].save(image_path)
                 else:  # 2.b
+                    if "structure" not in row:
+                        raise ValueError("Cannot recompile prompt without structure")
                     structure: str = row["structure"]
                     text: str = self.compile_and_save(structure, assets_path, image_path)
                     row["text"] = text
@@ -135,28 +137,40 @@ def get_instances(self, output_path: str) -> List[Instance]:
 
             # Step 5: Create the references
             # 5.a Create the reference containing the structure and the associated image.
-            multimedia_object: MultimediaObject
-            if os.path.exists(row["structure"]):
-                # 5.a.1 The structure is a path, therefore represent it as a multimedia object
-                # containing the files used to compile the structure (such as a repository
-                # containing the HTML, CSS, and JavaScript files used to generate a webpage)
-                multimedia_object = MultimediaObject(
-                    [image_object, MediaObject(location=row["structure"], content_type="path/path")]
+            reference: Reference
+            if "structure" in row:
+                multimedia_object: MultimediaObject
+                if os.path.exists(row["structure"]):
+                    # 5.a.1 The structure is a path, therefore represent it as a multimedia object
+                    # containing the files used to compile the structure (such as a repository
+                    # containing the HTML, CSS, and JavaScript files used to generate a webpage)
+                    multimedia_object = MultimediaObject(
+                        [image_object, MediaObject(location=row["structure"], content_type="path/path")]
+                    )
+                elif row["structure"] == PROCESSED:
+                    # 5.a.2 The structure has been processed and is no longer present in the row
+                    # This can be the case if the structure is a base64 encoding of an archive that
+                    # has been extracted to a temporary path and processed but the path is no longer
+                    # existing (deleted after the processing is done)
+                    multimedia_object = MultimediaObject([image_object])
+                else:
+                    # 5.a.3 The structure is not a path, therefore it is directly a valid string
+                    # representing the structure (such as LaTeX code)
+                    multimedia_object = MultimediaObject([image_object])
+                reference = Reference(
+                    output=Output(text=row["text"], multimedia_content=multimedia_object),
+                    tags=[CORRECT_TAG],
                 )
-            elif row["structure"] == PROCESSED:
-                # 5.a.2 The structure has been processed and is no longer present in the row
-                # This can be the case if the structure is a base64 encoding of an archive that
-                # has been extracted to a temporary path and processed but the path is no longer
-                # existing (deleted after the processing is done)
-                multimedia_object = MultimediaObject([image_object])
             else:
-                # 5.a.3 The structure is not a path, therefore it is directly a valid string
-                # representing the structure (such as LaTeX code)
-                multimedia_object = MultimediaObject([image_object])
-            reference = Reference(
-                output=Output(text=row["text"], multimedia_content=multimedia_object),
-                tags=[CORRECT_TAG],
-            )
+                if "text" in row:
+                    reference = Reference(
+                        output=Output(text=row["text"], multimedia_content=MultimediaObject([image_object])),
+                        tags=[CORRECT_TAG],
+                    )
+                else:
+                    reference = Reference(
+                        output=Output(multimedia_content=MultimediaObject([image_object])), tags=[CORRECT_TAG]
+                    )
             references: List[Reference] = [reference]
 
             # 5.b Create the reference containing the assets
diff --git a/src/helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py b/src/helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py
new file mode 100644
index 0000000000..e36d013d44
--- /dev/null
+++ b/src/helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py
@@ -0,0 +1,20 @@
+from helm.benchmark.scenarios.scenario import VALID_SPLIT
+from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario
+
+
+class MusicSheetScenario(Image2StructureScenario):  # Image2Structure variant targeting LilyPond code
+    BASE_PROMPT = (  # NOTE(review): "feasible possible" below looks like a typo for "feasibly possible"
+        "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasible possible.\n"  # noqa: E501
+        "This music sheet was created by me, and I would like to recreate it using Lilypond."
+    )
+    HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet"
+    SUBSETS = ["music"]  # the only subset listed for this scenario
+
+    name = "image2musicsheet"
+    description = "Evaluate multimodal models on Lilypond generation to recreate a provided image"
+
+    def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT):
+        super().__init__(subset, recompile_prompt, split)  # NOTE(review): recompiling relies on compile_and_save
+
+    def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str:
+        raise Exception("Music sheets have no ground truth, compilation is not possible")
diff --git a/src/helm/benchmark/scenarios/vision_language/image2structure/sheetmusic2lilypond_scenario.py b/src/helm/benchmark/scenarios/vision_language/image2structure/sheetmusic2lilypond_scenario.py
deleted file mode 100644
index da8ab5ab71..0000000000
--- a/src/helm/benchmark/scenarios/vision_language/image2structure/sheetmusic2lilypond_scenario.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from typing import List
-import os
-
-from helm.benchmark.scenarios.scenario import (
-    CORRECT_TAG,
-    TEST_SPLIT,
-    Instance,
-    Input,
-    Output,
-    Reference,
-    Scenario,
-)
-from helm.common.media_object import MediaObject, MultimediaObject
-
-
-class SheetMusic2LilyPondScenario(Scenario):
-    """
-    Sheet music to LilyPond scenario
-    LilyPond is a powerful music engraving program that produces high-quality sheet music. It allows
-    musicians to create elegant and readable scores, following the best traditions of
-    classical music engraving.
-    """
-
-    name = "sheetmusic2lilypond"
-    description = "Convert sheet music to LilyPond"
-    tags = ["vision-language", "sheetmusic2lilypond"]
-
-    def get_instances(self, output_path: str) -> List[Instance]:
-        assert os.path.exists(output_path), f"Dataset does not exist at {output_path}"
-        instances: List[Instance] = []
-
-        for image_file in os.listdir(output_path):
-            if not image_file.endswith(".png"):
-                continue
-
-            image_path: str = os.path.join(output_path, image_file)
-            content: List[MediaObject] = [
-                MediaObject(location=image_path, content_type="image/png"),
-            ]
-            instances.append(
-                Instance(
-                    Input(multimedia_content=MultimediaObject(content)),
-                    references=[Reference(Output(multimedia_content=MultimediaObject(content)), tags=[CORRECT_TAG])],
-                    split=TEST_SPLIT,
-                )
-            )
-
-        return instances
diff --git a/src/helm/benchmark/static/schema_vlm.yaml b/src/helm/benchmark/static/schema_vlm.yaml
index f5bfed235f..8bffc90204 100644
--- a/src/helm/benchmark/static/schema_vlm.yaml
+++ b/src/helm/benchmark/static/schema_vlm.yaml
@@ -255,20 +255,24 @@ metric_groups:
     - name: num_output_tokens
       split: ${main_split}
 
-  - name: image_generation
-    display_name: Image generation
+  - name: generation_image
+    display_name: Generation (image)
     metrics:
       - name: pixel_similarity
         split: ${main_split}
       - name: compilation_success
         split: ${main_split}
-      - name: edit_similarity
-        split: ${main_split}
       - name: fid_similarity
         split: ${main_split}
       - name: earth_mover_similarity
         split: ${main_split}
 
+  - name: generation_text
+    display_name: Generation (text)
+    metrics:
+      - name: edit_similarity
+        split: ${main_split}
+
 ############################################################
 run_groups:
   - name: core_scenarios
@@ -305,9 +309,9 @@ run_groups:
     description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images.
     category: All scenarios
     subgroups:
-      - chart2csv
       - image2latex
       - image2webpage
+      - image2musicsheet
 
   - name: hateful_memes
     display_name: Hateful Memes
@@ -382,7 +386,8 @@ run_groups:
     description: The Image2LaTeX benchmark for converting images of mathematical equations, tables. algorithms and tikz to LaTeX.
     metric_groups:
       - accuracy
-      - image_generation
+      - generation_image
+      - generation_text
       - efficiency
       - general_information
     environment:
@@ -400,7 +405,8 @@ run_groups:
     description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript.
     metric_groups:
       - accuracy
-      - image_generation
+      - generation_image
+      - generation_text
       - efficiency
       - general_information
     environment:
@@ -413,26 +419,27 @@ run_groups:
       when: "2024"
       language: English
 
-  - name: chart2csv
-    display_name: Chart2CSV
-    description: The Chart2CSV benchmark for converting images of charts to CSV.
+  - name: image2musicsheet
+    display_name: Image2musicsheet
+    description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond.
     metric_groups:
       - accuracy
+      - generation_image
       - efficiency
       - general_information
     environment:
-      main_name: exact_match
-      main_split: test
+      main_name: earth_mover_similarity
+      main_split: valid
     taxonomy:
-      task: chart to CSV
-      what: plots
+      task: image-to-text
+      what: music sheets
       who: n/a
       when: "2024"
       language: English
 
-  - name: sheetmusic2lilypond
-    display_name: SheetMusic2LilyPond
-    description: Sheet music to LilyPond
+  - name: chart2csv
+    display_name: Chart2CSV
+    description: The Chart2CSV benchmark for converting images of charts to CSV.
     metric_groups:
       - accuracy
       - efficiency
@@ -441,8 +448,8 @@ run_groups:
       main_name: exact_match
       main_split: test
     taxonomy:
-      task: Sheet music to LilyPond
-      what: sheet music
+      task: chart to CSV
+      what: plots
       who: n/a
       when: "2024"
       language: English