diff --git a/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py b/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py index 85fc0da40d..e91ae2030c 100644 --- a/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +++ b/src/helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py @@ -6,25 +6,33 @@ from helm.benchmark.annotation.image2structure.image_compiler_annotator import ImageCompilerAnnotator, CompilationError from helm.benchmark.adaptation.request_state import RequestState from helm.common.cache import CacheConfig -from helm.common.optional_dependencies import handle_module_not_found_error +from helm.common.optional_dependencies import handle_module_not_found_error, OptionalDependencyNotInstalled try: - from PIL import Image + from PIL import Image, ImageOps except ModuleNotFoundError as ex: handle_module_not_found_error(ex, suggestions=["images"]) -class LilyPondAnnotator(ImageCompilerAnnotator): +class LilypondCompilerAnnotator(ImageCompilerAnnotator): """Annotator that compiles the text completions into a music sheet with LilyPond.""" name: str = "lilypond_compiler" + base_path = "/home/josselin/installs/lilypond-2.24.3/bin" - def __int__(self, cache_config: CacheConfig, file_storage_path: str): + def __init__(self, cache_config: CacheConfig, file_storage_path: str): super().__init__(cache_config, file_storage_path) - result = subprocess.run(["lilypond", "--version"], capture_output=True, text=True) - assert ( - result.returncode == 0 - ), "LilyPond is not installed. Download and install it from https://lilypond.org/download.html" + try: + result = subprocess.run([f"{self.base_path}/lilypond", "--version"], capture_output=True, text=True) + if result.returncode != 0: + raise OptionalDependencyNotInstalled( + "LilyPond is not installed. 
Download and install it from https://lilypond.org/download.html" + ) + except FileNotFoundError as e: + raise OptionalDependencyNotInstalled( + "LilyPond is not installed. Download and install it from https://lilypond.org/download.html.\n" + f"Original error: {e}" + ) from e def compile_completion_into_image( self, request_state: RequestState, completion_text: str @@ -43,18 +51,27 @@ def compile_completion_into_image( try: # Edits the LilyPond file to be compatible with the current version - result = subprocess.run(["convert-ly", "-e", ly_file_path], capture_output=True, text=True) + result = subprocess.run( + [f"{self.base_path}/convert-ly", "-e", ly_file_path], capture_output=True, text=True + ) assert result.returncode == 0, f"convert-ly failed: {result.stderr}" # Generate PNG image from the LilyPond file # LilyPond supports partial compilation, which means it attempts to produce an image # for the correct portions of the code, even if there are errors elsewhere - subprocess.run(["lilypond", "--png", "-o", output_path, ly_file_path], capture_output=True, text=True) + subprocess.run( + [f"{self.base_path}/lilypond", "--png", "-o", output_path, ly_file_path], capture_output=True, text=True + ) # If an image file is not generated, we consider it an absolute compilation failure assert os.path.exists(sheet_music_path), "lilypond did not generate the image" # Load the image as a PIL Image object image = Image.open(sheet_music_path) + + # Crop the image to remove the white space around the music sheet + (w, h) = image.size + image = image.crop((0, 0, w, h - int(h * 0.2))) # Remove pagination + image = image.crop(ImageOps.invert(image).getbbox()) # Remove white border except (AssertionError, RuntimeError) as e: raise CompilationError(str(e)) from e finally: diff --git a/src/helm/benchmark/presentation/run_specs_image2structure.conf b/src/helm/benchmark/presentation/run_specs_image2structure.conf index ce01914b4a..f53124c3f9 100644 --- 
a/src/helm/benchmark/presentation/run_specs_image2structure.conf +++ b/src/helm/benchmark/presentation/run_specs_image2structure.conf @@ -8,7 +8,7 @@ entries: [ {description: "image2latex:subset=algorithm,model=vlm", priority: 1, groups: ["image2latex"]} # sheetmusic2lilypond - {description: "sheetmusic2lilypond:model=vlm", priority: 1} + {description: "image2musicsheet:model=vlm", priority: 1, groups: ["image2musicsheet"]} # webpages {description: "image2webpage:subset=css,model=vlm", priority: 1, groups: ["image2webpage"]} diff --git a/src/helm/benchmark/run_specs/vlm_run_specs.py b/src/helm/benchmark/run_specs/vlm_run_specs.py index 8ed082cd5c..6e1a06b38a 100644 --- a/src/helm/benchmark/run_specs/vlm_run_specs.py +++ b/src/helm/benchmark/run_specs/vlm_run_specs.py @@ -96,7 +96,6 @@ def get_image2structure_metric_specs( metric_names = [ AnnotatedImageMetrics.PIXEL_SIMILARITY, AnnotatedImageMetrics.FID_SIMILARITY, - AnnotatedImageMetrics.EDIT_SIMILARITY, AnnotatedImageMetrics.EARTH_MOVER_SIMILARITY, ] if include_edit_similarity: @@ -268,6 +267,39 @@ def get_image2webpage_spec(subset: str, recompile_prompt: bool = False, args: Op ) +@run_spec_function("image2musicsheet") +def get_image2musicsheet_spec(args: Optional[Dict] = None) -> RunSpec: + scenario_spec = ScenarioSpec( + class_name="helm.benchmark.scenarios.vision_language.image2structure.musicsheet_scenario.MusicSheetScenario", + args={"subset": "music", "recompile_prompt": False}, # There is only one subset for music sheets + ) + adapter_spec: AdapterSpec = get_generation_adapter_spec( + instructions="Just give a short answer without answering in a complete sentence.", + max_tokens=2000, + ) + metric_specs: List[MetricSpec] = get_image2structure_metric_specs( + generation_type="musicsheet", + args=args, + include_edit_similarity=False, # No ground truth for music sheets + size_handling_method="padding", + ) + annotator_specs: List[AnnotatorSpec] = [ + AnnotatorSpec( + 
class_name="helm.benchmark.annotation.image2structure.lilypond_compiler_annotator.LilypondCompilerAnnotator", # noqa: E501 + ) + ] + + run_spec_name: str = "image2musicsheet" + return RunSpec( + name=f"{run_spec_name}", + scenario_spec=scenario_spec, + adapter_spec=adapter_spec, + metric_specs=metric_specs, + groups=[run_spec_name], + annotators=annotator_specs, + ) + + @run_spec_function("mmmu") def get_mmmu_spec(subject: str, question_type: str) -> RunSpec: scenario_spec = ScenarioSpec( @@ -318,37 +350,3 @@ def get_heim_human_eval_spec(question_type: str) -> RunSpec: metric_specs=metric_specs, groups=[run_spec_name], ) - - -@run_spec_function("sheetmusic2lilypond") -def get_sheetmusic2lilypond_spec() -> RunSpec: - scenario_spec = ScenarioSpec( - class_name="helm.benchmark.scenarios.vision_language.image2structure.sheetmusic2lilypond_scenario." - "SheetMusic2LilyPondScenario", - args={}, - ) - adapter_spec: AdapterSpec = get_generation_adapter_spec( - instructions="Generate the LilyPond code for the following sheet music. 
" - "Just give the LilyPond code without any explanation.", - max_tokens=1500, - ) - - metric_specs: List[MetricSpec] = get_image2structure_metric_specs( - generation_type="lilypond", - include_edit_similarity=False, - ) - annotator_specs: List[AnnotatorSpec] = [ - AnnotatorSpec( - class_name="helm.benchmark.annotation.image2structure.lilypond_compiler_annotator.LilyPondAnnotator", - ) - ] - - run_spec_name: str = "sheetmusic2lilypond" - return RunSpec( - name=run_spec_name, - scenario_spec=scenario_spec, - adapter_spec=adapter_spec, - metric_specs=metric_specs, - annotators=annotator_specs, - groups=[run_spec_name], - ) diff --git a/src/helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py b/src/helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py index 7ee5b01fee..77a18ddd1f 100644 --- a/src/helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +++ b/src/helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py @@ -102,10 +102,10 @@ def get_instances(self, output_path: str) -> List[Instance]: cache_dir=output_path, ) ): - question_id: str = row["num_id"] + question_uuid: str = row["uuid"] if row["category"][1:-1] != self._subset: hlog( - f"Skipping instance {question_id} as it belong in category" + f"Skipping instance {question_uuid} as it belong in category" f" {row['category']} and not {self._subset}" ) continue @@ -114,11 +114,13 @@ def get_instances(self, output_path: str) -> List[Instance]: row = self.preprocess_row(row, assets_path) # Step 2: Save the image locally - image_path: str = os.path.join(images_path, f"{question_id}.png") + image_path: str = os.path.join(images_path, f"{question_uuid}.png") if not os.path.exists(image_path): if not self._recompile_prompt: # 2.a row["image"].save(image_path) else: # 2.b + if "structure" not in row: + raise ValueError("Cannot recompile prompt without structure") structure: str = row["structure"] 
text: str = self.compile_and_save(structure, assets_path, image_path) row["text"] = text @@ -135,28 +137,40 @@ def get_instances(self, output_path: str) -> List[Instance]: # Step 5: Create the references # 5.a Create the reference containing the structure and the associated image. - multimedia_object: MultimediaObject - if os.path.exists(row["structure"]): - # 5.a.1 The structure is a path, therefore represent it as a multimedia object - # containing the files used to compile the structure (such as a repository - # containing the HTML, CSS, and JavaScript files used to generate a webpage) - multimedia_object = MultimediaObject( - [image_object, MediaObject(location=row["structure"], content_type="path/path")] + reference: Reference + if "structure" in row: + multimedia_object: MultimediaObject + if os.path.exists(row["structure"]): + # 5.a.1 The structure is a path, therefore represent it as a multimedia object + # containing the files used to compile the structure (such as a repository + # containing the HTML, CSS, and JavaScript files used to generate a webpage) + multimedia_object = MultimediaObject( + [image_object, MediaObject(location=row["structure"], content_type="path/path")] + ) + elif row["structure"] == PROCESSED: + # 5.a.2 The structure has been processed and is no longer present in the row + # This can be the case if the structure is a base64 encoding of an archive that + # has been extracted to a temporary path and processed but the path is no longer + # existing (deleted after the processing is done) + multimedia_object = MultimediaObject([image_object]) + else: + # 5.a.3 The structure is not a path, therefore it is directly a valid string + # representing the structure (such as LaTeX code) + multimedia_object = MultimediaObject([image_object]) + reference = Reference( + output=Output(text=row["text"], multimedia_content=multimedia_object), + tags=[CORRECT_TAG], ) - elif row["structure"] == PROCESSED: - # 5.a.2 The structure has been processed and 
is no longer present in the row - # This can be the case if the structure is a base64 encoding of an archive that - # has been extracted to a temporary path and processed but the path is no longer - # existing (deleted after the processing is done) - multimedia_object = MultimediaObject([image_object]) else: - # 5.a.3 The structure is not a path, therefore it is directly a valid string - # representing the structure (such as LaTeX code) - multimedia_object = MultimediaObject([image_object]) - reference = Reference( - output=Output(text=row["text"], multimedia_content=multimedia_object), - tags=[CORRECT_TAG], - ) + if "text" in row: + reference = Reference( + output=Output(text=row["text"], multimedia_content=MultimediaObject([image_object])), + tags=[CORRECT_TAG], + ) + else: + reference = Reference( + output=Output(multimedia_content=MultimediaObject([image_object])), tags=[CORRECT_TAG] + ) references: List[Reference] = [reference] # 5.b Create the reference containing the assets diff --git a/src/helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py b/src/helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py new file mode 100644 index 0000000000..e36d013d44 --- /dev/null +++ b/src/helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py @@ -0,0 +1,20 @@ +from helm.benchmark.scenarios.scenario import VALID_SPLIT +from helm.benchmark.scenarios.vision_language.image2structure.image2structure_scenario import Image2StructureScenario + + +class MusicSheetScenario(Image2StructureScenario): + BASE_PROMPT = ( + "Please generate the Lilypond code to generate a music sheet that looks like this image as much as feasible possible.\n" # noqa: E501 + "This music sheet was created by me, and I would like to recreate it using Lilypond." 
+ ) + HUGGINGFACE_DATASET_NAME = "stanford-crfm/i2s-musicsheet" + SUBSETS = ["music"] + + name = "image2musicsheet" + description = "Evaluate multimodal models on Lilypond generation to recreate a provided image" + + def __init__(self, subset: str, recompile_prompt: bool = True, split: str = VALID_SPLIT): + super().__init__(subset, recompile_prompt, split) + + def compile_and_save(self, structure: str, assets_path: str, destination_path: str) -> str: + raise Exception("Music sheets have no ground truth, compilation is not possible") diff --git a/src/helm/benchmark/scenarios/vision_language/image2structure/sheetmusic2lilypond_scenario.py b/src/helm/benchmark/scenarios/vision_language/image2structure/sheetmusic2lilypond_scenario.py deleted file mode 100644 index da8ab5ab71..0000000000 --- a/src/helm/benchmark/scenarios/vision_language/image2structure/sheetmusic2lilypond_scenario.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import List -import os - -from helm.benchmark.scenarios.scenario import ( - CORRECT_TAG, - TEST_SPLIT, - Instance, - Input, - Output, - Reference, - Scenario, -) -from helm.common.media_object import MediaObject, MultimediaObject - - -class SheetMusic2LilyPondScenario(Scenario): - """ - Sheet music to LilyPond scenario - LilyPond is a powerful music engraving program that produces high-quality sheet music. It allows - musicians to create elegant and readable scores, following the best traditions of - classical music engraving. 
- """ - - name = "sheetmusic2lilypond" - description = "Convert sheet music to LilyPond" - tags = ["vision-language", "sheetmusic2lilypond"] - - def get_instances(self, output_path: str) -> List[Instance]: - assert os.path.exists(output_path), f"Dataset does not exist at {output_path}" - instances: List[Instance] = [] - - for image_file in os.listdir(output_path): - if not image_file.endswith(".png"): - continue - - image_path: str = os.path.join(output_path, image_file) - content: List[MediaObject] = [ - MediaObject(location=image_path, content_type="image/png"), - ] - instances.append( - Instance( - Input(multimedia_content=MultimediaObject(content)), - references=[Reference(Output(multimedia_content=MultimediaObject(content)), tags=[CORRECT_TAG])], - split=TEST_SPLIT, - ) - ) - - return instances diff --git a/src/helm/benchmark/static/schema_vlm.yaml b/src/helm/benchmark/static/schema_vlm.yaml index f5bfed235f..8bffc90204 100644 --- a/src/helm/benchmark/static/schema_vlm.yaml +++ b/src/helm/benchmark/static/schema_vlm.yaml @@ -255,20 +255,24 @@ metric_groups: - name: num_output_tokens split: ${main_split} - - name: image_generation - display_name: Image generation + - name: generation_image + display_name: Generation (image) metrics: - name: pixel_similarity split: ${main_split} - name: compilation_success split: ${main_split} - - name: edit_similarity - split: ${main_split} - name: fid_similarity split: ${main_split} - name: earth_mover_similarity split: ${main_split} + - name: generation_text + display_name: Generation (text) + metrics: + - name: edit_similarity + split: ${main_split} + ############################################################ run_groups: - name: core_scenarios @@ -305,9 +309,9 @@ run_groups: description: Scenarios for evaluating the ability of Vision-Language models to generate structured outputs from images. 
category: All scenarios subgroups: - - chart2csv - image2latex - image2webpage + - image2musicsheet - name: hateful_memes display_name: Hateful Memes @@ -382,7 +386,8 @@ run_groups: description: The Image2LaTeX benchmark for converting images of mathematical equations, tables. algorithms and tikz to LaTeX. metric_groups: - accuracy - - image_generation + - generation_image + - generation_text - efficiency - general_information environment: @@ -400,7 +405,8 @@ run_groups: description: The Image2webpage benchmark for converting images of webpages to HTML/CSS/Javascript. metric_groups: - accuracy - - image_generation + - generation_image + - generation_text - efficiency - general_information environment: @@ -413,26 +419,27 @@ run_groups: when: "2024" language: English - - name: chart2csv - display_name: Chart2CSV - description: The Chart2CSV benchmark for converting images of charts to CSV. + - name: image2musicsheet + display_name: Image2musicsheet + description: The Image2musicsheet benchmark for converting images of music sheets to LilyPond. metric_groups: - accuracy + - generation_image - efficiency - general_information environment: - main_name: exact_match - main_split: test + main_name: earth_mover_similarity + main_split: valid taxonomy: - task: chart to CSV - what: plots + task: image-to-text + what: music sheets who: n/a when: "2024" language: English - - name: sheetmusic2lilypond - display_name: SheetMusic2LilyPond - description: Sheet music to LilyPond + - name: chart2csv + display_name: Chart2CSV + description: The Chart2CSV benchmark for converting images of charts to CSV. metric_groups: - accuracy - efficiency @@ -441,8 +448,8 @@ run_groups: main_name: exact_match main_split: test taxonomy: - task: Sheet music to LilyPond - what: sheet music + task: chart to CSV + what: plots who: n/a when: "2024" language: English