Update schema for instruction-following #2292

Merged 8 commits on Feb 6, 2024.
210 changes: 210 additions & 0 deletions src/helm/benchmark/static/schema_instruction_following.yaml
@@ -0,0 +1,210 @@
---
############################################################
adapter:
Contributor:
We should at some point figure out how to avoid this duplication.

Collaborator (Author):
Acknowledged. I have some idea of how to do this.

- name: method
description: The high-level strategy for converting instances into a prompt for the language model.
values:
- name: generation
description: Given the input, the model generates the output free-form.
- name: instructions
description: The description of the task that is included at the very beginning of the prompt.
- name: global_prefix
description: The string that is prepended to the prompt.
- name: instance_prefix
description: The string that is included before each instance (e.g., '\n\n').
- name: input_prefix
description: The string that is included before each input (e.g., 'Question:').
- name: input_suffix
description: The string that is included after each input (e.g., '\n').
- name: reference_prefix
description: The string that is included before each reference (for multiple-choice questions).
- name: reference_suffix
description: The string that is included after each reference (for multiple-choice questions).
- name: output_prefix
description: The string that is included before the correct answer/predicted output (e.g., 'Answer:').
- name: output_suffix
description: The string that is included after the correct answer/predicted output (e.g., '\n').
- name: substitutions
description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') applied to the prompt at the very end.
- name: max_train_instances
description: Maximum number of training instances to include in the prompt (currently selected by random sampling).
- name: max_eval_instances
description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.).
- name: num_outputs
description: Maximum number of possible outputs to generate by sampling multiple outputs.
- name: num_train_trials
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance.
- name: sample_train
description: If true, randomly sample N training examples; if false, select N consecutive training examples.
- name: model
description: Name of the language model (<creator_organization>/<model name>) to send requests to.
- name: model_deployment
description: Name of the language model (<host_organization>/<model name>) to send requests to.
- name: temperature
description: Temperature parameter used in generation.
- name: max_tokens
description: Maximum number of tokens to generate.
- name: stop_sequences
description: List of sequences, where we stop generation if we encounter any of them.
- name: random
description: Random seed (string), which guarantees reproducibility.
- name: multi_label
description: If true, for instances with multiple correct references, the gold answer is considered to be all of the correct references rather than any single one of them.
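A note for readers of this schema: the prefix/suffix fields above are the knobs the adapter uses to assemble the final prompt. Below is a minimal sketch of that assembly, with a hypothetical `build_prompt` helper and default values echoing the examples in the descriptions; it illustrates how the fields compose and is not HELM's actual Adapter implementation.

```python
from dataclasses import dataclass
from typing import List, Tuple


@dataclass
class AdapterSpec:
    # Field names mirror the schema above; defaults echo the
    # examples given in the field descriptions.
    instructions: str = ""
    global_prefix: str = ""
    instance_prefix: str = "\n\n"
    input_prefix: str = "Question: "
    input_suffix: str = "\n"
    output_prefix: str = "Answer: "
    output_suffix: str = "\n"


def build_prompt(spec: AdapterSpec,
                 train: List[Tuple[str, str]],
                 eval_input: str) -> str:
    """Compose instructions, in-context (input, output) pairs, and the
    eval input; the model continues after the final output_prefix."""
    blocks = []
    if spec.instructions:
        blocks.append(spec.instructions)
    for x, y in train:
        blocks.append(spec.input_prefix + x + spec.input_suffix
                      + spec.output_prefix + y + spec.output_suffix)
    # The eval instance ends with output_prefix so the model fills in
    # the answer free-form (the "generation" method above).
    blocks.append(spec.input_prefix + eval_input + spec.input_suffix
                  + spec.output_prefix)
    return spec.global_prefix + spec.instance_prefix.join(blocks)
```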

############################################################
metrics:
- name: Helpfulness
display_name: Helpfulness
short_display_name: Helpfulness
description: Whether the model appears to do what it is instructed to.
lower_is_better: false
- name: Understandability
display_name: Understandability
short_display_name: Understandability
description: Whether the answer is easy to comprehend.
lower_is_better: false
- name: Completeness
display_name: Completeness
short_display_name: Completeness
description: Whether the response provides a comprehensive answer to the question or request.
lower_is_better: false
- name: Conciseness
display_name: Conciseness
short_display_name: Conciseness
description: Whether the response is concise and to the point, without unnecessary repetition or elaboration.
lower_is_better: false
- name: Harmlessness
display_name: Harmlessness
short_display_name: Harmlessness
description: Whether the response avoids using toxic language or helping the user with harmful goals.
lower_is_better: false

############################################################
metric_groups:
- name: instruction_following_metrics
display_name: Instruction Following
metrics:
- name: Helpfulness
split: ${main_split}
- name: Understandability
split: ${main_split}
- name: Completeness
split: ${main_split}
- name: Conciseness
split: ${main_split}
- name: Harmlessness
split: ${main_split}
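The `${main_split}` placeholders above are resolved per run group: each run group below carries an `environment` block whose `main_split` value (e.g., `test` or `valid`) is substituted in. A minimal sketch of that substitution, assuming simple `${var}` templating (`resolve_placeholders` is an illustrative helper, not HELM's code):

```python
import re


def resolve_placeholders(template: str, environment: dict) -> str:
    """Replace ${var} placeholders with values from a run group's
    environment block."""
    return re.sub(r"\$\{(\w+)\}",
                  lambda m: environment[m.group(1)],
                  template)


# For open_assistant below, main_split is "valid":
env = {"main_name": "Helpfulness", "main_split": "valid"}
print(resolve_placeholders("${main_split}", env))  # -> valid
```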

############################################################
run_groups:
- name: instruction_following
display_name: Instruction Following
description: Given an open-ended instruction in natural language, the goal is to produce a text response that is helpful, understandable, complete, concise and harmless.
category: Evaluations
subgroups:
- anthropic_hh_rlhf
- grammar
- koala
- open_assistant
- self_instruct
- vicuna

- name: anthropic_hh_rlhf
display_name: Anthropic RLHF dataset
short_display_name: Anthropic RLHF dataset
description: The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue.
metric_groups:
- instruction_following_metrics
environment:
main_name: Helpfulness
main_split: test
taxonomy:
task: open-ended instruction following
what: "Human-LM dialogues and preference labels"
who: "Workers from MTurk and Upwork, language models from Anthropic"
when: "2022"
language: English

# Ideally, the name should be "best_chatgpt_prompts".
# But unfortunately the group name in the results is "grammar",
# so the schema has to match the same group name.
# TODO: Change the group name in the "grammar" run spec, and then change this group name.
Contributor:
Would changing this require regenerating the benchmark output files? If so, do we want to make this change before deploying?

Collaborator (Author):
The changes are non-trivial, so I would probably have to spend a day on it if we want to change "grammar" to "best_chatgpt_prompts". We can't easily re-run helm-run, so we would have to edit the JSON files post-hoc.

In any case, I would prefer to do it in a different pull request.

- name: grammar
display_name: Best ChatGPT Prompts
short_display_name: Best ChatGPT Prompts
description: A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).
metric_groups:
- instruction_following_metrics
environment:
main_name: Helpfulness
main_split: test
taxonomy:
task: open-ended instruction following
what: "Instructions for LLMs"
who: "Gridfiti Staff"
when: "2023"
language: English

- name: koala
display_name: Koala test dataset
short_display_name: Koala test dataset
description: The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models.
metric_groups:
- instruction_following_metrics
environment:
main_name: Helpfulness
main_split: test
taxonomy:
task: open-ended instruction following
what: "Instructions for LLMs"
who: "Web users"
when: "Before 2023"
language: English

- name: open_assistant
display_name: Open Assistant
short_display_name: Open Assistant
description: LAION’s OpenAssistant Conversations Dataset (OASST1), which consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation.
metric_groups:
- instruction_following_metrics
environment:
main_name: Helpfulness
main_split: valid
taxonomy:
task: open-ended instruction following
what: "Human-written dialogues and response rankings"
who: "Open Assistant participants"
when: "2023"
language: "35 languages"

- name: self_instruct
display_name: Self Instruct
short_display_name: Self Instruct
description: The manually curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)).
metric_groups:
- instruction_following_metrics
environment:
main_name: Helpfulness
main_split: test
taxonomy:
task: open-ended instruction following
what: "Instructions for LLMs"
who: "Authors of the research paper"
when: "2022"
language: English

- name: vicuna
display_name: Vicuna
short_display_name: Vicuna
description: The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models.
metric_groups:
- instruction_following_metrics
environment:
main_name: Helpfulness
main_split: test
taxonomy:
task: open-ended instruction following
what: "Instructions for LLMs"
who: "Unknown"
when: "Before 2023"
language: English
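Since the schema cross-references itself in several places (run groups name metric groups, metric groups name metrics, and the parent group names its subgroups), a quick consistency check can catch typos when editing this file. A minimal sketch, assuming PyYAML is installed and using the file path from this diff:

```python
import yaml

with open("src/helm/benchmark/static/schema_instruction_following.yaml") as f:
    schema = yaml.safe_load(f)

# Collect the defined names at each level.
metrics = {m["name"] for m in schema["metrics"]}
metric_groups = {g["name"] for g in schema["metric_groups"]}
run_groups = {g["name"] for g in schema["run_groups"]}

# Every metric referenced by a metric group must be defined.
for group in schema["metric_groups"]:
    for m in group["metrics"]:
        assert m["name"] in metrics, f"Unknown metric: {m['name']}"

# Every metric group and subgroup referenced by a run group must exist.
for group in schema["run_groups"]:
    for mg in group.get("metric_groups", []):
        assert mg in metric_groups, f"Unknown metric group: {mg}"
    for sub in group.get("subgroups", []):
        assert sub in run_groups, f"Unknown subgroup: {sub}"
```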