-
Notifications
You must be signed in to change notification settings - Fork 243
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update schema for instruction-following (#2292)
- Loading branch information
1 parent
027988d
commit e1d2c5a
Showing
1 changed file
with
210 additions
and
0 deletions.
There are no files selected for viewing
210 changes: 210 additions & 0 deletions
210
src/helm/benchmark/static/schema_instruction_following.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
--- | ||
############################################################ | ||
adapter: | ||
- name: method | ||
description: The high-level strategy for converting instances into a prompt for the language model. | ||
values: | ||
- name: generation | ||
description: Given the input, the model generates the output free-form. | ||
- name: instructions | ||
description: The description of the task that is included at the very beginning of the prompt. | ||
- name: global_prefix | ||
description: The string that is prepended to the prompt. | ||
- name: instance_prefix | ||
description: The string that is included before each instance (e.g., '\n\n'). | ||
- name: input_prefix | ||
description: The string that is included before each input (e.g., 'Question:'). | ||
- name: input_suffix | ||
description: The string that is included after each input (e.g., '\n'). | ||
- name: reference_prefix | ||
description: The string that is included before each reference (for multiple-choice questions). | ||
- name: reference_suffix | ||
description: The string that is included after each reference (for multiple-choice questions). | ||
- name: output_prefix | ||
description: The string that is included before the correct answer/predicted output (e.g., 'Answer:'). | ||
- name: output_suffix | ||
description: The string that is included after the correct answer/predicted output (e.g., '\n'). | ||
- name: substitutions | ||
description: A list of regular expression substitutions (e.g., replacing '\n' with ';\n') to perform at the very end on the prompt. | ||
- name: max_train_instances | ||
description: Maximum number of training instances to include in the prompt (currently by randomly sampling). | ||
- name: max_eval_instances | ||
description: Maximum number of instances to evaluate on (over all splits - test, valid, etc.). | ||
- name: num_outputs | ||
description: Maximum number of possible outputs to generate by sampling multiple outputs. | ||
- name: num_train_trials | ||
description: Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance. | ||
- name: sample_train | ||
description: If true, randomly sample N training examples; if false, select N consecutive training examples | ||
- name: model | ||
description: Name of the language model (<creator_organization>/<model name>) to send requests to. | ||
- name: model_deployment | ||
description: Name of the language model (<host_organization>/<model name>) to send requests to. | ||
- name: temperature | ||
description: Temperature parameter used in generation. | ||
- name: max_tokens | ||
description: Maximum number of tokens to generate. | ||
- name: stop_sequences | ||
description: List of sequences, where we stop generation if we encounter any of them. | ||
- name: random | ||
description: Random seed (string), which guarantees reproducibility. | ||
- name: multi_label | ||
description: If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references. | ||
|
||
############################################################ | ||
metrics: | ||
- name: Helpfulness | ||
display_name: Helpfulness | ||
short_display_name: Helpfulness | ||
description: Whether the model appears to do what it is instructed to. | ||
lower_is_better: false | ||
- name: Understandability | ||
display_name: Understandability | ||
short_display_name: Understandability | ||
description: Whether the answer is easy to comprehend. | ||
lower_is_better: false | ||
- name: Completeness | ||
display_name: Completeness | ||
short_display_name: Completeness | ||
description: Whether the response provides a comprehensive answer to the question or request. | ||
lower_is_better: false | ||
- name: Conciseness | ||
display_name: Conciseness | ||
short_display_name: Conciseness | ||
description: Whether the response is concise and to the point, without unnecessary repetition or elaboration. | ||
lower_is_better: false | ||
- name: Harmlessness | ||
display_name: Harmlessness | ||
short_display_name: Harmlessness | ||
description: Whether the response avoids using toxic language or helping the user with harmful goals. | ||
lower_is_better: false | ||
|
||
############################################################ | ||
metric_groups: | ||
- name: instruction_following_metrics | ||
display_name: Instruction Following | ||
metrics: | ||
- name: Helpfulness | ||
split: ${main_split} | ||
- name: Understandability | ||
split: ${main_split} | ||
- name: Completeness | ||
split: ${main_split} | ||
- name: Conciseness | ||
split: ${main_split} | ||
- name: Harmlessness | ||
split: ${main_split} | ||
|
||
############################################################ | ||
run_groups: | ||
- name: instruction_following | ||
display_name: Instruction Following | ||
description: Given an open-ended instruction in natural language, the goal is to produce a text response that is helpful, understandable, complete, concise and harmless. | ||
category: Evaluations | ||
subgroups: | ||
- anthropic_hh_rlhf | ||
- grammar | ||
- koala | ||
- open_assistant | ||
- self_instruct | ||
- vicuna | ||
|
||
- name: anthropic_hh_rlhf | ||
display_name: Anthropic RLHF dataset | ||
short_display_name: Anthropic RLHF dataset | ||
description: The dialogue datasets released by Anthropic to facilitate research in model helpfulness and harmlessness ([Bai et al., 2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., 2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance of each dialogue. | ||
metric_groups: | ||
- instruction_following_metrics | ||
environment: | ||
main_name: Helpfulness | ||
main_split: test | ||
taxonomy: | ||
task: open-ended instruction following | ||
what: "Human-LM dialogues and preference labels" | ||
who: "Workers from MTurk and Upwork, language models from Anthropic" | ||
when: "2022" | ||
language: English | ||
|
||
# Ideally, the name should be "best_chatgpt_prompts". | ||
# But unfortunately the group name in the results is "grammar", | ||
# so the schema has to match the same group name. | ||
# TODO: Change the group name in the "grammar" run spec, and then change this group name. | ||
- name: grammar | ||
display_name: Best ChatGPT Prompts | ||
short_display_name: Best ChatGPT Prompts | ||
description: A list of “best ChatGPT prompts to power your workflow” summarized by [GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/). | ||
metric_groups: | ||
- instruction_following_metrics | ||
environment: | ||
main_name: Helpfulness | ||
main_split: test | ||
taxonomy: | ||
task: open-ended instruction following | ||
what: "Instructions for LLMs" | ||
who: "Gridfiti Staff" | ||
when: "2023" | ||
language: English | ||
|
||
- name: koala | ||
display_name: Koala test dataset | ||
short_display_name: Koala test dataset | ||
description: The test dataset from the [Koala paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating instruction-following models. | ||
metric_groups: | ||
- instruction_following_metrics | ||
environment: | ||
main_name: Helpfulness | ||
main_split: test | ||
taxonomy: | ||
task: open-ended instruction following | ||
what: "Instructions for LLMs" | ||
who: "Web users" | ||
when: "Before 2023" | ||
language: English | ||
|
||
- name: open_assistant | ||
display_name: Open Assistant | ||
short_display_name: Open Assistant | ||
description: LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 conversation trees ([Köpf et al., 2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial prompt in each conversation. | ||
metric_groups: | ||
- instruction_following_metrics | ||
environment: | ||
main_name: Helpfulness | ||
main_split: valid | ||
taxonomy: | ||
task: open-ended instruction following | ||
what: "Human-written dialogues and response rankings" | ||
who: "Open Assistant participants" | ||
when: "2023" | ||
language: "35 languages" | ||
|
||
- name: self_instruct | ||
display_name: Self Instruct | ||
short_display_name: Self Instruct | ||
description: The manually-curated instructions from the Self-Instruct paper ([Wang et al., 2023](https://aclanthology.org/2023.acl-long.754.pdf)). | ||
metric_groups: | ||
- instruction_following_metrics | ||
environment: | ||
main_name: Helpfulness | ||
main_split: test | ||
taxonomy: | ||
task: open-ended instruction following | ||
what: "Instructions for LLMs" | ||
who: "Authors of the research paper" | ||
when: "2022" | ||
language: English | ||
|
||
- name: vicuna | ||
display_name: Vicuna | ||
short_display_name: Vicuna | ||
description: The set of prompts used by the [Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate instruction-following models. | ||
metric_groups: | ||
- instruction_following_metrics | ||
environment: | ||
main_name: Helpfulness | ||
main_split: test | ||
taxonomy: | ||
task: open-ended instruction following | ||
what: "Instructions for LLMs" | ||
who: "Unknown" | ||
when: "Before 2023" | ||
language: English |