diff --git a/demo.py b/demo.py index a12f1d6c396..fcbe9d78dec 100644 --- a/demo.py +++ b/demo.py @@ -17,23 +17,30 @@ print(account.usages) # Make a request -request = Request(model="ai21/j1-large", prompt="Life is like a box of", echo_prompt=True) +request = Request( + model="ai21/j2-large", model_deployment="ai21/j2-large", prompt="Life is like a box of", echo_prompt=True +) request_result: RequestResult = service.make_request(auth, request) print(request_result.completions[0].text) # Expect different responses for the same request but with different values for `random`. # Passing in the same value for `random` guarantees the same results. -request = Request(prompt="Life is like a box of", random="1") +request = Request(model="ai21/j2-large", model_deployment="ai21/j2-large", prompt="Life is like a box of", random="1") request_result = service.make_request(auth, request) print(request_result.completions[0].text) # How to get the embedding for some text -request = Request(model="openai/text-similarity-ada-001", prompt="Life is like a box of", embedding=True) +request = Request( + model="openai/text-similarity-ada-002", + model_deployment="openai/text-similarity-ada-002", + prompt="Life is like a box of", + embedding=True, +) request_result = service.make_request(auth, request) print(request_result.embedding) # Tokenize -request = TokenizationRequest(tokenizer="ai21/j1-jumbo", text="Tokenize me please.") +request = TokenizationRequest(tokenizer="ai21/j2-jumbo", text="Tokenize me please.") tokenization_request_result: TokenizationRequestResult = service.tokenize(auth, request) print(f"Number of tokens: {len(tokenization_request_result.tokens)}") diff --git a/docs/tutorial.md b/docs/tutorial.md index d02fd9cb5cc..cc103e11f6d 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -2,20 +2,20 @@ This tutorial will explain how to use the HELM command line tools to run benchmarks, aggregate statistics, and visualize results. -We will run two runs using the `mmlu` scenario on the `huggingface/gpt-2` model. The `mmlu` scenario implements the **Massive Multitask Language (MMLU)** benchmark from [this paper](https://arxiv.org/pdf/2009.03300.pdf), and consists of a Question Answering (QA) task using a dataset with questions from 57 subjects such as elementary mathematics, US history, computer science, law, and more. Note that GPT-2 performs poorly on MMLU, so this is just a proof of concept. We will run two runs: the first using questions about anatomy, and the second using questions about philosophy. +We will run two runs using the `mmlu` scenario on the `openai/gpt2` model. The `mmlu` scenario implements the **Massive Multitask Language (MMLU)** benchmark from [this paper](https://arxiv.org/pdf/2009.03300.pdf), and consists of a Question Answering (QA) task using a dataset with questions from 57 subjects such as elementary mathematics, US history, computer science, law, and more. Note that GPT-2 performs poorly on MMLU, so this is just a proof of concept. We will run two runs: the first using questions about anatomy, and the second using questions about philosophy. ## Using `helm-run` `helm-run` is a command line tool for running benchmarks. -To run this benchmark using the HELM command-line tools, we need to specify **run spec descriptions** that describes the desired runs. For this example, the run spec descriptions are `mmlu:subject=anatomy,model=huggingface/gpt-2` (for anatomy) and `mmlu:subject=philosophy,model=huggingface/gpt-2` (for philosophy). 
+To run this benchmark using the HELM command-line tools, we need to specify **run spec descriptions** that describe the desired runs. For this example, the run spec descriptions are `mmlu:subject=anatomy,model=openai/gpt2` (for anatomy) and `mmlu:subject=philosophy,model=openai/gpt2` (for philosophy).

Next, we need to create a **run spec configuration file** containing these run spec descriptions. A run spec configuration file is a text file containing `RunEntries` serialized to JSON, where each entry in `RunEntries` contains a run spec description. The `description` field of each entry should be a **run spec description**. Create a text file named `run_specs.conf` with the following contents:

```
entries: [
-    {description: "mmlu:subject=anatomy,model=huggingface/gpt2", priority: 1},
-    {description: "mmlu:subject=philosophy,model=huggingface/gpt2", priority: 1},
+    {description: "mmlu:subject=anatomy,model=openai/gpt2", priority: 1},
+    {description: "mmlu:subject=philosophy,model=openai/gpt2", priority: 1},
]
```

@@ -35,7 +35,7 @@ The meaning of the additional arguments are as follows:
- The environment directory is `prod_env/` by default and can be set using `--local-path`. Credentials for making API calls should be added to a `credentials.conf` file in this directory.
- The output directory is `benchmark_output/` by default and can be set using `--output-path`.

-After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain a two sub-directories named `mmlu:subject=anatomy,model=huggingface_gpt-2` and `mmlu:subject=philosophy,model=huggingface_gpt-2`. Note that the names of these sub-directories is based on the run spec descriptions we used earlier, but with `/` replaced with `_`.
+After running this command, navigate to the `benchmark_output/runs/v1/` directory. This should contain two sub-directories named `mmlu:subject=anatomy,model=openai_gpt2` and `mmlu:subject=philosophy,model=openai_gpt2`. Note that the names of these sub-directories are based on the run spec descriptions we used earlier, but with `/` replaced with `_`.

Each output sub-directory will contain several JSON files that were generated during the corresponding run:

diff --git a/scripts/compute_request_limits.py b/scripts/compute_request_limits.py
index a5060e37255..7d79e1b848e 100644
--- a/scripts/compute_request_limits.py
+++ b/scripts/compute_request_limits.py
@@ -1,10 +1,11 @@
 # This script is used to find out the max_prompt_length and max_prompt_length_plus_tokens for a given model.
 # You must set max_attempts to 1 in retry.py to make it work.
# Example usage: -# python compute_request_limits.py --model_name="writer/palmyra-base" --tokenizer_name="Writer/palmyra-base" +# python compute_request_limits.py --model_deployment_name="writer/palmyra-base" --tokenizer_name="Writer/palmyra-base" from typing import Any, Optional, Dict from helm.proxy.clients.auto_client import AutoClient +from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment from helm.common.request import Request from helm.common.tokenization_request import TokenizationRequest @@ -40,6 +41,7 @@ def get_number_of_tokens(prompt: str, tokenizer: Tokenizer, tokenizer_name: str) def try_request( client: Any, + model_deployment_name: str, model_name: str, tokenizer_name: str, tokenizer: Tokenizer, @@ -58,6 +60,7 @@ def try_request( try: request = Request( model=model_name, + model_deployment=model_deployment_name, prompt=prefix + " ".join(["hello"] * (sequence_length - num_tokens_prefix - num_tokens_suffix)) + suffix, max_tokens=num_tokens, ) @@ -78,6 +81,7 @@ class RequestLimits: def figure_out_max_prompt_length( client: AutoClient, + model_deployment_name: str, model_name: str, tokenizer_name: str, upper_bound: int = 9500, @@ -95,7 +99,9 @@ def figure_out_max_prompt_length( with tqdm(total=int(math.log2(upper_bound - lower_bound))) as pbar: while lower_bound < upper_bound: middle = math.ceil((lower_bound + upper_bound) / 2) - if try_request(client, model_name, tokenizer_name, tokenizer, middle, 0, prefix, suffix): + if try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, middle, 0, prefix, suffix + ): lower_bound = middle else: upper_bound = middle - 1 @@ -117,6 +123,7 @@ def figure_out_max_prompt_length( def figure_out_max_prompt_length_plus_tokens( client: Any, # Client, + model_deployment_name: str, model_name: str, tokenizer_name: str, max_prompt_length: int, @@ -130,6 +137,7 @@ def figure_out_max_prompt_length_plus_tokens( # Check if there is a limit (some model accept as many tokens as you want) if try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -148,7 +156,17 @@ def figure_out_max_prompt_length_plus_tokens( with tqdm(total=int(math.log2(upper_bound - lower_bound))) as pbar: while lower_bound < upper_bound: middle = math.ceil((lower_bound + upper_bound) / 2) - if try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length, middle, prefix, suffix): + if try_request( + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length, + middle, + prefix, + suffix, + ): lower_bound = middle else: upper_bound = middle - 1 @@ -159,6 +177,7 @@ def figure_out_max_prompt_length_plus_tokens( def check_limits( client: AutoClient, + model_deployment_name: str, model_name: str, tokenizer_name: str, limits: RequestLimits, @@ -172,7 +191,9 @@ def check_limits( max_prompt_length = limits.max_prompt_length if max_prompt_length < 0: print("No limit on the number of tokens") - if not try_request(client, model_name, tokenizer_name, tokenizer, 2**32 - 2, 0, prefix, suffix): + if not try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, 2**32 - 2, 0, prefix, suffix + ): print(f"There is a limit on the number of tokens. 
Params: max_prompt_length={2**32 - 2}, max_tokens=1") result = False else: @@ -180,15 +201,37 @@ def check_limits( # If there is no limit on the number of tokens, max_prompt_length should be -1 # And we should not be here # Check that max_prompt_length is ok - if not try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length, 0, prefix, suffix): + if not try_request( + client, model_deployment_name, model_name, tokenizer_name, tokenizer, max_prompt_length, 0, prefix, suffix + ): print(f"max_prompt_length is too big. Params: max_prompt_length={max_prompt_length}, max_tokens=1") result = False # Check that max_prompt_length + 1 is not ok - if try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length + 1, 0, prefix, suffix): + if try_request( + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length + 1, + 0, + prefix, + suffix, + ): print(f"max_prompt_length could be bigger. Params: max_prompt_length={max_prompt_length+1}, max_tokens=1") result = False # Check that max_prompt_length - 1 is ok - if not try_request(client, model_name, tokenizer_name, tokenizer, max_prompt_length - 1, 0, prefix, suffix): + if not try_request( + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max_prompt_length - 1, + 0, + prefix, + suffix, + ): print( f"max_prompt_length ssems to be inconsistent. max_prompt_length={max_prompt_length} " f"is ok but max_prompt_length={max_prompt_length-1} is not, with max_tokens=0" @@ -203,7 +246,15 @@ def check_limits( if max_prompt_length_plus_tokens < 0: print("No limit on the number of tokens") if not try_request( - client, model_name, tokenizer_name, tokenizer, max(1, max_prompt_length), 2**32 - 2, prefix, suffix + client, + model_deployment_name, + model_name, + tokenizer_name, + tokenizer, + max(1, max_prompt_length), + 2**32 - 2, + prefix, + suffix, ): print( f"There is a limit on the number of tokens. 
Params: max_prompt_length={max_prompt_length}," @@ -216,6 +267,7 @@ def check_limits( # If there is no limit on the number of tokens, we skip this test if not try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -231,6 +283,7 @@ def check_limits( result = False if try_request( client, + model_deployment_name, model_name, tokenizer_name, tokenizer, @@ -251,7 +304,8 @@ def check_limits( def get_args(): # model_name, tokenizer_name, prefix and suffix are passed as arguments parser = argparse.ArgumentParser() - parser.add_argument("--model_name", type=str, default="writer/palmyra-base") + parser.add_argument("--model_deployment_name", type=str, default="writer/palmyra-base") + parser.add_argument("--model_name", type=str, default="") parser.add_argument("--tokenizer_name", type=str, default="Writer/palmyra-base") parser.add_argument( "--prefix", @@ -268,6 +322,10 @@ def get_args(): parser.add_argument("--credentials_path", type=str, default="../prod_env/credentials.conf") parser.add_argument("--cache_path", type=str, default="../prod_env/cache") args = parser.parse_args() + + if args.model_name == "": + model_deployment: ModelDeployment = get_model_deployment(args.model_deployment_name) + args.model_name = model_deployment.model_name return args @@ -287,7 +345,12 @@ def main(): print("client successfully created") print("Making short request...") - request = Request(model=args.model_name, prompt=args.prefix + "hello" + args.suffix, max_tokens=1) + request = Request( + model=args.model_name, + model_deployment=args.model_deployment_name, + prompt=args.prefix + "hello" + args.suffix, + max_tokens=1, + ) response = client.make_request(request) if not response.success: raise ValueError("Request failed") @@ -305,7 +368,7 @@ def main(): print("========== Figure out max_prompt_length ==========") limits: RequestLimits = figure_out_max_prompt_length( - client, args.model_name, args.tokenizer_name, prefix=args.prefix, suffix=args.suffix + client, args.model_deployment_name, args.model_name, args.tokenizer_name, prefix=args.prefix, suffix=args.suffix ) print(f"max_prompt_length: {limits.max_prompt_length}") print("===================================================") @@ -314,6 +377,7 @@ def main(): print("========== Figure out max_prompt_length_plus_tokens ==========") max_prompt_length_plus_tokens: int = figure_out_max_prompt_length_plus_tokens( client, + args.model_deployment_name, args.model_name, args.tokenizer_name, max_prompt_length=limits.max_prompt_length, @@ -328,7 +392,13 @@ def main(): # Check the limits print("========== Check the limits ==========") result: bool = check_limits( - client, args.model_name, args.tokenizer_name, limits, prefix=args.prefix, suffix=args.suffix + client, + args.model_deployment_name, + args.model_name, + args.tokenizer_name, + limits, + prefix=args.prefix, + suffix=args.suffix, ) if result: print("All limits are respected") diff --git a/src/helm/benchmark/adaptation/adapter_spec.py b/src/helm/benchmark/adaptation/adapter_spec.py index 36bd147c07d..82648c7be6e 100644 --- a/src/helm/benchmark/adaptation/adapter_spec.py +++ b/src/helm/benchmark/adaptation/adapter_spec.py @@ -73,7 +73,11 @@ class AdapterSpec: # Decoding parameters (inherited by `Request`) - # Model to make the request to (need to fill in) + # Model deployment to make the request to (need to fill in) + model_deployment: str = "" + + # DEPRECATED: old model field, kept for backward compatibility + # TODO: Remove this once we do not wish to support backward 
compatibility anymore. model: str = "" # Temperature to use diff --git a/src/helm/benchmark/adaptation/adapters/adapter.py b/src/helm/benchmark/adaptation/adapters/adapter.py index 3dd65132863..f03fff5578e 100644 --- a/src/helm/benchmark/adaptation/adapters/adapter.py +++ b/src/helm/benchmark/adaptation/adapters/adapter.py @@ -21,7 +21,7 @@ class Adapter(ABC): def __init__(self, adapter_spec: AdapterSpec, tokenizer_service: TokenizerService): self.adapter_spec: AdapterSpec = adapter_spec self.window_service: WindowService = WindowServiceFactory.get_window_service( - adapter_spec.model, tokenizer_service + adapter_spec.model_deployment, tokenizer_service ) @abstractmethod diff --git a/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py b/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py index 75d8b622f59..339a220788b 100644 --- a/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/binary_ranking_adapter.py @@ -50,6 +50,7 @@ def generate_requests( ) request = Request( model=self.adapter_spec.model, + model_deployment=self.adapter_spec.model_deployment, prompt=prompt.text, num_completions=self.adapter_spec.num_outputs, temperature=self.adapter_spec.temperature, diff --git a/src/helm/benchmark/adaptation/adapters/generation_adapter.py b/src/helm/benchmark/adaptation/adapters/generation_adapter.py index ec251ce20a2..c4945852653 100644 --- a/src/helm/benchmark/adaptation/adapters/generation_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/generation_adapter.py @@ -39,6 +39,7 @@ def generate_requests( ) request = Request( model=self.adapter_spec.model, + model_deployment=self.adapter_spec.model_deployment, prompt=prompt.text, num_completions=self.adapter_spec.num_outputs, temperature=self.adapter_spec.temperature, diff --git a/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py b/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py index b1b43828e0b..87e51a9b212 100644 --- a/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/language_modeling_adapter.py @@ -120,6 +120,7 @@ def _generate_requests(self, eval_instance: Instance) -> List[RequestState]: ) request = Request( model=self.adapter_spec.model, + model_deployment=self.adapter_spec.model_deployment, prompt=prompt_text, num_completions=1, temperature=0, @@ -168,6 +169,7 @@ def _generate_requests(self, eval_instance: Instance) -> List[RequestState]: request = Request( model=self.adapter_spec.model, + model_deployment=self.adapter_spec.model_deployment, prompt=prompt_text, num_completions=1, temperature=0, diff --git a/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py b/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py index aa8b6c9d204..a5126373502 100644 --- a/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py @@ -29,6 +29,7 @@ def generate_requests( request = Request( model=self.adapter_spec.model, + model_deployment=self.adapter_spec.model_deployment, multimodal_prompt=prompt.multimedia_object, num_completions=self.adapter_spec.num_outputs, temperature=self.adapter_spec.temperature, diff --git a/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py 
b/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py index 152bffc34db..18fbe8508f4 100644 --- a/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py @@ -27,6 +27,7 @@ def generate_requests( request = Request( model=self.adapter_spec.model, + model_deployment=self.adapter_spec.model_deployment, multimodal_prompt=prompt.multimedia_object, num_completions=self.adapter_spec.num_outputs, temperature=self.adapter_spec.temperature, diff --git a/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py b/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py index 18796dbebcd..4b9d3801bf0 100644 --- a/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py @@ -22,6 +22,7 @@ def teardown_method(self, _): def test_construct_prompt(self): adapter_spec: AdapterSpec = AdapterSpec( model="simple/model1", + model_deployment="simple/model1", method=ADAPT_GENERATION_MULTIMODAL, global_prefix="[START]", instructions="Please answer the following question about the images.", @@ -91,6 +92,7 @@ def test_construct_prompt(self): def test_construct_prompt_multi_label(self): adapter_spec: AdapterSpec = AdapterSpec( model="simple/model1", + model_deployment="simple/model1", method=ADAPT_GENERATION_MULTIMODAL, global_prefix="[START]", instructions="Please answer the following question about the images.", @@ -171,6 +173,7 @@ def test_construct_prompt_idefics_instruct_example(self): """ adapter_spec: AdapterSpec = AdapterSpec( model="simple/model1", + model_deployment="simple/model1", method=ADAPT_GENERATION_MULTIMODAL, input_prefix="User: ", input_suffix="", diff --git a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py index 5cf4e4d9410..08e8569b0be 100644 --- a/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py @@ -55,6 +55,7 @@ def generate_requests( ) request = Request( model=self.adapter_spec.model, + model_deployment=self.adapter_spec.model_deployment, prompt=prompt.text, num_completions=1, top_k_per_token=self.adapter_spec.num_outputs, diff --git a/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py b/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py index 31429cc2529..d9a3d79fa41 100644 --- a/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py @@ -41,6 +41,7 @@ def construct_request_state( ) -> RequestState: request = Request( model=self.adapter_spec.model, + model_deployment=self.adapter_spec.model_deployment, prompt=prompt.text, num_completions=1, temperature=0, diff --git a/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py b/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py index 7d327b8dd90..d2791ed532f 100644 --- a/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/test_generation_adapter.py @@ -33,6 +33,7 @@ def test_adapt(self): def test_construct_prompt(self): adapter_spec = 
AdapterSpec( model="openai/davinci", + model_deployment="openai/davinci", method=ADAPT_GENERATION, input_prefix="", input_suffix="", @@ -59,7 +60,12 @@ def test_construct_prompt(self): def test_construct_prompt_with_truncation(self): adapter_spec = AdapterSpec( - model="openai/davinci", method=ADAPT_GENERATION, input_prefix="", output_prefix="", max_tokens=100 + model="openai/davinci", + model_deployment="openai/davinci", + method=ADAPT_GENERATION, + input_prefix="", + output_prefix="", + max_tokens=100, ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) correct_reference = Reference(Output(text=""), tags=[CORRECT_TAG]) @@ -80,7 +86,9 @@ def test_construct_prompt_with_truncation(self): assert prompt_text.count("eval") == 1948 def test_sample_examples_without_references(self): - adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=1) + adapter_spec = AdapterSpec( + method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=1 + ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) all_train_instances = [ Instance(Input(text="prompt1"), references=[]), @@ -92,7 +100,9 @@ def test_sample_examples_without_references(self): assert len(examples) == 1 def test_sample_examples_open_ended_generation(self): - adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=3) + adapter_spec = AdapterSpec( + method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=3 + ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) all_train_instances: List[Instance] = [ @@ -106,7 +116,9 @@ def test_sample_examples_open_ended_generation(self): assert seed0_examples != seed1_examples, "Examples should differ when changing the seed" def test_sample_examples_open_ended_generation_stress(self): - adapter_spec = AdapterSpec(method=ADAPT_GENERATION, model="openai/ada", max_train_instances=5) + adapter_spec = AdapterSpec( + method=ADAPT_GENERATION, model="openai/ada", model_deployment="openai/ada", max_train_instances=5 + ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) all_train_instances: List[Instance] = [ @@ -146,7 +158,11 @@ def test_sample_examples_open_ended_generation_stress(self): def test_multiple_correct_reference(self): adapter_spec = AdapterSpec( - method=ADAPT_GENERATION, model="openai/ada", max_train_instances=2, sample_train=False + method=ADAPT_GENERATION, + model="openai/ada", + model_deployment="openai/ada", + max_train_instances=2, + sample_train=False, ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) train_instances = [ @@ -191,7 +207,12 @@ def test_multiple_correct_reference(self): def test_multiple_correct_reference_multi_label(self): adapter_spec = AdapterSpec( - method=ADAPT_GENERATION, model="openai/ada", max_train_instances=2, multi_label=True, sample_train=False + method=ADAPT_GENERATION, + model="openai/ada", + model_deployment="openai/ada", + max_train_instances=2, + multi_label=True, + sample_train=False, ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) train_instances = [ diff --git a/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py b/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py index 721285e4508..588dfe6b1f8 100644 --- a/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +++ 
b/src/helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py @@ -29,6 +29,7 @@ def test_construct_language_modeling_prompt(self): method=ADAPT_LANGUAGE_MODELING, input_prefix="", model="openai/davinci", + model_deployment="openai/davinci", output_prefix="", max_tokens=0, ) @@ -52,6 +53,7 @@ def test_fits_tokens_within_context_window(self): method=ADAPT_LANGUAGE_MODELING, input_prefix="", model="openai/curie", + model_deployment="openai/curie", output_prefix="", max_tokens=0, ) @@ -83,6 +85,7 @@ def test_prompt_truncated(self): method=ADAPT_LANGUAGE_MODELING, input_prefix="", model="anthropic/claude-v1.3", + model_deployment="anthropic/claude-v1.3", output_prefix="", max_tokens=0, ) @@ -120,6 +123,7 @@ def test_prompt_truncated(self): method=ADAPT_LANGUAGE_MODELING, input_prefix="", model="anthropic/claude-v1.3", + model_deployment="anthropic/claude-v1.3", output_prefix="", max_tokens=2000, ) @@ -149,6 +153,7 @@ def test_prompt_wrapping(self): method=ADAPT_LANGUAGE_MODELING, input_prefix="", model="openai/code-davinci-002", + model_deployment="openai/code-davinci-002", output_prefix="", max_tokens=0, ) diff --git a/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py b/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py index ab4ef43d9b9..06cb1dec6cf 100644 --- a/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +++ b/src/helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py @@ -7,7 +7,9 @@ class TestMultipleChoiceJointAdapter(TestAdapter): def test_sample_examples(self): - adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=4) + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=4 + ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) all_train_instances = [ Instance(Input(text="say no"), references=[Reference(Output(text="no"), tags=[CORRECT_TAG])]), @@ -27,13 +29,20 @@ def test_sample_examples(self): assert examples[3].input.text == "say yes3" def test_sample_examples_no_train_instances(self): - adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=2) + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", model_deployment="openai/ada", max_train_instances=2 + ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) examples = adapter.sample_examples(all_train_instances=[], seed=0) assert len(examples) == 0 def test_sample_examples_greater_max_train_instances(self): - adapter_spec = AdapterSpec(method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=10) + adapter_spec = AdapterSpec( + method=ADAPT_MULTIPLE_CHOICE_JOINT, + model="openai/ada", + model_deployment="openai/ada", + max_train_instances=10, + ) adapter = AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) all_train_instances = [ Instance(Input(text="say no"), references=[Reference(Output(text="no"), tags=[CORRECT_TAG])]), @@ -46,7 +55,11 @@ def test_sample_examples_greater_max_train_instances(self): def test_multiple_correct_reference(self): adapter_spec = AdapterSpec( - method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", max_train_instances=10, sample_train=False + method=ADAPT_MULTIPLE_CHOICE_JOINT, + model="openai/ada", + model_deployment="openai/ada", + max_train_instances=10, + sample_train=False, ) adapter = 
AdapterFactory.get_adapter(adapter_spec, self.tokenizer_service) train_instances = [ @@ -102,6 +115,7 @@ def test_multiple_correct_reference_multi_label(self): adapter_spec = AdapterSpec( method=ADAPT_MULTIPLE_CHOICE_JOINT, model="openai/ada", + model_deployment="openai/ada", max_train_instances=10, multi_label=True, sample_train=False, diff --git a/src/helm/benchmark/config_registry.py b/src/helm/benchmark/config_registry.py new file mode 100644 index 00000000000..0fab062949c --- /dev/null +++ b/src/helm/benchmark/config_registry.py @@ -0,0 +1,14 @@ +from helm.benchmark.model_deployment_registry import register_deployments_if_not_already_registered +from helm.benchmark.model_metadata_registry import register_metadatas_if_not_already_registered +from helm.benchmark.tokenizer_config_registry import register_tokenizers_if_not_already_registered + +HELM_REGISTERED: bool = False + + +def register_helm_configurations(): + global HELM_REGISTERED + if not HELM_REGISTERED: + register_metadatas_if_not_already_registered() + register_tokenizers_if_not_already_registered() + register_deployments_if_not_already_registered() + HELM_REGISTERED = True diff --git a/src/helm/benchmark/huggingface_registration.py b/src/helm/benchmark/huggingface_registration.py index ff444fd4fda..bc833ddea2e 100644 --- a/src/helm/benchmark/huggingface_registration.py +++ b/src/helm/benchmark/huggingface_registration.py @@ -1,5 +1,6 @@ import os from typing import Optional +from datetime import date from helm.benchmark.model_deployment_registry import ( ClientSpec, @@ -7,7 +8,15 @@ WindowServiceSpec, register_model_deployment, ) +from helm.benchmark.model_metadata_registry import ( + get_model_metadata, + ModelMetadata, + register_model_metadata, + TEXT_MODEL_TAG, + FULL_FUNCTIONALITY_TEXT_MODEL_TAG, +) from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec, register_tokenizer_config +from helm.common.hierarchical_logger import hlog def register_huggingface_model( @@ -30,6 +39,27 @@ def register_huggingface_model( args=object_spec_args, ), ) + + # We check if the model is already registered because we don't want to + # overwrite the model metadata if it's already registered. + # If it's not registered, we register it, as otherwise an error would be thrown + # when we try to register the model deployment. 
+ try: + _ = get_model_metadata(model_name=helm_model_name) + except ValueError: + register_model_metadata( + ModelMetadata( + name=helm_model_name, + creator_organization_name="Unknown", + display_name=helm_model_name, + description=helm_model_name, + access="open", + release_date=date.today(), + tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], + ) + ) + hlog(f"Registered default metadata for model {helm_model_name}") + register_model_deployment(model_deployment) tokenizer_config = TokenizerConfig( name=helm_model_name, diff --git a/src/helm/benchmark/metrics/basic_metrics.py b/src/helm/benchmark/metrics/basic_metrics.py index c9a107628cc..aa0eb8ae949 100644 --- a/src/helm/benchmark/metrics/basic_metrics.py +++ b/src/helm/benchmark/metrics/basic_metrics.py @@ -60,8 +60,8 @@ def compute_estimated_time_from_prompt_size_and_num_output_tokens( num_output_tokens: int, ) -> Optional[float]: estimated_runtime: Optional[float] - if request_state.request.model in inference_runtimes_dict: - inference_runtimes_dict_for_model = inference_runtimes_dict[request_state.request.model] + if request_state.request.model_deployment in inference_runtimes_dict: + inference_runtimes_dict_for_model = inference_runtimes_dict[request_state.request.model_deployment] runtime_per_output_token: float = inference_runtimes_dict_for_model["runtime_per_output_token"] raw_runtimes_for_prompt_tokens: Dict[str, float] = inference_runtimes_dict_for_model[ "runtime_for_prompt_tokens" @@ -583,7 +583,9 @@ def compute_efficiency_metrics( # Fetch the right `Tokenizer` depending on the model defined in `AdapterSpec` # and calculate the number of tokens in the prompt. tokenizer_service: TokenizerService = metric_service - window_service: WindowService = WindowServiceFactory.get_window_service(adapter_spec.model, tokenizer_service) + window_service: WindowService = WindowServiceFactory.get_window_service( + adapter_spec.model_deployment, tokenizer_service + ) prompt: str = request_state.request.prompt num_prompt_tokens: int = window_service.get_num_tokens(prompt) @@ -618,14 +620,16 @@ def compute_efficiency_metrics( # Compute efficiency metrics for training. 
training_co2_cost: Optional[float] - if request_state.request.model in self.training_efficiency_dict["carbon"]: - training_co2_cost = self.training_efficiency_dict["carbon"][request_state.request.model]["value"] + if request_state.request.model_deployment in self.training_efficiency_dict["carbon"]: + training_co2_cost = self.training_efficiency_dict["carbon"][request_state.request.model_deployment]["value"] else: training_co2_cost = None training_energy_cost: Optional[float] - if request_state.request.model in self.training_efficiency_dict["energy"]: - training_energy_cost = self.training_efficiency_dict["energy"][request_state.request.model]["value"] + if request_state.request.model_deployment in self.training_efficiency_dict["energy"]: + training_energy_cost = self.training_efficiency_dict["energy"][request_state.request.model_deployment][ + "value" + ] else: training_energy_cost = None @@ -799,7 +803,9 @@ def compute_logprob_and_length(request_state: RequestState, window_service: Wind num_choices = len(references) tokenizer_service: TokenizerService = metric_service - window_service: WindowService = WindowServiceFactory.get_window_service(adapter_spec.model, tokenizer_service) + window_service: WindowService = WindowServiceFactory.get_window_service( + adapter_spec.model_deployment, tokenizer_service + ) reference_stats: Dict[ReferenceKey, ReferenceStat] = {} for request_state in reference_request_states: assert request_state.reference_index is not None and request_state.request_mode is not None diff --git a/src/helm/benchmark/metrics/disinformation_metrics.py b/src/helm/benchmark/metrics/disinformation_metrics.py index fd8d180bee9..2025d06eb7e 100644 --- a/src/helm/benchmark/metrics/disinformation_metrics.py +++ b/src/helm/benchmark/metrics/disinformation_metrics.py @@ -86,7 +86,7 @@ def _compute_wedging_human_eval( results: List[Stat] = [] instance_first_line = request_state.instance.input.text.splitlines()[0] human_evaluations = _fetch_human_evaluation_results(eval_cache_path, WEDGING_HUMAN_EVAL_FILE) - model_results = human_evaluations.get(adapter_spec.model) + model_results = human_evaluations.get(adapter_spec.model_deployment) if not model_results: # Trying to evaluate a model we don't have annotations for @@ -125,7 +125,7 @@ def _compute_reiteration_human_eval( """ results: List[Stat] = [] human_evaluations = _fetch_human_evaluation_results(eval_cache_path, REITERATION_HUMAN_EVAL_FILE) - model_results = human_evaluations.get(adapter_spec.model) + model_results = human_evaluations.get(adapter_spec.model_deployment) if not model_results: # Trying to evaluate a model we don't have annotations for return results diff --git a/src/helm/benchmark/metrics/dry_run_metrics.py b/src/helm/benchmark/metrics/dry_run_metrics.py index 1f2618b0dd0..4fe2126630e 100644 --- a/src/helm/benchmark/metrics/dry_run_metrics.py +++ b/src/helm/benchmark/metrics/dry_run_metrics.py @@ -38,7 +38,9 @@ def process(self, request_state: RequestState) -> List[Stat]: stats.append(Stat(MetricName("max_num_completion_tokens")).add(request.num_completions * request.max_tokens)) # Get number of tokens in the prompt - tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model, self.metric_service) + tokenizer: WindowService = WindowServiceFactory.get_window_service( + request.model_deployment, self.metric_service + ) num_prompt_tokens: int = tokenizer.get_num_tokens(request.prompt) stats.append(Stat(MetricName("num_prompt_tokens")).add(num_prompt_tokens)) diff --git 
a/src/helm/benchmark/metrics/summarization_metrics.py b/src/helm/benchmark/metrics/summarization_metrics.py index f4cb1cff9ae..3a61ae77413 100644 --- a/src/helm/benchmark/metrics/summarization_metrics.py +++ b/src/helm/benchmark/metrics/summarization_metrics.py @@ -168,7 +168,6 @@ def evaluate_generation( metric_service: MetricService, eval_cache_path: str, ) -> List[Stat]: - refs: List[str] = [self._remove_braces(ref.output.text) for ref in request_state.instance.references] inp: str = self._remove_braces(request_state.instance.input.text) @@ -182,9 +181,9 @@ def evaluate_generation( self.humaneval = self._load_humaneval(eval_cache_path) # get human evaluation scores if they exist - model_name = adapter_spec.model.replace("/", "_") + deployment = adapter_spec.model_deployment.replace("/", "_") for metric_name in ["faithfulness", "relevance", "coherence"]: - val = self.humaneval[(metric_name, model_name, request_state.instance.id, pred)] + val = self.humaneval[(metric_name, deployment, request_state.instance.id, pred)] result.append(Stat(MetricName(f"HumanEval-{metric_name}")).add(float(val))) except KeyError: pass @@ -196,8 +195,8 @@ def evaluate_generation( if self.qa_fact_eval is None: self._load_qafacteval(eval_cache_path) assert self.qa_fact_eval is not None - model_name = adapter_spec.model.replace("/", "_") - val = self.qa_fact_eval[model_name][(request_state.instance.id, pred)] + deployment = adapter_spec.model_deployment.replace("/", "_") + val = self.qa_fact_eval[deployment][(request_state.instance.id, pred)] result.append(Stat(MetricName("QAFactEval")).add(float(val))) except KeyError: pass diff --git a/src/helm/benchmark/metrics/test_classification_metrics.py b/src/helm/benchmark/metrics/test_classification_metrics.py index f5f3d23ff18..d15b4b9fef7 100644 --- a/src/helm/benchmark/metrics/test_classification_metrics.py +++ b/src/helm/benchmark/metrics/test_classification_metrics.py @@ -26,7 +26,7 @@ def _request_state(prediction: str, options: List[_Option]): request_mode=None, train_trial_index=0, output_mapping=None, - request=Request(), + request=Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002"), result=RequestResult( success=True, embedding=[], completions=[Sequence(text=prediction, logprob=0.0, tokens=[])], cached=False ), diff --git a/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py index 297dcf60736..9d6444fcbe9 100644 --- a/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +++ b/src/helm/benchmark/metrics/tokens/auto_token_cost_estimator.py @@ -39,5 +39,5 @@ def estimate_tokens(self, request: Request, metric_service: MetricService) -> in """ Estimate the number of tokens for a given request based on the organization. 
""" - token_cost_estimator: TokenCostEstimator = self._get_estimator(request.model_organization) + token_cost_estimator: TokenCostEstimator = self._get_estimator(request.model_host) return token_cost_estimator.estimate_tokens(request, metric_service) diff --git a/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py index 7880686adee..1bd22893061 100644 --- a/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +++ b/src/helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py @@ -17,6 +17,8 @@ def estimate_tokens(self, request: Request, metric_service: MetricService) -> in """ total_estimated_tokens: int = request.num_completions * request.max_tokens if request.echo_prompt: - window_service: WindowService = WindowServiceFactory.get_window_service(request.model, metric_service) + window_service: WindowService = WindowServiceFactory.get_window_service( + request.model_deployment, metric_service + ) total_estimated_tokens += window_service.get_num_tokens(request.prompt) return GooseAITokenCounter.account_for_base_tokens(total_estimated_tokens) diff --git a/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py index d52807ab781..429075fe949 100644 --- a/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +++ b/src/helm/benchmark/metrics/tokens/openai_token_cost_estimator.py @@ -15,7 +15,7 @@ def estimate_tokens(self, request: Request, metric_service: MetricService) -> in Add num_tokens(prompt) if Request.echo_prompt is True. """ - tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model, metric_service) + tokenizer: WindowService = WindowServiceFactory.get_window_service(request.model_deployment, metric_service) num_prompt_tokens: int = tokenizer.get_num_tokens(request.prompt) total_estimated_tokens: int = num_prompt_tokens + request.num_completions * request.max_tokens diff --git a/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py index 2964beeefbf..2d202413f66 100644 --- a/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +++ b/src/helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py @@ -10,6 +10,8 @@ def setup_method(self, method): def test_estimate_tokens(self): request = Request( + model="openai/text-davinci-002", + model_deployment="openai/text-davinci-002", prompt="The Center for Research on Foundation Models (CRFM) is " "an interdisciplinary initiative born out of the Stanford " "Institute for Human-Centered Artificial Intelligence (HAI) " diff --git a/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py b/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py index 44da189cf35..e4f07463e92 100644 --- a/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +++ b/src/helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py @@ -12,7 +12,6 @@ class TestOpenAITokenCostEstimator: - # The following prompt has 51 tokens according to the GPT-2 tokenizer TEST_PROMPT: str = ( "The Center for Research on Foundation Models (CRFM) is " @@ -37,13 +36,21 @@ def setup_method(self, method): self._mock_metric_service.tokenize = MagicMock(return_value=tokenization_request_result) def test_estimate_tokens(self): - request = Request(prompt=TestOpenAITokenCostEstimator.TEST_PROMPT, num_completions=3, max_tokens=100) + request 
= Request(
+            model="openai/text-davinci-002",
+            model_deployment="openai/text-davinci-002",
+            prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
+            num_completions=3,
+            max_tokens=100,
+        )

         # Prompt + max number of tokens from completions = 51 + 3 * 100
         assert self._token_cost_estimator.estimate_tokens(request, self._mock_metric_service) == 51 + 3 * 100

     def test_estimate_tokens_with_echo_prompt(self):
         request = Request(
+            model="openai/text-davinci-002",
+            model_deployment="openai/text-davinci-002",
             prompt=TestOpenAITokenCostEstimator.TEST_PROMPT,
             echo_prompt=True,
             num_completions=1,
diff --git a/src/helm/benchmark/model_deployment_registry.py b/src/helm/benchmark/model_deployment_registry.py
index 44d6f88668e..c3f9d361472 100644
--- a/src/helm/benchmark/model_deployment_registry.py
+++ b/src/helm/benchmark/model_deployment_registry.py
@@ -1,16 +1,18 @@
 import os
 from typing import Dict, Optional, List
 from dataclasses import dataclass
+import importlib_resources as resources

 import cattrs
 import yaml

 from helm.common.hierarchical_logger import hlog
 from helm.common.object_spec import ObjectSpec
-from helm.proxy.models import ALL_MODELS, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, MODEL_NAME_TO_MODEL, TEXT_MODEL_TAG, Model
+from helm.benchmark.model_metadata_registry import ModelMetadata, get_model_metadata, CONFIG_PACKAGE

-MODEL_DEPLOYMENTS_FILE = "model_deployments.yaml"
+MODEL_DEPLOYMENTS_FILE: str = "model_deployments.yaml"
+DEPLOYMENTS_REGISTERED: bool = False


 class ClientSpec(ObjectSpec):
@@ -23,43 +25,63 @@ class WindowServiceSpec(ObjectSpec):

 @dataclass(frozen=True)
 class ModelDeployment:
-    """A model deployment is an accessible instance of this model (e.g. a hosted endpoint).
-
-    A model can have multiple model deployments."""
+    """
+    A model deployment is an accessible instance of this model (e.g., a hosted endpoint).
+    A model can have multiple model deployments.
+    """

     name: str
-    """Name of the model deployment."""
+    """Name of the model deployment. Usually formatted as "<host_organization>/<engine>".
+    Example: "huggingface/t5-11b"."""

     client_spec: ClientSpec
     """Specification for instantiating the client for this model deployment."""

     model_name: Optional[str] = None
-    """Name of the model that this model deployment is for.
-
-    If unset, defaults to the the same value as `name`."""
+    """Name of the model that this model deployment is for. Refers to the field "name" in the Model class.
+    If unset, defaults to the same value as `name`."""

     tokenizer_name: Optional[str] = None
-    """Tokenizer for this model deployment.
-
-    If unset, auto-inferred by the WindowService."""
+    """Tokenizer for this model deployment. If unset, auto-inferred by the WindowService."""

     window_service_spec: Optional[WindowServiceSpec] = None
-    """Specification for instantiating the window service for this model deployment"""
+    """Specification for instantiating the window service for this model deployment."""

     max_sequence_length: Optional[int] = None
     """Maximum sequence length for this model deployment."""

     max_request_length: Optional[int] = None
     """Maximum request length for this model deployment.
-
     If unset, defaults to the same value as max_sequence_length."""

     max_sequence_and_generated_tokens_length: Optional[int] = None
     """The max length of the model input and output tokens.
-    Some models (like Anthropic/Claude and Megatron) have a specific limit sequence length + max_token.
+    If unset, defaults to INT_MAX (i.e., no limit)."""
-    If unset, defaults to INT_MAX (i.e.
no limit).""" + deprecated: bool = False + """Whether this model deployment is deprecated.""" + + @property + def host_organization(self) -> str: + """ + Extracts the host group from the model deployment name. + Example: "huggingface" from "huggingface/t5-11b" + This can be different from the creator organization (for example "together") + """ + return self.name.split("/")[0] + + @property + def engine(self) -> str: + """ + Extracts the model engine from the model deployment name. + Example: 'ai21/j1-jumbo' => 'j1-jumbo' + """ + return self.name.split("/")[1] + + def __post_init__(self): + if not self.model_name: + object.__setattr__(self, "model_name", self.name) @dataclass(frozen=True) @@ -67,28 +89,32 @@ class ModelDeployments: model_deployments: List[ModelDeployment] -_name_to_model_deployment: Dict[str, ModelDeployment] = {} +ALL_MODEL_DEPLOYMENTS: List[ModelDeployment] = [] +DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT: Dict[str, ModelDeployment] = { + deployment.name: deployment for deployment in ALL_MODEL_DEPLOYMENTS +} +# ===================== REGISTRATION FUNCTIONS ==================== # def register_model_deployment(model_deployment: ModelDeployment) -> None: - hlog(f"Registered model deployment {model_deployment.name}") - _name_to_model_deployment[model_deployment.name] = model_deployment - - # Auto-register a model with this name if none exists - model_name = model_deployment.model_name or model_deployment.name - if model_name not in MODEL_NAME_TO_MODEL: - model = Model( - group="unknown", - name=model_name, - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ) - MODEL_NAME_TO_MODEL[model_name] = model - ALL_MODELS.append(model) - hlog(f"Registered default metadata for model {model_name}") + # hlog(f"Registered model deployment {model_deployment.name}") + DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_deployment.name] = model_deployment + ALL_MODEL_DEPLOYMENTS.append(model_deployment) + + model_name: str = model_deployment.model_name or model_deployment.name + + try: + model_metadata: ModelMetadata = get_model_metadata(model_name) + deployment_names: List[str] = model_metadata.deployment_names or [model_metadata.name] + if model_deployment.name not in deployment_names: + if model_metadata.deployment_names is None: + model_metadata.deployment_names = [] + model_metadata.deployment_names.append(model_deployment.name) + except ValueError: + raise ValueError(f"Model deployment {model_deployment.name} has no corresponding model metadata") def register_model_deployments_from_path(path: str) -> None: - global _name_to_model_deployment hlog(f"Reading model deployments from {path}...") with open(path, "r") as f: raw = yaml.safe_load(f) @@ -97,12 +123,63 @@ def register_model_deployments_from_path(path: str) -> None: register_model_deployment(model_deployment) -def maybe_register_model_deployments_from_base_path(base_path: str) -> None: - """Register model deployments from prod_env/model_deployments.yaml""" - path = os.path.join(base_path, MODEL_DEPLOYMENTS_FILE) +def maybe_register_model_deployments_from_base_path(path: str) -> None: + """Register model deployments from yaml file if the path exists.""" if os.path.exists(path): register_model_deployments_from_path(path) -def get_model_deployment(name: str) -> Optional[ModelDeployment]: - return _name_to_model_deployment.get(name) +# ===================== UTIL FUNCTIONS ==================== # +def get_model_deployment(name: str, warn_deprecated: bool = False) -> ModelDeployment: + register_deployments_if_not_already_registered() + if name 
not in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT: + raise ValueError(f"Model deployment {name} not found") + deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[name] + if deployment.deprecated and warn_deprecated: + hlog(f"WARNING: DEPLOYMENT Model deployment {name} is deprecated") + return deployment + + +def get_model_deployments_by_host_organization(host_organization: str) -> List[str]: + """ + Gets models by host organization. + Example: together => [" together/bloom", "together/t0pp", ...] + """ + register_deployments_if_not_already_registered() + return [ + deployment.name for deployment in ALL_MODEL_DEPLOYMENTS if deployment.host_organization == host_organization + ] + + +def get_model_deployment_host_organization(name: str) -> str: + """ + Extracts the host organization from the model deployment name. + Example: "huggingface/t5-11b" => "huggingface" + """ + deployment: ModelDeployment = get_model_deployment(name) + return deployment.host_organization + + +def get_metadata_for_deployment(deployment_name: str) -> ModelMetadata: + """ + Given a deployment name, returns the corresponding model metadata. + """ + deployment: ModelDeployment = get_model_deployment(deployment_name) + return get_model_metadata(deployment.model_name or deployment.name) + + +def get_model_names_with_tokenizer(tokenizer_name: str) -> List[str]: + """Get all the name of the models with tokenizer `tokenizer_name`.""" + register_deployments_if_not_already_registered() + deployments: List[ModelDeployment] = [ + deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.tokenizer_name == tokenizer_name + ] + return [deployment.model_name or deployment.name for deployment in deployments] + + +def register_deployments_if_not_already_registered() -> None: + global DEPLOYMENTS_REGISTERED + if not DEPLOYMENTS_REGISTERED: + path: str = resources.files(CONFIG_PACKAGE).joinpath(MODEL_DEPLOYMENTS_FILE) + maybe_register_model_deployments_from_base_path(path) + DEPLOYMENTS_REGISTERED = True diff --git a/src/helm/benchmark/model_metadata_registry.py b/src/helm/benchmark/model_metadata_registry.py index e95c8a520b1..335c75c5b4b 100644 --- a/src/helm/benchmark/model_metadata_registry.py +++ b/src/helm/benchmark/model_metadata_registry.py @@ -1,52 +1,129 @@ import os -from typing import Optional, List +from typing import Dict, Optional, List from dataclasses import dataclass, field from datetime import date +import importlib_resources as resources import dacite import yaml -from helm.proxy.models import ALL_MODELS, MODEL_NAME_TO_MODEL, Model +# Different modalities +TEXT_MODEL_TAG: str = "TEXT_MODEL_TAG" +IMAGE_MODEL_TAG: str = "IMAGE_MODEL_TAG" +CODE_MODEL_TAG: str = "CODE_MODEL_TAG" +EMBEDDING_MODEL_TAG: str = "EMBEDDING_MODEL_TAG" -MODEL_METADATA_FILE = "model_metadata.yaml" +# Some model APIs have limited functionalities +FULL_FUNCTIONALITY_TEXT_MODEL_TAG: str = "FULL_FUNCTIONALITY_TEXT_MODEL_TAG" +LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG: str = "LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG" +# ChatML format +CHATML_MODEL_TAG: str = "CHATML_MODEL_TAG" -@dataclass(frozen=True) +# OpenAI Chat format +OPENAI_CHATGPT_MODEL_TAG: str = "openai_chatgpt" + +# For Anthropic models +ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "ANTHROPIC_CLAUDE_1_MODEL_TAG" +ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "ANTHROPIC_CLAUDE_2_MODEL_TAG" + +# Models which emit garbage tokens when temperature=0. +BUGGY_TEMP_0_TAG: str = "BUGGY_TEMP_0_TAG" + +# Models that are used for ablations and fine-grained analyses. 
+# These models are selected specifically because of their low marginal cost to evaluate.
+ABLATION_MODEL_TAG: str = "ABLATION_MODEL_TAG"
+
+# Some models (e.g., T5) have stripped newlines.
+# So we cannot use \n as a stop sequence for these models.
+NO_NEWLINES_TAG: str = "NO_NEWLINES_TAG"
+
+# Some models (e.g., UL2) require a prefix (e.g., [NLG]) in the
+# prompts to indicate the mode before doing inference.
+NLG_PREFIX_TAG: str = "NLG_PREFIX_TAG"
+
+# Some models can follow instructions.
+INSTRUCTION_FOLLOWING_MODEL_TAG: str = "INSTRUCTION_FOLLOWING_MODEL_TAG"
+
+# For Vision-language models (VLMs)
+VISION_LANGUAGE_MODEL_TAG: str = "VISION_LANGUAGE_MODEL_TAG"
+
+
+CONFIG_PACKAGE = "helm.config"
+MODEL_METADATA_FILE: str = "model_metadata.yaml"
+METADATAS_REGISTERED: bool = False
+
+
+# Frozen is set to false as the model_deployment_registry.py file
+# might populate the deployment_names field.
+@dataclass(frozen=False)
 class ModelMetadata:
     name: str
-    """Name of the model e.g. "meta/llama-2"."""
+    """Name of the model group (e.g., "openai/davinci"). This is the name of the model,
+    not the name of the deployment.
+    Usually formatted as "<creator_organization>/<engine>". Example: "ai21/j1-jumbo"."""

-    creator_organization: Optional[str] = None
-    """Organization that originally created the model (e.g. "meta")."""
+    creator_organization_name: str
+    """Name of the organization that created the model."""

-    access: Optional[str] = None
-    """How this model is available (e.g., limited).
+    display_name: str
+    """Name that is going to be displayed to the user (on the website, etc.)."""

-    If there are multiple deployments, this should be the most permissive access across
-    all deployments."""
+    description: str
+    """Description of the model, to be displayed on the website."""

-    todo: bool = False
-    """Whether we have yet to evaluate this model."""
+    access: str
+    """Description of the access level of the model. Should be one of the following:
+    - "open": the model is open-source and can be downloaded from the internet.
+    - "closed": not accessible
+    - "limited": accessible with an API key.
+    If there are multiple deployments, this should be the most permissive access across all deployments."""

-    release_date: Optional[date] = None
-    """When the model was released."""
+    release_date: date
+    """Release date of the model."""

-    num_parameters: Optional[int] = None
-    """The number of model parameters.
+    tags: List[str] = field(default_factory=list)
+    """Tags corresponding to the properties of the model."""

+    num_parameters: Optional[int] = None
+    """Number of parameters in the model.
     This should be a string as the number of parameters is usually a round number (175B),
     but we set it as an int for plotting purposes."""

-    tags: List[str] = field(default_factory=list)
-    """"""
+    deployment_names: Optional[List[str]] = None
+    """List of the model deployments for this model. Should at least contain one model deployment.
+    Refers to the field "name" in the ModelDeployment class. Defaults to a single model deployment
+    with the same name as the model."""
+
+    @property
+    def creator_organization(self) -> str:
+        """
+        Extracts the creator organization from the model name.
+        Example: 'ai21/j1-jumbo' => 'ai21'
+        This can be different from the hosting organization.
+        """
+        return self.name.split("/")[0]
+
+    @property
+    def engine(self) -> str:
+        """
+        Extracts the model engine from the model name.
+ Example: 'ai21/j1-jumbo' => 'j1-jumbo' + """ + return self.name.split("/")[1] @dataclass(frozen=True) class ModelMetadataList: - models: List[ModelMetadata] + models: List[ModelMetadata] = field(default_factory=list) +ALL_MODELS_METADATA: List[ModelMetadata] = [] +MODEL_NAME_TO_MODEL_METADATA: Dict[str, ModelMetadata] = {model.name: model for model in ALL_MODELS_METADATA} + + +# ===================== REGISTRATION FUNCTIONS ==================== # def register_model_metadata_from_path(path: str) -> None: """Register model configurations from the given path.""" with open(path, "r") as f: @@ -55,17 +132,77 @@ def register_model_metadata_from_path(path: str) -> None: # serialization format for dates model_metadata_list = dacite.from_dict(ModelMetadataList, raw) for model_metadata in model_metadata_list.models: - model = Model( - group="none", # TODO: Group should be part of model deployment, not model - name=model_metadata.name, - tags=model_metadata.tags, - ) - MODEL_NAME_TO_MODEL[model_metadata.name] = model - ALL_MODELS.append(model) - - -def maybe_register_model_metadata_from_base_path(base_path: str) -> None: - """Register model metadata from prod_env/model_metadata.yaml""" - path = os.path.join(base_path, MODEL_METADATA_FILE) + register_model_metadata(model_metadata) + + +def register_model_metadata(model_metadata: ModelMetadata) -> None: + """Register a single model configuration.""" + # hlog(f"Registered model metadata {model_metadata.name}") + ALL_MODELS_METADATA.append(model_metadata) + MODEL_NAME_TO_MODEL_METADATA[model_metadata.name] = model_metadata + + +def maybe_register_model_metadata_from_base_path(path: str) -> None: + """Register model metadata from yaml file if the path exists.""" if os.path.exists(path): register_model_metadata_from_path(path) + + +# ===================== UTIL FUNCTIONS ==================== # +def get_model_metadata(model_name: str) -> ModelMetadata: + """Get the `ModelMetadata` given the model name.""" + register_metadatas_if_not_already_registered() + if model_name not in MODEL_NAME_TO_MODEL_METADATA: + raise ValueError(f"No model with name: {model_name}") + + return MODEL_NAME_TO_MODEL_METADATA[model_name] + + +def get_model_creator_organization(model_name: str) -> str: + """Get the model's creator organization given the model name.""" + model: ModelMetadata = get_model_metadata(model_name) + return model.creator_organization + + +def get_all_models() -> List[str]: + """Get all model names.""" + register_metadatas_if_not_already_registered() + return list(MODEL_NAME_TO_MODEL_METADATA.keys()) + + +def get_models_by_creator_organization(organization: str) -> List[str]: + """ + Gets models by creator organization. + Example: ai21 => ai21/j1-jumbo, ai21/j1-grande, ai21/j1-large.
+ """ + register_metadatas_if_not_already_registered() + return [model.name for model in ALL_MODELS_METADATA if model.creator_organization == organization] + + +def get_model_names_with_tag(tag: str) -> List[str]: + """Get all the name of the models with tag `tag`.""" + register_metadatas_if_not_already_registered() + return [model.name for model in ALL_MODELS_METADATA if tag in model.tags] + + +def get_all_text_models() -> List[str]: + """Get all text model names.""" + return get_model_names_with_tag(TEXT_MODEL_TAG) + + +def get_all_code_models() -> List[str]: + """Get all code model names.""" + return get_model_names_with_tag(CODE_MODEL_TAG) + + +def get_all_instruction_following_models() -> List[str]: + """Get all instruction-following model names.""" + return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG) + + +def register_metadatas_if_not_already_registered() -> None: + global METADATAS_REGISTERED + if not METADATAS_REGISTERED: + path: str = resources.files(CONFIG_PACKAGE).joinpath(MODEL_METADATA_FILE) + maybe_register_model_metadata_from_base_path(path) + METADATAS_REGISTERED = True diff --git a/src/helm/benchmark/presentation/contamination.py b/src/helm/benchmark/presentation/contamination.py index 21985f60101..0e876bc42d9 100644 --- a/src/helm/benchmark/presentation/contamination.py +++ b/src/helm/benchmark/presentation/contamination.py @@ -5,7 +5,7 @@ import yaml from helm.common.hierarchical_logger import htrack, hlog -from helm.proxy.models import MODEL_NAME_TO_MODEL +from helm.benchmark.model_metadata_registry import MODEL_NAME_TO_MODEL_METADATA from helm.benchmark.presentation.schema import Schema @@ -70,7 +70,7 @@ def validate_contamination(contamination: Contamination, schema: Schema): """Make sure models and groups in contamination are defined according to `schema`.""" for point in contamination.points: for model in point.models: - if model not in MODEL_NAME_TO_MODEL: + if model not in MODEL_NAME_TO_MODEL_METADATA: hlog(f"WARNING: model {model} not defined in schema") for group in point.groups: if group not in schema.name_to_run_group: diff --git a/src/helm/benchmark/presentation/run_specs.conf b/src/helm/benchmark/presentation/run_specs.conf index a1009c29e35..5b17b35deea 100644 --- a/src/helm/benchmark/presentation/run_specs.conf +++ b/src/helm/benchmark/presentation/run_specs.conf @@ -483,36 +483,36 @@ entries: [ {description: "lsat_qa:model=text_code,task=assignment", priority: 3} {description: "lsat_qa:model=text_code,task=miscellaneous", priority: 3} - {description: "lextreme:subset=brazilian_court_decisions_judgment,model=all", priority: 5} - {description: "lextreme:subset=brazilian_court_decisions_unanimity,model=all", priority: 5} - {description: "lextreme:subset=german_argument_mining,model=all", priority: 5} - {description: "lextreme:subset=greek_legal_code_chapter,model=all", priority: 5} - {description: "lextreme:subset=greek_legal_code_subject,model=all", priority: 5} - {description: "lextreme:subset=greek_legal_code_volume,model=all", priority: 5} - {description: "lextreme:subset=swiss_judgment_prediction,model=all", priority: 5} - {description: "lextreme:subset=online_terms_of_service_unfairness_levels,model=all", priority: 5} - {description: "lextreme:subset=online_terms_of_service_clause_topics,model=all", priority: 5} - {description: "lextreme:subset=covid19_emergency_event,model=all", priority: 5} - {description: "lextreme:subset=multi_eurlex_level_1,model=all", priority: 5} - {description: "lextreme:subset=multi_eurlex_level_2,model=all", 
priority: 5} - {description: "lextreme:subset=multi_eurlex_level_3,model=all", priority: 5} - {description: "lextreme:subset=greek_legal_ner,model=all", priority: 5} - {description: "lextreme:subset=legalnero,model=all", priority: 5} - {description: "lextreme:subset=lener_br,model=all", priority: 5} - {description: "lextreme:subset=mapa_coarse,model=all", priority: 5} - {description: "lextreme:subset=mapa_fine,model=all", priority: 5} - - {description: "lex_glue:subset=ecthr_a,model=all", priority: 3} - {description: "lex_glue:subset=ecthr_b,model=all", priority: 3} - {description: "lex_glue:subset=scotus,model=all", priority: 3} - {description: "lex_glue:subset=eurlex,model=all", priority: 3} - {description: "lex_glue:subset=ledgar,model=all", priority: 3} - {description: "lex_glue:subset=unfair_tos,model=all", priority: 3} - {description: "lex_glue:subset=case_hold,model=all", priority: 3} - - {description: "billsum_legal_summarization:model=all", priority: 3}, - {description: "multilexsum_legal_summarization:model=all", priority: 3}, - {description: "eurlexsum_legal_summarization:model=all", priority: 3}, + {description: "lextreme:subset=brazilian_court_decisions_judgment,model=text", priority: 5} + {description: "lextreme:subset=brazilian_court_decisions_unanimity,model=text", priority: 5} + {description: "lextreme:subset=german_argument_mining,model=text", priority: 5} + {description: "lextreme:subset=greek_legal_code_chapter,model=text", priority: 5} + {description: "lextreme:subset=greek_legal_code_subject,model=text", priority: 5} + {description: "lextreme:subset=greek_legal_code_volume,model=text", priority: 5} + {description: "lextreme:subset=swiss_judgment_prediction,model=text", priority: 5} + {description: "lextreme:subset=online_terms_of_service_unfairness_levels,model=text", priority: 5} + {description: "lextreme:subset=online_terms_of_service_clause_topics,model=text", priority: 5} + {description: "lextreme:subset=covid19_emergency_event,model=text", priority: 5} + {description: "lextreme:subset=multi_eurlex_level_1,model=text", priority: 5} + {description: "lextreme:subset=multi_eurlex_level_2,model=text", priority: 5} + {description: "lextreme:subset=multi_eurlex_level_3,model=text", priority: 5} + {description: "lextreme:subset=greek_legal_ner,model=text", priority: 5} + {description: "lextreme:subset=legalnero,model=text", priority: 5} + {description: "lextreme:subset=lener_br,model=text", priority: 5} + {description: "lextreme:subset=mapa_coarse,model=text", priority: 5} + {description: "lextreme:subset=mapa_fine,model=text", priority: 5} + + {description: "lex_glue:subset=ecthr_a,model=text", priority: 3} + {description: "lex_glue:subset=ecthr_b,model=text", priority: 3} + {description: "lex_glue:subset=scotus,model=text", priority: 3} + {description: "lex_glue:subset=eurlex,model=text", priority: 3} + {description: "lex_glue:subset=ledgar,model=text", priority: 3} + {description: "lex_glue:subset=unfair_tos,model=text", priority: 3} + {description: "lex_glue:subset=case_hold,model=text", priority: 3} + + {description: "billsum_legal_summarization:model=text", priority: 3}, + {description: "multilexsum_legal_summarization:model=text", priority: 3}, + {description: "eurlexsum_legal_summarization:model=text", priority: 3}, # MedQA {description: "med_qa:model=biomedical", priority: 2} diff --git a/src/helm/benchmark/presentation/schema.py b/src/helm/benchmark/presentation/schema.py index 829f33e7336..3a0b7877b8c 100644 --- a/src/helm/benchmark/presentation/schema.py +++ 
b/src/helm/benchmark/presentation/schema.py @@ -207,9 +207,12 @@ class RunGroup(Field): # Which adapter_spec fields we should preserve when displaying methods for this group # When we are constructing a table where the rows are methods, what constitutes a "method" is given by the set of - # adapter keys. By default, this should just be "model" (e.g., BLOOM), where details like "num_train_instances" are - # "marginalized out". However, for ablations, we want to include both "model" and "num_train_instances". - adapter_keys_shown: List[str] = field(default_factory=lambda: ["model"]) + # adapter keys. By default, this should just be "model_deployment" (e.g., BLOOM), where details like + # "num_train_instances" are "marginalized out". However, for ablations, we want to include both "model_deployment" + # and "num_train_instances". + # NOTE: "model" is kept for backward compatibility reason. + # TODO: remove when we don't want helm-summarize to support runs before November 2023 anymore. + adapter_keys_shown: List[str] = field(default_factory=lambda: ["model_deployment", "model"]) @dataclass diff --git a/src/helm/benchmark/presentation/summarize.py b/src/helm/benchmark/presentation/summarize.py index 2880f0a419f..4d873338015 100644 --- a/src/helm/benchmark/presentation/summarize.py +++ b/src/helm/benchmark/presentation/summarize.py @@ -57,7 +57,10 @@ CONTAMINATION_STYLES, CONTAMINATION_LEVEL_STRONG, ) +from helm.benchmark.config_registry import register_helm_configurations from helm.benchmark.presentation.run_display import write_run_display_json +from helm.benchmark.model_deployment_registry import get_metadata_for_deployment +from helm.benchmark.model_metadata_registry import ModelMetadata OVERLAP_N_COUNT = 13 @@ -178,6 +181,8 @@ def get_method_display_name(model_display_name: Optional[str], info: Dict[str, A info = dict(info) if "model" in info: del info["model"] + if "model_deployment" in info: + del info["model_deployment"] return (model_display_name or "???") + (f" [{dict_to_str(info)}]" if len(info) > 0 else "") @@ -565,12 +570,12 @@ def write_cost_report(self): # TODO: move to write_executive_summary() models_to_costs: Dict[str, Dict[str]] = defaultdict(lambda: defaultdict(int)) for run in self.runs: - model: str = run.run_spec.adapter_spec.model + deployment: str = run.run_spec.adapter_spec.model_deployment for stat in run.stats: stat_name = stat.name.name if stat_name in Summarizer.COST_REPORT_FIELDS and not stat.name.split: - models_to_costs[model][stat_name] += stat.sum + models_to_costs[deployment][stat_name] += stat.sum # Do a second pass to add up the total number of tokens for costs in models_to_costs.values(): @@ -661,7 +666,7 @@ def get_cell(stats: List[Stat], compute_mean: bool = False, compute_sum: bool = for subgroup in self.expand_subgroups(group): for adapter_spec, runs in self.group_adapter_to_runs[subgroup.name].items(): filtered_runs = self.filter_runs_by_visibility(runs, subgroup) - models.add(adapter_spec.model) + models.add(adapter_spec.model_deployment) methods.add(adapter_spec.method) for run in filtered_runs: num_instances.extend(get_all_stats_by_name(run.stats, "num_instances")) @@ -870,33 +875,28 @@ def run_spec_names_to_url(run_spec_names: List[str]) -> str: model_order = [model.name for model in self.schema.models] def _adapter_spec_sort_key(spec): - index = model_order.index(spec.model) if spec.model in model_order else -1 - return (index, spec.model) + index = model_order.index(spec.model_deployment) if spec.model_deployment in model_order else -1 + return 
(index, spec.model_deployment) adapter_specs = list(sorted(adapter_specs, key=_adapter_spec_sort_key)) # Pull out only the keys of the method adapter_spec that is needed to # uniquely identify the method. - infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model"]) + infos = unique_simplification(list(map(asdict_without_nones, adapter_specs)), ["model_deployment", "model"]) assert len(adapter_specs) == len(infos), [adapter_specs, infos] # Populate the contents of the table rows = [] for adapter_spec, info in zip(adapter_specs, infos): - model_name: str = adapter_spec.model - - # Get the model display name from the schema. - # Fall back to using the model name as the model display name if the model is not - # defined in the schema. - model_display_name = ( - self.schema.name_to_model[model_name].display_name - if model_name in self.schema.name_to_model - else model_name + deployment: str = ( + adapter_spec.model_deployment if len(adapter_spec.model_deployment) > 0 else adapter_spec.model ) + model_metadata: ModelMetadata = get_metadata_for_deployment(deployment) + model_name: str = model_metadata.name runs = adapter_to_runs[adapter_spec] - display_name = get_method_display_name(model_display_name, info) + display_name = get_method_display_name(model_metadata.display_name, info) # Link to all the runs under this model if link_to_runs: @@ -1337,6 +1337,8 @@ def main(): else: raise ValueError("Exactly one of --release or --suite must be specified.") + register_helm_configurations() + # Output JSON files summarizing the benchmark results which will be loaded in the web interface summarizer = Summarizer( release=release, diff --git a/src/helm/benchmark/presentation/test_run_entry.py b/src/helm/benchmark/presentation/test_run_entry.py index 68d33424d81..86a3b53afc2 100644 --- a/src/helm/benchmark/presentation/test_run_entry.py +++ b/src/helm/benchmark/presentation/test_run_entry.py @@ -1,4 +1,5 @@ import os +import pytest from helm.common.object_spec import parse_object_spec from helm.benchmark.presentation.run_entry import read_run_entries @@ -6,11 +7,16 @@ from helm.benchmark import vlm_run_specs # noqa -def test_read_all_specs(): - """Read all the run entries and make sure they parse and we can instantiate them.""" +def list_fnames(): base_path = os.path.dirname(__file__) - for fname in os.listdir(base_path): - if fname.endswith(".conf"): - run_entries = read_run_entries([os.path.join(base_path, fname)]) - for entry in run_entries.entries: - construct_run_specs(parse_object_spec(entry.description)) + return [os.path.join(base_path, fname) for fname in os.listdir(base_path) if fname.endswith(".conf")] + + +class TestRunEntry: + """Read all the run entries and make sure they parse and we can instantiate them.""" + + @pytest.mark.parametrize("fname", list_fnames()) + def test_read_all_specs(self, fname: str): + run_entries = read_run_entries([fname]) + for entry in run_entries.entries: + construct_run_specs(parse_object_spec(entry.description)) diff --git a/src/helm/benchmark/run.py b/src/helm/benchmark/run.py index e87c546a070..cdf280d4364 100644 --- a/src/helm/benchmark/run.py +++ b/src/helm/benchmark/run.py @@ -14,6 +14,7 @@ from helm.benchmark.model_metadata_registry import register_model_metadata_from_path from helm.benchmark.model_deployment_registry import register_model_deployments_from_path +from helm.benchmark.config_registry import register_helm_configurations from helm.benchmark.adaptation.adapter_spec import AdapterSpec from helm.benchmark import 
vlm_run_specs # noqa from .executor import ExecutionSpec @@ -38,7 +39,7 @@ def run_entries_to_run_specs( for run_spec in construct_run_specs(parse_object_spec(entry.description)): # Filter by models - if models_to_run and run_spec.adapter_spec.model not in models_to_run: + if models_to_run and run_spec.adapter_spec.model_deployment not in models_to_run: continue # Filter by groups @@ -277,6 +278,8 @@ def main(): [RunEntry(description=description, priority=1, groups=None) for description in args.run_specs] ) + register_helm_configurations() + run_specs = run_entries_to_run_specs( run_entries=run_entries, max_eval_instances=args.max_eval_instances, diff --git a/src/helm/benchmark/run_expander.py b/src/helm/benchmark/run_expander.py index a664b2893ca..d7ada38f0ab 100644 --- a/src/helm/benchmark/run_expander.py +++ b/src/helm/benchmark/run_expander.py @@ -3,7 +3,7 @@ from dataclasses import replace from typing import Any, List, Dict, Optional, Tuple, Type -from helm.proxy.models import ( +from helm.benchmark.model_metadata_registry import ( get_all_instruction_following_models, get_all_code_models, get_all_models, @@ -11,16 +11,10 @@ get_model_names_with_tag, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - GPT2_TOKENIZER_TAG, - AI21_TOKENIZER_TAG, - COHERE_TOKENIZER_TAG, - OPT_TOKENIZER_TAG, - GPTJ_TOKENIZER_TAG, - GPTNEO_TOKENIZER_TAG, - GPT4_TOKENIZER_TAG, ABLATION_MODEL_TAG, VISION_LANGUAGE_MODEL_TAG, ) +from helm.benchmark.model_deployment_registry import get_model_names_with_tokenizer from .runner import RunSpec from helm.benchmark.adaptation.adapter_spec import AdapterSpec, Substitution from .augmentations.perturbation import PerturbationSpec @@ -355,10 +349,6 @@ def values_dict(self): "code": get_all_code_models(), "instruction_following": get_all_instruction_following_models(), "limited_functionality_text": get_model_names_with_tag(LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG), - "gpt2_tokenizer": get_model_names_with_tag(GPT2_TOKENIZER_TAG), - "ai21_tokenizer": get_model_names_with_tag(AI21_TOKENIZER_TAG), - "cohere_tokenizer": get_model_names_with_tag(COHERE_TOKENIZER_TAG), - "opt_tokenizer": get_model_names_with_tag(OPT_TOKENIZER_TAG), "summarization_zs": ["openai/davinci", "openai/curie", "openai/text-davinci-002", "openai/text-curie-001"], "biomedical": ["openai/text-davinci-003"], # TODO: add https://huggingface.co/stanford-crfm/BioMedLM "interactive_qa": ["openai/text-davinci-001", "openai/davinci", "ai21/j1-jumbo", "openai/text-babbage-001"], @@ -388,6 +378,13 @@ def values_dict(self): return values_dict +class ModelDeploymentRunExpander(ReplaceValueRunExpander): + """For overriding model deployment""" + + name = "model_deployment" + values_dict: Dict[str, List[Any]] = {} + + ############################################################ @@ -880,18 +877,18 @@ class TokenizerRunExpander(ScenarioSpecRunExpander): "huggingface/santacoder": ["bigcode/santacoder"], "huggingface/starcoder": ["bigcode/starcoder"], } - model_tags_and_tokenizers = [ - (GPT2_TOKENIZER_TAG, "huggingface/gpt2"), - (AI21_TOKENIZER_TAG, "ai21/j1"), - (COHERE_TOKENIZER_TAG, "cohere/cohere"), - (OPT_TOKENIZER_TAG, "meta/opt"), - (GPTJ_TOKENIZER_TAG, "eleutherai/gptj"), - (GPT4_TOKENIZER_TAG, "openai/cl100k_base"), - (GPTNEO_TOKENIZER_TAG, "eleutherai/gptneox"), + list_tokenizers = [ + "huggingface/gpt2", + "ai21/j1", + "cohere/cohere", + "meta/opt", + "eleutherai/gptj", + "openai/cl100k_base", + "eleutherai/gptneox", ] - for model_tag, tokenizer in model_tags_and_tokenizers: - for model in 
get_model_names_with_tag(model_tag): - model_to_tokenizer_mapping[model] = [tokenizer] + for tokenizer_name in list_tokenizers: + for model in get_model_names_with_tokenizer(tokenizer_name): + model_to_tokenizer_mapping[model] = [tokenizer_name] # tokenizer=default will map to using the right tokenizer for a given model. values_dict = {"default": model_to_tokenizer_mapping} @@ -907,10 +904,10 @@ def __init__(self, value): self.all_values = [value] def expand(self, run_spec: RunSpec) -> List[RunSpec]: - # Find right tokenizer given model. + # Find right tokenizer given model deployment name. if isinstance(self.all_values, dict): - model: str = run_spec.adapter_spec.model - self.values = self.all_values[model] if model in self.all_values else [] + deployment: str = run_spec.adapter_spec.model_deployment + self.values = self.all_values[deployment] if deployment in self.all_values else [] else: self.values = self.all_values return super().expand(run_spec) @@ -1114,6 +1111,7 @@ def expand(self, run_spec: RunSpec) -> List[RunSpec]: MaxEvalInstancesRunExpander, NumOutputsRunExpander, ModelRunExpander, + ModelDeploymentRunExpander, DataAugmentationRunExpander, TokenizerRunExpander, NumPromptTokensRunExpander, diff --git a/src/helm/benchmark/run_specs.py b/src/helm/benchmark/run_specs.py index 8657b590ef7..c143ab2fd11 100644 --- a/src/helm/benchmark/run_specs.py +++ b/src/helm/benchmark/run_specs.py @@ -1,8 +1,10 @@ +import dataclasses import importlib import itertools from functools import partial from typing import Any, Callable, List, Dict, Optional, Set, TypeVar +from helm.benchmark.model_deployment_registry import ALL_MODEL_DEPLOYMENTS, DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT from helm.common.hierarchical_logger import hlog, htrack from helm.common.object_spec import ObjectSpec from helm.benchmark.adaptation.adapters.adapter_factory import ( @@ -47,10 +49,15 @@ TaskType, get_lextreme_task_type, ) -from helm.proxy.models import ( +from helm.benchmark.model_deployment_registry import ( + ModelDeployment, + get_model_deployment, +) +from helm.benchmark.model_metadata_registry import ( + ModelMetadata, + get_model_metadata, ANTHROPIC_CLAUDE_1_MODEL_TAG, ANTHROPIC_CLAUDE_2_MODEL_TAG, - get_model, NO_NEWLINES_TAG, NLG_PREFIX_TAG, CHATML_MODEL_TAG, @@ -432,6 +439,7 @@ def get_adapter_spec1() -> AdapterSpec: num_outputs=3, num_train_trials=3, model="simple/model1", + model_deployment="simple/model1", temperature=1, stop_sequences=["."], ) @@ -1885,7 +1893,6 @@ def get_metric_specs(big_bench_metrics: List[str]) -> List[MetricSpec]: # "metrics" is a required field. The default values were populated using the link above. adapter_spec = AdapterSpec( method=get_adaptation_method(big_bench_task["metrics"]), - model="openai/text-curie-001", # Can override with the `ModelRunExpander`. max_train_instances=5, # Can override with the `MaxTrainInstancesRunExpander`. num_outputs=1, # Can override with the `NumOutputsRunExpander`. # From "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models", @@ -2529,6 +2536,80 @@ def get_cleva_spec(task: str, version: str, subtask: Optional[str] = None, promp ############################################################ +def get_default_model_deployment_for_model( + model_name: str, warn_arg_deprecated: bool = False, ignore_deprecated: bool = False +) -> Optional[str]: + """Returns a valid model deployment name corresponding to the given model arg. 
+ This is used as a backwards compatibility layer for model names that are now moved to model deployments. + Example: "anthropic/claude-v1.3" => "anthropic/claude-v1.3" + Example: "meta/llama-7b" => "together/llama-7b" + + The process to find a model deployment name is as follows: + 1. If there is a model deployment with the same name as the model arg, use it. + 2. If there is at least one deployment for the model, use the first one that is available. + 3. If there are no deployments for the model, returns None. + + This function will also try to find a model deployment name that is not deprecated. + If there are no non-deprecated deployments, it will return the first deployment (even if it's deprecated). + If ignore_deprecated is True, this function will return None if the model deployment is deprecated. + + If warn_arg_deprecated is True, this function will print a warning if the model deployment name is not the same + as the model arg. This is to remind the user that the model name is deprecated and should be replaced with + the model deployment name (in their config). + + Args: + model_arg: The model arg to convert to a model deployment name. + warn_arg_deprecated: Whether to print a warning if the model deployment name is not the same as the model arg. + ignore_deprecated: Whether to return None if the model deployment is deprecated. + """ + + # If there is a model deployment with the same name as the model arg, use it. + if model_name in DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT: + deployment: ModelDeployment = DEPLOYMENT_NAME_TO_MODEL_DEPLOYMENT[model_name] + if deployment.deprecated and ignore_deprecated: + if warn_arg_deprecated: + hlog(f"WARNING: Model deployment {model_name} is deprecated") + return None + return deployment.name + + # If there is at least one deployment for the model, use the first one that is available. + available_deployments: List[ModelDeployment] = [ + deployment for deployment in ALL_MODEL_DEPLOYMENTS if deployment.model_name == model_name + ] + if len(available_deployments) > 0: + available_deployment_names: List[str] = [deployment.name for deployment in available_deployments] + if warn_arg_deprecated: + hlog("WARNING: Model name is deprecated. Please use the model deployment name instead.") + hlog(f"Available model deployments for model {model_name}: {available_deployment_names}") + + # Additionally, if there is a non-deprecated deployment, use it. + non_deprecated_deployments: List[ModelDeployment] = [ + deployment for deployment in available_deployments if not deployment.deprecated + ] + if len(non_deprecated_deployments) > 0: + chosen_deployment = non_deprecated_deployments[0] + # There are no non-deprecated deployments, so there are two options: + # 1. If we can return an empty string, return it. (no model deployment is available) + # 2. If we can't return an empty string, return the first deployment (even if it's deprecated). + elif ignore_deprecated: + return None + else: + chosen_deployment = available_deployments[0] + if warn_arg_deprecated: + hlog(f"WARNING: All model deployments for model {model_name} are deprecated.") + if warn_arg_deprecated: + hlog( + f"Choosing {chosen_deployment.name} (the first one) as " + f"the default model deployment for model {model_name}" + ) + hlog("If you want to use a different model deployment, please specify it explicitly.") + return chosen_deployment.name + + # Some models are added but have no deployments yet. + # In this case, we return None. 
+ return None + + def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]: """ Takes a specification (name, args) and returns a list of `RunSpec`s. @@ -2554,13 +2635,41 @@ def construct_run_specs(spec: ObjectSpec) -> List[RunSpec]: ] def alter_run_spec(run_spec: RunSpec) -> RunSpec: - try: - model = get_model(run_spec.adapter_spec.model) - except ValueError: - # Models registered from configs cannot have expanders applied to them, - # because the models will not have been registered yet at this point. - # TODO: Figure out a cleaner way to deal with this. - return run_spec + if not run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment: + raise ValueError("At least one of model_deployment and model must be specified") + elif not run_spec.adapter_spec.model and run_spec.adapter_spec.model_deployment: + # Infer model from model deployment + default_model_name = get_model_deployment(run_spec.adapter_spec.model_deployment).model_name + if not default_model_name: + default_model_name = run_spec.adapter_spec.model_deployment + run_spec = dataclasses.replace( + run_spec, + adapter_spec=dataclasses.replace(run_spec.adapter_spec, model=default_model_name), + ) + elif run_spec.adapter_spec.model and not run_spec.adapter_spec.model_deployment: + # Infer model deployment from model + default_model_deployment = get_default_model_deployment_for_model(run_spec.adapter_spec.model) + if not default_model_deployment: + raise ValueError( + f"Unknown model or no default model deployment found for model {run_spec.adapter_spec.model}" + ) + run_spec = dataclasses.replace( + run_spec, + adapter_spec=dataclasses.replace(run_spec.adapter_spec, model_deployment=default_model_deployment), + ) + + # Both model and model_deployment should now be filled + assert run_spec.adapter_spec.model_deployment + assert run_spec.adapter_spec.model + + model: ModelMetadata = get_model_metadata(run_spec.adapter_spec.model) + deployment: ModelDeployment = get_model_deployment(run_spec.adapter_spec.model_deployment) + if run_spec.adapter_spec.model != deployment.model_name: + raise ValueError( + f"Invalid RunSpec: selected model deployment '{run_spec.adapter_spec.model_deployment}'" + f"for model '{run_spec.adapter_spec.model}' but the model deployment is " + f"for a different model '{deployment.model_name}'" + ) # For models that strip newlines, when we're generating, we need to set # the delimiter to be '###' so we stop properly. if NO_NEWLINES_TAG in model.tags and run_spec.adapter_spec.method in ( diff --git a/src/helm/benchmark/scenarios/numeracy_scenario.py b/src/helm/benchmark/scenarios/numeracy_scenario.py index 20d697fefa9..7205ae4bdf1 100644 --- a/src/helm/benchmark/scenarios/numeracy_scenario.py +++ b/src/helm/benchmark/scenarios/numeracy_scenario.py @@ -544,7 +544,7 @@ def get_numeracy_adapter_spec( "max_eval_instances": max_eval_instances, "num_outputs": 1, "num_train_trials": 1, - "model": "openai/davinci", + "model_deployment": "openai/davinci", "temperature": 0, "stop_sequences": ["\n"], "max_tokens": 20, diff --git a/src/helm/benchmark/static/schema.yaml b/src/helm/benchmark/static/schema.yaml index 273c1d90416..7b7ab81f4f0 100644 --- a/src/helm/benchmark/static/schema.yaml +++ b/src/helm/benchmark/static/schema.yaml @@ -990,7 +990,9 @@ adapter: - name: sample_train description: If true, randomly sample N training examples; if false, select N consecutive training examples - name: model - description: Name of the language model (/) to send requests to. + description: DEPRECATED. 
Name of the language model (/) to send requests to. + - name: model_deployment + description: Name of the language model (/) to send requests to. - name: temperature description: Temperature parameter used in generation. - name: max_tokens @@ -1952,6 +1954,7 @@ run_groups: - synthetic_efficiency adapter_keys_shown: - model + - model_deployment - max_tokens - name: calibration @@ -1983,6 +1986,7 @@ run_groups: - civil_comments adapter_keys_shown: - model + - model_deployment - max_train_instances subgroup_metric_groups_hidden: - robustness @@ -2004,6 +2008,7 @@ run_groups: - bbq adapter_keys_shown: - model + - model_deployment - method - name: ablation_prompts @@ -2018,6 +2023,7 @@ run_groups: - civil_comments adapter_keys_shown: - model + - model_deployment - instructions - input_prefix - input_suffix @@ -3023,6 +3029,7 @@ run_groups: main_split: test adapter_keys_shown: - model + - model_deployment - max_tokens taxonomy: task: "?" diff --git a/src/helm/benchmark/test_model_properties.py b/src/helm/benchmark/test_model_properties.py index 237d027fe61..a1fa3577f3e 100644 --- a/src/helm/benchmark/test_model_properties.py +++ b/src/helm/benchmark/test_model_properties.py @@ -2,15 +2,22 @@ Delete this after the refactor is done.""" +import pytest from tempfile import TemporaryDirectory from typing import Any -from helm.benchmark.model_deployment_registry import ClientSpec, ModelDeployment, WindowServiceSpec +from helm.benchmark.config_registry import register_helm_configurations +from helm.benchmark.model_deployment_registry import ( + ClientSpec, + ModelDeployment, + WindowServiceSpec, + ALL_MODEL_DEPLOYMENTS, +) +from helm.benchmark.model_metadata_registry import ModelMetadata from helm.benchmark.tokenizer_config_registry import TokenizerConfig, TokenizerSpec from helm.benchmark.window_services.test_utils import get_tokenizer_service from helm.benchmark.window_services.window_service_factory import WindowServiceFactory from helm.proxy.clients.auto_client import AutoClient -from helm.proxy.models import ALL_MODELS from collections import defaultdict @@ -204,7 +211,7 @@ client_spec=ClientSpec(class_name="helm.proxy.clients.http_model_client.HTTPModelClient"), tokenizer_name="neurips/local", window_service_spec=WindowServiceSpec( - class_name="helm.benchmark.window_services.http_model_window_service.HTTPModelWindowServce" + class_name="helm.benchmark.window_services.http_model_window_service.HTTPModelWindowService" ), max_sequence_length=2048, ), @@ -301,7 +308,7 @@ ), ModelDeployment( name="anthropic/stanford-online-all-v4-s3", - client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicClient"), + client_spec=ClientSpec(class_name="helm.proxy.clients.anthropic_client.AnthropicLegacyClient"), tokenizer_name="huggingface/gpt2", window_service_spec=WindowServiceSpec( class_name="helm.benchmark.window_services.anthropic_window_service.LegacyAnthropicWindowService" @@ -437,6 +444,26 @@ max_sequence_length=2019, max_request_length=2020, ), + ModelDeployment( + name="cohere/command", + client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"), + tokenizer_name="cohere/cohere", + window_service_spec=WindowServiceSpec( + class_name="helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService" + ), + max_sequence_length=2019, + max_request_length=2020, + ), + ModelDeployment( + name="cohere/command-light", + client_spec=ClientSpec(class_name="helm.proxy.clients.cohere_client.CohereClient"), + tokenizer_name="cohere/cohere", + 
window_service_spec=WindowServiceSpec( + class_name="helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService" + ), + max_sequence_length=2019, + max_request_length=2020, + ), ModelDeployment( name="together/gpt-j-6b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), @@ -458,7 +485,7 @@ max_request_length=2049, ), ModelDeployment( - name="eleutherai/pythia-1b-v0", + name="together/pythia-1b-v0", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -468,7 +495,7 @@ max_request_length=2049, ), ModelDeployment( - name="eleutherai/pythia-2.8b-v0", + name="together/pythia-2.8b-v0", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -478,7 +505,7 @@ max_request_length=2049, ), ModelDeployment( - name="eleutherai/pythia-6.9b", + name="together/pythia-6.9b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -488,7 +515,7 @@ max_request_length=2049, ), ModelDeployment( - name="eleutherai/pythia-12b-v0", + name="together/pythia-12b-v0", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -498,7 +525,7 @@ max_request_length=2049, ), ModelDeployment( - name="meta/llama-7b", + name="together/llama-7b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="hf-internal-testing/llama-tokenizer", window_service_spec=WindowServiceSpec( @@ -507,7 +534,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="meta/llama-13b", + name="together/llama-13b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="hf-internal-testing/llama-tokenizer", window_service_spec=WindowServiceSpec( @@ -516,7 +543,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="meta/llama-30b", + name="together/llama-30b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="hf-internal-testing/llama-tokenizer", window_service_spec=WindowServiceSpec( @@ -525,7 +552,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="meta/llama-65b", + name="together/llama-65b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="hf-internal-testing/llama-tokenizer", window_service_spec=WindowServiceSpec( @@ -534,7 +561,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="meta/llama-2-7b", + name="together/llama-2-7b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="meta-llama/Llama-2-7b-hf", window_service_spec=WindowServiceSpec( @@ -543,7 +570,7 @@ max_sequence_length=4096, ), ModelDeployment( - name="meta/llama-2-13b", + name="together/llama-2-13b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="meta-llama/Llama-2-7b-hf", window_service_spec=WindowServiceSpec( @@ -552,7 +579,7 @@ max_sequence_length=4096, ), ModelDeployment( - name="meta/llama-2-70b", + name="together/llama-2-70b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), 
tokenizer_name="meta-llama/Llama-2-7b-hf", window_service_spec=WindowServiceSpec( @@ -561,7 +588,7 @@ max_sequence_length=4096, ), ModelDeployment( - name="stanford/alpaca-7b", + name="together/alpaca-7b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="hf-internal-testing/llama-tokenizer", window_service_spec=WindowServiceSpec( @@ -570,7 +597,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="lmsys/vicuna-7b-v1.3", + name="together/vicuna-7b-v1.3", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="hf-internal-testing/llama-tokenizer", window_service_spec=WindowServiceSpec( @@ -579,7 +606,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="lmsys/vicuna-13b-v1.3", + name="together/vicuna-13b-v1.3", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="hf-internal-testing/llama-tokenizer", window_service_spec=WindowServiceSpec( @@ -588,7 +615,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="mistralai/mistral-7b-v0.1", + name="together/mistral-7b-v0.1", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="mistralai/Mistral-7B-v0.1", window_service_spec=WindowServiceSpec( @@ -597,7 +624,7 @@ max_sequence_length=4095, ), ModelDeployment( - name="mosaicml/mpt-7b", + name="together/mpt-7b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -607,7 +634,7 @@ max_request_length=2049, ), ModelDeployment( - name="mosaicml/mpt-instruct-7b", + name="together/mpt-instruct-7b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -617,7 +644,7 @@ max_request_length=2049, ), ModelDeployment( - name="mosaicml/mpt-30b", + name="together/mpt-30b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -627,7 +654,7 @@ max_request_length=2049, ), ModelDeployment( - name="mosaicml/mpt-instruct-30b", + name="together/mpt-instruct-30b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -637,7 +664,7 @@ max_request_length=2049, ), ModelDeployment( - name="tiiuae/falcon-7b", + name="together/falcon-7b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="tiiuae/falcon-7b", window_service_spec=WindowServiceSpec( @@ -646,7 +673,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="tiiuae/falcon-7b-instruct", + name="together/falcon-7b-instruct", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="tiiuae/falcon-7b", window_service_spec=WindowServiceSpec( @@ -655,7 +682,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="tiiuae/falcon-40b", + name="together/falcon-40b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="tiiuae/falcon-7b", window_service_spec=WindowServiceSpec( @@ -664,7 +691,7 @@ max_sequence_length=2048, ), ModelDeployment( - name="tiiuae/falcon-40b-instruct", + name="together/falcon-40b-instruct", 
client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="tiiuae/falcon-7b", window_service_spec=WindowServiceSpec( @@ -1237,7 +1264,7 @@ max_sequence_length=1024, ), ModelDeployment( - name="databricks/dolly-v2-3b", + name="together/dolly-v2-3b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -1247,7 +1274,7 @@ max_request_length=2049, ), ModelDeployment( - name="databricks/dolly-v2-7b", + name="together/dolly-v2-7b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -1257,7 +1284,7 @@ max_request_length=2049, ), ModelDeployment( - name="databricks/dolly-v2-12b", + name="together/dolly-v2-12b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -1267,7 +1294,7 @@ max_request_length=2049, ), ModelDeployment( - name="stabilityai/stablelm-base-alpha-3b", + name="together/stablelm-base-alpha-3b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -1277,7 +1304,7 @@ max_request_length=4097, ), ModelDeployment( - name="stabilityai/stablelm-base-alpha-7b", + name="together/stablelm-base-alpha-7b", client_spec=ClientSpec(class_name="helm.proxy.clients.together_client.TogetherClient"), tokenizer_name="EleutherAI/gpt-neox-20b", window_service_spec=WindowServiceSpec( @@ -1292,7 +1319,7 @@ model_name=None, tokenizer_name="lightningai/lit-gpt", window_service_spec=WindowServiceSpec( - class_name="helm.benchmark.window_services.lit_gpt_window_service.LitGPTWindowServce", args={} + class_name="helm.benchmark.window_services.lit_gpt_window_service.LitGPTWindowService", args={} ), max_sequence_length=2048, max_request_length=None, @@ -1354,23 +1381,37 @@ def _full_class_name(obj: Any) -> str: return f"{obj.__class__.__module__}.{obj.__class__.__name__}" -def test_all_models_have_window_services(): - auto_client = AutoClient(defaultdict(str), "", "") - model_deployments = {model_deployment.name: model_deployment for model_deployment in _BUILT_IN_MODEL_DEPLOYMENTS} - tokenizer_configs = {tokenizer_config.name: tokenizer_config for tokenizer_config in _BUILT_IN_TOKENIZER_CONFIGS} - with TemporaryDirectory() as tmpdir: - tokenizer_service = get_tokenizer_service(tmpdir) - for model in ALL_MODELS: +# HACK: This looks like it should be done in a setup_class() +# for the test below but apparently pytest first check the parametrize +# before running the setup_class(). +# Therefore ALL_MODEL_DEPLOYMENTS is empty and no test would be run, +# so we need to do this here. 
+register_helm_configurations() + + +class TestModelProperties: + @pytest.mark.parametrize("model", ALL_MODEL_DEPLOYMENTS) + def test_models_has_window_service(self, model: ModelMetadata): + auto_client = AutoClient(defaultdict(str), "", "") + model_deployments = { + model_deployment.name: model_deployment for model_deployment in _BUILT_IN_MODEL_DEPLOYMENTS + } + tokenizer_configs = { + tokenizer_config.name: tokenizer_config for tokenizer_config in _BUILT_IN_TOKENIZER_CONFIGS + } + with TemporaryDirectory() as tmpdir: + tokenizer_service = get_tokenizer_service(tmpdir) # Can't test lit-gpt client because it requires manual dependencies if "lit-gpt" in model.name: - continue + return # Can't test Llama 2 because it requires Hugging Face credentials if "llama-2-" in model.name: - continue + return - client = auto_client._get_client(model.name) - window_service = WindowServiceFactory.get_window_service(model.name, tokenizer_service) + deployment_name: str = model.name + client = auto_client._get_client(deployment_name) + window_service = WindowServiceFactory.get_window_service(deployment_name, tokenizer_service) tokenizer_name = window_service.tokenizer_name tokenizer = auto_client._get_tokenizer(tokenizer_name) @@ -1417,3 +1458,6 @@ def test_all_models_have_window_services(): # TODO: Give PalmyraWindowService's tokenizer a different name e.g. writer/palmyra if tokenizer_name != "huggingface/gpt2": assert tokenizer_configs[tokenizer_name] == tokenizer_config + + def test_num_models_available(self): + assert len(ALL_MODEL_DEPLOYMENTS) == 119 diff --git a/src/helm/benchmark/tokenizer_config_registry.py b/src/helm/benchmark/tokenizer_config_registry.py index 10f349a619f..732cd38bd1e 100644 --- a/src/helm/benchmark/tokenizer_config_registry.py +++ b/src/helm/benchmark/tokenizer_config_registry.py @@ -1,15 +1,18 @@ import os from typing import Dict, Optional, List from dataclasses import dataclass +import importlib_resources as resources import cattrs import yaml from helm.common.hierarchical_logger import hlog from helm.common.object_spec import ObjectSpec +from helm.benchmark.model_metadata_registry import CONFIG_PACKAGE -TOKENIEZR_CONFIGS_FILE = "tokenizer_configs.yaml" +TOKENIZER_CONFIGS_FILE: str = "tokenizer_configs.yaml" +TOKENIZERS_REGISTERED: bool = False class TokenizerSpec(ObjectSpec): @@ -38,11 +41,13 @@ class TokenizerConfigs: tokenizer_configs: List[TokenizerConfig] -_name_to_tokenizer_config: Dict[str, TokenizerConfig] = {} +ALL_TOKENIZER_CONFIGS: List[TokenizerConfig] = [] +TOKENIZER_NAME_TO_CONFIG: Dict[str, TokenizerConfig] = {config.name: config for config in ALL_TOKENIZER_CONFIGS} def register_tokenizer_config(tokenizer_config: TokenizerConfig) -> None: - _name_to_tokenizer_config[tokenizer_config.name] = tokenizer_config + ALL_TOKENIZER_CONFIGS.append(tokenizer_config) + TOKENIZER_NAME_TO_CONFIG[tokenizer_config.name] = tokenizer_config def register_tokenizer_configs_from_path(path: str) -> None: @@ -54,11 +59,20 @@ def register_tokenizer_configs_from_path(path: str) -> None: register_tokenizer_config(tokenizer_config) -def maybe_register_tokenizer_configs_from_base_path(base_path: str) -> None: - path = os.path.join(base_path, TOKENIEZR_CONFIGS_FILE) +def maybe_register_tokenizer_configs_from_base_path(path: str) -> None: + """Register tokenizer configs from yaml file if the path exists.""" if os.path.exists(path): register_tokenizer_configs_from_path(path) def get_tokenizer_config(name: str) -> Optional[TokenizerConfig]: - return _name_to_tokenizer_config.get(name) + 
register_tokenizers_if_not_already_registered() + return TOKENIZER_NAME_TO_CONFIG.get(name) + + +def register_tokenizers_if_not_already_registered() -> None: + global TOKENIZERS_REGISTERED + if not TOKENIZERS_REGISTERED: + path: str = resources.files(CONFIG_PACKAGE).joinpath(TOKENIZER_CONFIGS_FILE) + maybe_register_tokenizer_configs_from_base_path(path) + TOKENIZERS_REGISTERED = True diff --git a/src/helm/benchmark/window_services/http_model_window_service.py b/src/helm/benchmark/window_services/http_model_window_service.py index dac3bb70fbb..d84308b370a 100644 --- a/src/helm/benchmark/window_services/http_model_window_service.py +++ b/src/helm/benchmark/window_services/http_model_window_service.py @@ -3,7 +3,7 @@ # TODO: Remove Once we have configurable model names since this hardcodes the tokenizer name -class HTTPModelWindowServce(LocalWindowService): +class HTTPModelWindowService(LocalWindowService): def __init__(self, service: TokenizerService): super().__init__(service) diff --git a/src/helm/benchmark/window_services/lit_gpt_window_service.py b/src/helm/benchmark/window_services/lit_gpt_window_service.py index 5deddd6a004..4d670a38e68 100644 --- a/src/helm/benchmark/window_services/lit_gpt_window_service.py +++ b/src/helm/benchmark/window_services/lit_gpt_window_service.py @@ -2,7 +2,7 @@ from .tokenizer_service import TokenizerService -class LitGPTWindowServce(LocalWindowService): +class LitGPTWindowService(LocalWindowService): def __init__(self, service: TokenizerService): super().__init__(service) diff --git a/src/helm/benchmark/window_services/window_service_factory.py b/src/helm/benchmark/window_services/window_service_factory.py index d720e8e42b5..e15bf720167 100644 --- a/src/helm/benchmark/window_services/window_service_factory.py +++ b/src/helm/benchmark/window_services/window_service_factory.py @@ -1,22 +1,7 @@ from typing import Optional -from helm.benchmark.model_deployment_registry import WindowServiceSpec, get_model_deployment -from helm.proxy.models import ( - get_model, - get_model_names_with_tag, - Model, - AI21_WIDER_CONTEXT_WINDOW_TAG, - AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG, - WIDER_CONTEXT_WINDOW_TAG, - GPT_TURBO_CONTEXT_WINDOW_TAG, - GPT_TURBO_16K_CONTEXT_WINDOW_TAG, - GPT4_CONTEXT_WINDOW_TAG, - GPT4_32K_CONTEXT_WINDOW_TAG, -) - -from helm.benchmark.tokenizer_config_registry import get_tokenizer_config, TokenizerConfig -from helm.benchmark.window_services.huggingface_window_service import HuggingFaceWindowService -from helm.benchmark.window_services.gpt2_window_service import GPT2WindowService +from helm.benchmark.model_deployment_registry import ModelDeployment, WindowServiceSpec, get_model_deployment +from helm.benchmark.tokenizer_config_registry import TokenizerConfig, get_tokenizer_config from helm.benchmark.window_services.window_service import WindowService from helm.benchmark.window_services.tokenizer_service import TokenizerService from helm.common.object_spec import create_object, inject_object_spec_args @@ -24,19 +9,12 @@ class WindowServiceFactory: @staticmethod - def get_window_service(model_name: str, service: TokenizerService) -> WindowService: + def get_window_service(model_deployment_name: str, service: TokenizerService) -> WindowService: """ Returns a `WindowService` given the name of the model. Make sure this function returns instantaneously on repeated calls. 
""" - model: Model = get_model(model_name) - organization: str = model.organization - engine: str = model.engine - - window_service: WindowService - - # TODO: Migrate all window services to use use model deployments - model_deployment = get_model_deployment(model_name) + model_deployment: Optional[ModelDeployment] = get_model_deployment(model_deployment_name) if model_deployment: # If the model deployment specifies a WindowServiceSpec, instantiate it. window_service_spec: WindowServiceSpec @@ -74,246 +52,6 @@ def get_window_service(model_name: str, service: TokenizerService) -> WindowServ "prefix_token": prefix_token, }, ) - window_service = create_object(window_service_spec) - elif organization == "neurips": - from helm.benchmark.window_services.http_model_window_service import HTTPModelWindowServce - - window_service = HTTPModelWindowServce(service) - elif organization == "openai": - from helm.benchmark.window_services.openai_window_service import OpenAIWindowService - from helm.benchmark.window_services.wider_openai_window_service import ( - WiderOpenAIWindowService, - GPTTurboWindowService, - GPTTurbo16KWindowService, - GPT4WindowService, - GPT432KWindowService, - ) - - if model_name in get_model_names_with_tag(GPT4_CONTEXT_WINDOW_TAG): - window_service = GPT4WindowService(service) - elif model_name in get_model_names_with_tag(GPT4_32K_CONTEXT_WINDOW_TAG): - window_service = GPT432KWindowService(service) - elif model_name in get_model_names_with_tag(GPT_TURBO_CONTEXT_WINDOW_TAG): - window_service = GPTTurboWindowService(service) - elif model_name in get_model_names_with_tag(GPT_TURBO_16K_CONTEXT_WINDOW_TAG): - window_service = GPTTurbo16KWindowService(service) - elif model_name in get_model_names_with_tag(WIDER_CONTEXT_WINDOW_TAG): - window_service = WiderOpenAIWindowService(service) - else: - window_service = OpenAIWindowService(service) - # For the Google models, we approximate with the OpenAIWindowService - elif organization == "simple" or organization == "google": - from helm.benchmark.window_services.openai_window_service import OpenAIWindowService - - window_service = OpenAIWindowService(service) - elif organization == "AlephAlpha": - from helm.benchmark.window_services.luminous_window_service import ( - LuminousBaseWindowService, - LuminousExtendedWindowService, - LuminousSupremeWindowService, - LuminousWorldWindowService, - ) - - if engine == "luminous-base": - window_service = LuminousBaseWindowService(service) - elif engine == "luminous-extended": - window_service = LuminousExtendedWindowService(service) - elif engine == "luminous-supreme": - window_service = LuminousSupremeWindowService(service) - elif engine == "luminous-world": - window_service = LuminousWorldWindowService(service) - else: - raise ValueError(f"Unhandled Aleph Alpha model: {engine}") - elif organization == "microsoft": - from helm.benchmark.window_services.mt_nlg_window_service import MTNLGWindowService - - window_service = MTNLGWindowService(service) - elif organization == "anthropic": - from helm.benchmark.window_services.anthropic_window_service import ( - AnthropicWindowService, - LegacyAnthropicWindowService, - ) - - if engine == "stanford-online-all-v4-s3": - window_service = LegacyAnthropicWindowService(service) - else: - window_service = AnthropicWindowService(service) - elif organization == "writer": - from helm.benchmark.window_services.palmyra_window_service import ( - PalmyraWindowService, - LongerPalmyraWindowService, - ) - - if engine in ["palmyra-base", "palmyra-large", "palmyra-instruct-30", 
"palmyra-e"]: - window_service = PalmyraWindowService(service) - elif engine in ["palmyra-x", "silk-road"]: - window_service = LongerPalmyraWindowService(service) - else: - raise ValueError(f"Unhandled Writer model: {engine}") - elif engine == "santacoder": - from helm.benchmark.window_services.santacoder_window_service import SantaCoderWindowService - - window_service = SantaCoderWindowService(service) - elif engine == "starcoder": - from helm.benchmark.window_services.starcoder_window_service import StarCoderWindowService - - window_service = StarCoderWindowService(service) - elif model_name == "huggingface/gpt2": - window_service = GPT2WindowService(service) - elif model_name == "together/bloom": - from helm.benchmark.window_services.bloom_window_service import BloomWindowService - - window_service = BloomWindowService(service) - elif model_name == "together/glm": - # From https://github.com/THUDM/GLM-130B, "the tokenizer is implemented based on - # icetk---a unified multimodal tokenizer for images, Chinese, and English." - from helm.benchmark.window_services.ice_window_service import ICEWindowService - - window_service = ICEWindowService(service) - elif model_name in ["huggingface/gpt-j-6b", "together/gpt-j-6b", "together/gpt-jt-6b-v1", "gooseai/gpt-j-6b"]: - from helm.benchmark.window_services.gptj_window_service import GPTJWindowService - - window_service = GPTJWindowService(service) - elif model_name in [ - "together/gpt-neox-20b", - "gooseai/gpt-neo-20b", - "together/gpt-neoxt-chat-base-20b", - "together/redpajama-incite-base-3b-v1", - "together/redpajama-incite-instruct-3b-v1", - "together/redpajama-incite-base-7b", - "together/redpajama-incite-instruct-7b", - # Pythia uses the same tokenizer as GPT-NeoX-20B. - # See: https://huggingface.co/EleutherAI/pythia-6.9b#training-procedure - "eleutherai/pythia-1b-v0", - "eleutherai/pythia-2.8b-v0", - "eleutherai/pythia-6.9b", - "eleutherai/pythia-12b-v0", - # MPT-7B model was trained with the EleutherAI/gpt-neox-20b tokenizer - # See: https://huggingface.co/mosaicml/mpt-7b - "mosaicml/mpt-7b", - "mosaicml/mpt-instruct-7b", - "mosaicml/mpt-30b", - "mosaicml/mpt-instruct-30b", - # Dolly models are based on Pythia. 
- # See: https://github.com/databrickslabs/dolly - "databricks/dolly-v2-3b", - "databricks/dolly-v2-7b", - "databricks/dolly-v2-12b", - ]: - from helm.benchmark.window_services.gptneox_window_service import GPTNeoXWindowService - - window_service = GPTNeoXWindowService(service) - elif model_name in [ - "tiiuae/falcon-7b", - "tiiuae/falcon-7b-instruct", - "tiiuae/falcon-40b", - "tiiuae/falcon-40b-instruct", - ]: - window_service = HuggingFaceWindowService(service=service, tokenizer_name="tiiuae/falcon-7b") - elif model_name in [ - "stabilityai/stablelm-base-alpha-3b", - "stabilityai/stablelm-base-alpha-7b", - ]: - from helm.benchmark.window_services.gptneox_window_service import StableLMAlphaWindowService - - window_service = StableLMAlphaWindowService(service) - elif model_name == "together/h3-2.7b": - window_service = GPT2WindowService(service) - elif model_name in [ - "together/opt-1.3b", - "together/opt-6.7b", - "together/opt-66b", - "together/opt-175b", - ]: - from helm.benchmark.window_services.opt_window_service import OPTWindowService - - window_service = OPTWindowService(service) - elif model_name == "together/t0pp": - from helm.benchmark.window_services.t0pp_window_service import T0ppWindowService - - window_service = T0ppWindowService(service) - elif model_name == "together/t5-11b": - from helm.benchmark.window_services.t511b_window_service import T511bWindowService - - window_service = T511bWindowService(service) - elif model_name == "together/flan-t5-xxl": - from helm.benchmark.window_services.flan_t5_window_service import FlanT5WindowService - - window_service = FlanT5WindowService(service) - elif model_name == "together/ul2": - from helm.benchmark.window_services.ul2_window_service import UL2WindowService - - window_service = UL2WindowService(service) - elif model_name == "together/yalm": - from helm.benchmark.window_services.yalm_window_service import YaLMWindowService - - window_service = YaLMWindowService(service) - elif model_name == "nvidia/megatron-gpt2": - from helm.benchmark.window_services.megatron_window_service import MegatronWindowService - - window_service = MegatronWindowService(service) - elif model_name in [ - "lmsys/vicuna-7b-v1.3", - "lmsys/vicuna-13b-v1.3", - "meta/llama-7b", - "meta/llama-13b", - "meta/llama-30b", - "meta/llama-65b", - "stanford/alpaca-7b", - ]: - from helm.benchmark.window_services.llama_window_service import LlamaWindowService - - window_service = LlamaWindowService(service) - elif model_name in [ - "meta/llama-2-7b", - "meta/llama-2-13b", - "meta/llama-2-70b", - ]: - from helm.benchmark.window_services.llama_window_service import Llama2WindowService - - window_service = Llama2WindowService(service) - elif organization == "cohere": - from helm.benchmark.window_services.cohere_window_service import ( - CohereWindowService, - CohereCommandWindowService, - ) - - if "command" in engine: - window_service = CohereCommandWindowService(service) - else: - window_service = CohereWindowService(service) - elif organization == "ai21": - from helm.benchmark.window_services.wider_ai21_window_service import ( - WiderAI21WindowService, - AI21Jurassic2JumboWindowService, - ) - from helm.benchmark.window_services.ai21_window_service import AI21WindowService - - if model_name in get_model_names_with_tag(AI21_WIDER_CONTEXT_WINDOW_TAG): - window_service = WiderAI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service)) - if model_name in get_model_names_with_tag(AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG): - window_service = 
AI21Jurassic2JumboWindowService( - service=service, gpt2_window_service=GPT2WindowService(service) - ) - else: - window_service = AI21WindowService(service=service, gpt2_window_service=GPT2WindowService(service)) - - elif organization == "lightningai": - from helm.benchmark.window_services.lit_gpt_window_service import LitGPTWindowServce - - window_service = LitGPTWindowServce(service) - elif organization == "mistralai": - window_service = HuggingFaceWindowService( - service, tokenizer_name="mistralai/Mistral-7B-v0.1", max_sequence_length=4095 - ) - elif model_name in [ - "HuggingFaceM4/idefics-9b", - "HuggingFaceM4/idefics-9b-instruct", - "HuggingFaceM4/idefics-80b", - "HuggingFaceM4/idefics-80b-instruct", - ]: - window_service = HuggingFaceWindowService(service, model_name) - else: - raise ValueError(f"Unhandled model name: {model_name}") + return create_object(window_service_spec) - return window_service + raise ValueError(f"Unhandled model deployment name: {model_deployment_name}")
diff --git a/src/helm/common/request.py b/src/helm/common/request.py index 6ca89fc0cb6..4acefd3690d 100644 --- a/src/helm/common/request.py +++ b/src/helm/common/request.py @@ -3,7 +3,6 @@ from typing import Any, Callable, Dict, List, Optional from helm.common.media_object import MultimediaObject -from helm.proxy.models import Model, get_model from .general import indent_lines, format_text @@ -15,8 +14,13 @@ class Request: various APIs (e.g., GPT-3, Jurassic). """ - model: str = "openai/text-davinci-002" - """Which model to query""" + model_deployment: str = "" + """Which model deployment to query -> Determines the Client. + Refers to a deployment in the model deployment registry.""" + + model: str = "" + """Which model to use -> Determines the Engine. + Refers to a model's metadata in the model registry.""" embedding: bool = False """Whether to query embedding instead of text response""" @@ -65,16 +69,23 @@ class Request: """Multimodal prompt with media objects interleaved (e.g., text, video, image, text, ...)""" @property - def model_organization(self) -> str: - """Example: 'openai/davinci' => 'openai'""" - model: Model = get_model(self.model) - return model.organization + def model_host(self) -> str: + """Returns the model host (referring to the deployment). + Not to be confused with the model creator organization (referring to the model). + Example: 'openai/davinci' => 'openai' + 'together/bloom' => 'together'""" + return self.model_deployment.split("/")[0] @property def model_engine(self) -> str: - """Example: 'openai/davinci' => 'davinci'""" - model: Model = get_model(self.model) - return model.engine + """Returns the model engine (referring to the model). + This is often the same as self.model_deployment.split("/")[1], but not always. + For example, one model could be served on several servers (each with a different model_deployment). + In that case we would have, for example: + 'aws/bloom-1', 'aws/bloom-2', 'aws/bloom-3' => 'bloom' + This is why we need to keep track of the model engine with the model metadata. + Example: 'openai/davinci' => 'davinci'""" + return self.model.split("/")[1] @dataclass(frozen=True)
diff --git a/src/helm/config/model_deployments.yaml b/src/helm/config/model_deployments.yaml new file mode 100644 index 00000000000..f7699818a82 --- /dev/null +++ b/src/helm/config/model_deployments.yaml @@ -0,0 +1,1567 @@ +# This file defines all the model deployments that are supported by the Helm API. +# Some models have several deployments, each with different parameters.
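+# Each entry maps a deployment name to the model it serves (model_name), the tokenizer
+# to use (tokenizer_name), context-window limits (max_sequence_length and, where given,
+# max_request_length), and the client_spec / window_service_spec that handle requests for it.
+# As a rough illustration only (these names are placeholders, not a real deployment),
+# an entry has the following shape:
+#
+#   - name: example-org/example-model
+#     model_name: example-org/example-model
+#     tokenizer_name: example-org/example-model
+#     max_sequence_length: 2048
+#     client_spec:
+#       class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient"
+#       args: {}
+#     window_service_spec:
+#       class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService"
+#       args: {}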
+ +# If you want to add a new deployment, you can technically do it here but we recommend +# you to do it in private/model_deployments.yaml instead. + +model_deployments: + + - name: simple/model1 + model_name: simple/model1 + tokenizer_name: simple/model1 + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.simple_client.SimpleClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + # AI21 Labs + + # J1 models are Deprecated by AI21 Labs + # API returns: Detail: Jurassic J1 models are deprecated + - name: ai21/j1-jumbo + deprecated: true + model_name: ai21/j1-jumbo + tokenizer_name: ai21/j1 + max_sequence_length: 2047 + client_spec: + class_name: "helm.proxy.clients.ai21_client.AI21Client" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService" + args: + gpt2_window_service: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + - name: ai21/j1-large + deprecated: true + model_name: ai21/j1-large + tokenizer_name: ai21/j1 + max_sequence_length: 2047 + client_spec: + class_name: "helm.proxy.clients.ai21_client.AI21Client" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService" + args: + gpt2_window_service: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + - name: ai21/j1-grande + deprecated: true + model_name: ai21/j1-grande + tokenizer_name: ai21/j1 + max_sequence_length: 2047 + client_spec: + class_name: "helm.proxy.clients.ai21_client.AI21Client" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService" + args: + gpt2_window_service: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + - name: ai21/j1-grande-v2-beta + deprecated: true + model_name: ai21/j1-grande-v2-beta + tokenizer_name: ai21/j1 + max_sequence_length: 2047 + client_spec: + class_name: "helm.proxy.clients.ai21_client.AI21Client" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService" + args: + gpt2_window_service: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + - name: ai21/j2-jumbo + model_name: ai21/j2-jumbo + tokenizer_name: ai21/j1 + max_sequence_length: 6000 + client_spec: + class_name: "helm.proxy.clients.ai21_client.AI21Client" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_ai21_window_service.AI21Jurassic2JumboWindowService" + args: + gpt2_window_service: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + - name: ai21/j2-large + model_name: ai21/j2-large + tokenizer_name: ai21/j1 + max_sequence_length: 2047 + client_spec: + class_name: "helm.proxy.clients.ai21_client.AI21Client" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.ai21_window_service.AI21WindowService" + args: + gpt2_window_service: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + - name: ai21/j2-grande + model_name: ai21/j2-grande + tokenizer_name: ai21/j1 + max_sequence_length: 2047 + client_spec: + class_name: "helm.proxy.clients.ai21_client.AI21Client" + args: {} + window_service_spec: + class_name: 
"helm.benchmark.window_services.ai21_window_service.AI21WindowService" + args: + gpt2_window_service: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + + + # Aleph Alpha + - name: AlephAlpha/luminous-base + model_name: AlephAlpha/luminous-base + tokenizer_name: AlephAlpha/luminous-base + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.luminous_window_service.LuminousBaseWindowService" + args: {} + + - name: AlephAlpha/luminous-extended + model_name: AlephAlpha/luminous-extended + tokenizer_name: AlephAlpha/luminous-extended + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.luminous_window_service.LuminousExtendedWindowService" + args: {} + + - name: AlephAlpha/luminous-supreme + model_name: AlephAlpha/luminous-supreme + tokenizer_name: AlephAlpha/luminous-supreme + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.aleph_alpha_client.AlephAlphaClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.luminous_window_service.LuminousSupremeWindowService" + args: {} + + # TODO: Add luminous-world once it is released. + + + + # Anthropic + - name: anthropic/claude-v1.3 + model_name: anthropic/claude-v1.3 + tokenizer_name: anthropic/claude + max_sequence_length: 8000 + max_sequence_and_generated_tokens_length: 9016 + client_spec: + class_name: "helm.proxy.clients.anthropic_client.AnthropicClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService" + args: {} + + - name: anthropic/claude-instant-v1 + model_name: anthropic/claude-instant-v1 + tokenizer_name: anthropic/claude + max_sequence_length: 8000 + max_sequence_and_generated_tokens_length: 9016 + client_spec: + class_name: "helm.proxy.clients.anthropic_client.AnthropicClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService" + args: {} + + - name: anthropic/claude-2.0 + model_name: anthropic/claude-2.0 + tokenizer_name: anthropic/claude + max_sequence_length: 8000 + max_sequence_and_generated_tokens_length: 9016 + client_spec: + class_name: "helm.proxy.clients.anthropic_client.AnthropicClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.anthropic_window_service.AnthropicWindowService" + args: {} + + - name: anthropic/stanford-online-all-v4-s3 + deprecated: true # Closed model, not accessible via API + model_name: anthropic/stanford-online-all-v4-s3 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 8192 + client_spec: + class_name: "helm.proxy.clients.anthropic_client.AnthropicLegacyClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.anthropic_window_service.LegacyAnthropicWindowService" + args: {} + + + + # Cohere + - name: cohere/xlarge-20220609 + model_name: cohere/xlarge-20220609 + tokenizer_name: cohere/cohere + max_sequence_length: 2047 + max_request_length: 2048 + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService" + args: {} + + - name: cohere/large-20220720 + model_name: 
cohere/large-20220720 + tokenizer_name: cohere/cohere + max_sequence_length: 2047 + max_request_length: 2048 + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService" + args: {} + + - name: cohere/medium-20220720 + model_name: cohere/medium-20220720 + tokenizer_name: cohere/cohere + max_sequence_length: 2047 + max_request_length: 2048 + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService" + args: {} + + - name: cohere/small-20220720 + model_name: cohere/small-20220720 + tokenizer_name: cohere/cohere + max_sequence_length: 2047 + max_request_length: 2048 + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService" + args: {} + + - name: cohere/xlarge-20221108 + model_name: cohere/xlarge-20221108 + tokenizer_name: cohere/cohere + max_sequence_length: 2047 + max_request_length: 2048 + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService" + args: {} + + - name: cohere/medium-20221108 + model_name: cohere/medium-20221108 + tokenizer_name: cohere/cohere + max_sequence_length: 2047 + max_request_length: 2048 + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereWindowService" + args: {} + + - name: cohere/command-medium-beta + model_name: cohere/command-medium-beta + tokenizer_name: cohere/cohere + max_sequence_length: 2019 + max_request_length: 2020 + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService" + args: {} + + - name: cohere/command-xlarge-beta + model_name: cohere/command-xlarge-beta + tokenizer_name: cohere/cohere + max_sequence_length: 2019 + max_request_length: 2020 + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService" + args: {} + + - name: cohere/command + model_name: cohere/command + tokenizer_name: cohere/cohere + max_sequence_length: 2019 # TODO: verify this + max_request_length: 2020 # TODO: verify this + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService" + args: {} + + - name: cohere/command-light + model_name: cohere/command-light + tokenizer_name: cohere/cohere + max_sequence_length: 2019 # TODO: verify this + max_request_length: 2020 # TODO: verify this + client_spec: + class_name: "helm.proxy.clients.cohere_client.CohereClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.cohere_window_service.CohereCommandWindowService" + args: {} + + + + # Gooseai + + ## EleutherAI + - name: gooseai/gpt-neo-20b + model_name: eleutherai/gpt-neox-20b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 
2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.goose_ai_client.GooseAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: gooseai/gpt-j-6b + model_name: eleutherai/gpt-j-6b + tokenizer_name: EleutherAI/gpt-j-6B + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.goose_ai_client.GooseAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService" + args: {} + + + + # HuggingFace + + ## Bigcode + - name: huggingface/santacoder + model_name: bigcode/santacoder + tokenizer_name: bigcode/santacoder + client_spec: + class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.santacoder_window_service.SantaCoderWindowService" + args: {} + + - name: huggingface/starcoder + model_name: bigcode/starcoder + tokenizer_name: bigcode/starcoder + client_spec: + class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.starcoder_window_service.StarCoderWindowService" + args: {} + + ## EleutherAI + - name: huggingface/gpt-j-6b + model_name: eleutherai/gpt-j-6b + tokenizer_name: EleutherAI/gpt-j-6B + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService" + args: {} + + ## OpenAI + - name: huggingface/gpt2 + model_name: openai/gpt2 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 1024 + max_request_length: 1025 + client_spec: + class_name: "helm.proxy.clients.huggingface_client.HuggingFaceClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + + + # HuggingFaceM4 + - name: HuggingFaceM4/idefics-9b + model_name: huggingface/idefics-9b + tokenizer_name: HuggingFaceM4/idefics-9b + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {} + + - name: HuggingFaceM4/idefics-9b-instruct + model_name: huggingface/idefics-9b-instruct + tokenizer_name: HuggingFaceM4/idefics-9b-instruct + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {} + + - name: HuggingFaceM4/idefics-80b + model_name: huggingface/idefics-80b + tokenizer_name: HuggingFaceM4/idefics-80b + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {} + + - name: HuggingFaceM4/idefics-80b-instruct + model_name: huggingface/idefics-80b-instruct + tokenizer_name: HuggingFaceM4/idefics-80b-instruct + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.vision_language.idefics_client.IDEFICSClient" + 
args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {}
+ + + + # Lightning AI + - name: lightningai/lit-gpt + model_name: lightningai/lit-gpt + tokenizer_name: lightningai/lit-gpt + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.lit_gpt_client.LitGPTClient" + args: + checkpoint_dir: "" # Path to the checkpoint directory + precision: bf16-true + window_service_spec: + class_name: "helm.benchmark.window_services.lit_gpt_window_service.LitGPTWindowService" + args: {}
+ + + + # Microsoft + - name: microsoft/TNLGv2_530B + model_name: microsoft/TNLGv2_530B + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2047 + max_request_length: 2048 + client_spec: + class_name: "helm.proxy.clients.microsoft_client.MicrosoftClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService" + args: {}
+ + - name: microsoft/TNLGv2_7B + model_name: microsoft/TNLGv2_7B + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2047 + max_request_length: 2048 + client_spec: + class_name: "helm.proxy.clients.microsoft_client.MicrosoftClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.mt_nlg_window_service.MTNLGWindowService" + args: {}
+ + + + # Neurips + - name: neurips/local + model_name: neurips/local + tokenizer_name: neurips/local + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.http_model_client.HTTPModelClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.http_model_window_service.HTTPModelWindowService" + args: {}
+ + + + # Nvidia + - name: nvidia/megatron-gpt2 + model_name: nvidia/megatron-gpt2 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 1024 + client_spec: + class_name: "helm.proxy.clients.megatron_client.MegatronClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.megatron_window_service.MegatronWindowService" + args: {}
+ + + + # OpenAI + + ## GPT 3 Models + # The list of models can be found here: https://beta.openai.com/docs/engines/gpt-3 + # DEPRECATED: Announced on July 06 2023 that these models will be shut down on January 04 2024.
+ + - name: openai/davinci + deprecated: true + model_name: openai/davinci + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/curie + deprecated: true + model_name: openai/curie + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/babbage + deprecated: true + model_name: openai/babbage + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/ada + deprecated: true + model_name: openai/ada + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/text-davinci-003 + deprecated: true + model_name: openai/text-davinci-003 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 4000 + max_request_length: 4001 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService" + args: {} + + - name: openai/text-davinci-002 + deprecated: true + model_name: openai/text-davinci-002 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 4000 + max_request_length: 4001 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService" + args: {} + + - name: openai/text-davinci-001 + deprecated: true + model_name: openai/text-davinci-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/text-curie-001 + deprecated: true + model_name: openai/text-curie-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/text-babbage-001 + deprecated: true + model_name: openai/text-babbage-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/text-ada-001 + deprecated: true + model_name: 
openai/text-ada-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + + ## GPT 3.5 Turbo Models + # ChatGPT: https://openai.com/blog/chatgpt + + # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable + # sequence length is smaller at 4087 with one user input message and one assistant + # output message because ChatGPT uses special tokens for message roles and boundaries. + # We use a rounded-down sequence length of 4000 to account for these special tokens. + - name: openai/gpt-3.5-turbo-0301 + model_name: openai/gpt-3.5-turbo-0301 + tokenizer_name: openai/cl100k_base + max_sequence_length: 4000 + max_request_length: 4001 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService" + args: {} + + # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable + # sequence length is smaller at 4087 with one user input message and one assistant + # output message because ChatGPT uses special tokens for message roles and boundaries. + # We use a rounded-down sequence length of 4000 to account for these special tokens. + - name: openai/gpt-3.5-turbo-0613 + model_name: openai/gpt-3.5-turbo-0613 + tokenizer_name: openai/cl100k_base + max_sequence_length: 4000 + max_request_length: 4001 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurboWindowService" + args: {} + + # Claimed length is 16,384; we round down to 16,000 for the same reasons as explained + # in the openai/gpt-3.5-turbo-0613 comment + - name: openai/gpt-3.5-turbo-16k-0613 + model_name: openai/gpt-3.5-turbo-16k-0613 + tokenizer_name: openai/cl100k_base + max_sequence_length: 16000 + max_request_length: 16001 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.GPTTurbo16KWindowService" + args: {} + + + ## GPT 4 Models + + - name: openai/gpt-4-0314 + model_name: openai/gpt-4-0314 + tokenizer_name: openai/cl100k_base + max_sequence_length: 8192 + max_request_length: 8193 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService" + args: {} + + - name: openai/gpt-4-32k-0314 + model_name: openai/gpt-4-32k-0314 + tokenizer_name: openai/cl100k_base + max_sequence_length: 32768 + max_request_length: 32769 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService" + args: {} + + - name: openai/gpt-4-0613 + model_name: openai/gpt-4-0613 + tokenizer_name: openai/cl100k_base + max_sequence_length: 8192 + max_request_length: 8193 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT4WindowService" + args: {} + + - name: 
openai/gpt-4-32k-0613 + model_name: openai/gpt-4-32k-0613 + tokenizer_name: openai/cl100k_base + max_sequence_length: 32768 + max_request_length: 32769 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.GPT432KWindowService" + args: {} + + + ## Codex Models + # DEPRECATED: Codex models have been shut down on March 23 2023. + + - name: openai/code-davinci-002 + deprecated: true + model_name: openai/code-davinci-002 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 4000 + max_request_length: 4001 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.wider_openai_window_service.WiderOpenAIWindowService" + args: {} + + - name: openai/code-davinci-001 + deprecated: true + model_name: openai/code-davinci-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/code-cushman-001 + deprecated: true + model_name: openai/code-cushman-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + + ## Text Similarity Models + # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings + # The number of parameters is guessed based on the number of parameters of the + # corresponding GPT-3 model. + # DEPRECATED: Announced on July 06 2023 that first generation embeddings models + # will be shut down on January 04 2024. 
+ + - name: openai/text-similarity-davinci-001 + deprecated: true + model_name: openai/text-similarity-davinci-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/text-similarity-curie-001 + deprecated: true + model_name: openai/text-similarity-curie-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/text-similarity-babbage-001 + deprecated: true + model_name: openai/text-similarity-babbage-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + - name: openai/text-similarity-ada-001 + deprecated: true + model_name: openai/text-similarity-ada-001 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + # As of 2023-11-07, text-embedding-ada-002 is not deprecated: + # "We recommend using text-embedding-ada-002 for nearly all use cases." 
+ # Source: https://platform.openai.com/docs/guides/embeddings/what-are-embeddings + - name: openai/text-embedding-ada-002 + model_name: openai/text-embedding-ada-002 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.openai_client.OpenAIClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.openai_window_service.OpenAIWindowService" + args: {} + + + + # Together + # The list of models served by Together changes often, to check the latest list, visit: + # https://docs.together.ai/docs/inference-models + # You can also check the playground to check that the live models are working: + # https://api.together.xyz/playground + + ## BigScience + - name: together/bloom + deprecated: true # Removed from together + model_name: bigscience/bloom + tokenizer_name: bigscience/bloom + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.bloom_window_service.BloomWindowService" + args: {} + + - name: together/t0pp + deprecated: true # Removed from together + model_name: bigscience/t0pp + tokenizer_name: bigscience/T0pp + max_sequence_length: 1024 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.t0pp_window_service.T0ppWindowService" + args: {} + + ## Databricks + - name: together/dolly-v2-3b + model_name: databricks/dolly-v2-3b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/dolly-v2-7b + model_name: databricks/dolly-v2-7b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/dolly-v2-12b + model_name: databricks/dolly-v2-12b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + ## EleutherAI + - name: together/gpt-j-6b + deprecated: true # Removed from together + model_name: eleutherai/gpt-j-6b + tokenizer_name: EleutherAI/gpt-j-6B + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService" + args: {} + + - name: together/gpt-neox-20b + deprecated: true # Removed from together + model_name: eleutherai/gpt-neox-20b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: 
"helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/pythia-1b-v0 + model_name: eleutherai/pythia-1b-v0 + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/pythia-2.8b-v0 + model_name: eleutherai/pythia-2.8b-v0 + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/pythia-6.9b + model_name: eleutherai/pythia-6.9b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/pythia-12b-v0 + model_name: eleutherai/pythia-12b-v0 + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + ## Google + - name: together/t5-11b + deprecated: true # Removed from together + model_name: google/t5-11b + tokenizer_name: google/t5-11b + max_sequence_length: 511 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.t511b_window_service.T511bWindowService" + args: {} + + - name: together/flan-t5-xxl + deprecated: true # Removed from together + model_name: google/flan-t5-xxl + tokenizer_name: google/flan-t5-xxl + max_sequence_length: 511 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.flan_t5_window_service.FlanT5WindowService" + args: {} + + - name: together/ul2 + deprecated: true # Removed from together + model_name: google/ul2 + tokenizer_name: google/ul2 + max_sequence_length: 511 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.ul2_window_service.UL2WindowService" + args: {} + + ## HazyResearch + - name: together/h3-2.7b + deprecated: true# Not available on Together yet + model_name: hazyresearch/h3-2.7b + tokenizer_name: huggingface/gpt2 + max_sequence_length: 1024 + max_request_length: 1025 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gpt2_window_service.GPT2WindowService" + args: {} + + ## LMSYS + # TODO: might be deprecated. Needs to be checked. + # Together officialy supports vicuna 1.5, not sure if 1.3 is still supported. 
+ - name: together/vicuna-7b-v1.3 + model_name: lmsys/vicuna-7b-v1.3 + tokenizer_name: hf-internal-testing/llama-tokenizer + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService" + args: {} + + - name: together/vicuna-13b-v1.3 + model_name: lmsys/vicuna-13b-v1.3 + tokenizer_name: hf-internal-testing/llama-tokenizer + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService" + args: {} + + ## Meta + - name: together/llama-7b + model_name: meta/llama-7b + tokenizer_name: hf-internal-testing/llama-tokenizer + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService" + args: {} + + - name: together/llama-13b + model_name: meta/llama-13b + tokenizer_name: hf-internal-testing/llama-tokenizer + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService" + args: {} + + - name: together/llama-30b + model_name: meta/llama-30b + tokenizer_name: hf-internal-testing/llama-tokenizer + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService" + args: {} + + - name: together/llama-65b + model_name: meta/llama-65b + tokenizer_name: hf-internal-testing/llama-tokenizer + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService" + args: {} + + - name: together/llama-2-7b + model_name: meta/llama-2-7b + tokenizer_name: meta-llama/Llama-2-7b-hf + max_sequence_length: 4096 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService" + args: {} + + - name: together/llama-2-13b + model_name: meta/llama-2-13b + tokenizer_name: meta-llama/Llama-2-7b-hf + max_sequence_length: 4096 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService" + args: {} + + - name: together/llama-2-70b + model_name: meta/llama-2-70b + tokenizer_name: meta-llama/Llama-2-7b-hf + max_sequence_length: 4096 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.Llama2WindowService" + args: {} + + - name: together/opt-175b + deprecated: true # Not available on Together yet + model_name: meta/opt-175b + tokenizer_name: facebook/opt-66b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: 
"helm.benchmark.window_services.opt_window_service.OPTWindowService" + args: {} + + - name: together/opt-66b + deprecated: true # Not available on Together yet + model_name: meta/opt-66b + tokenizer_name: facebook/opt-66b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService" + args: {} + + - name: together/opt-6.7b + deprecated: true # Not available on Together yet + model_name: meta/opt-6.7b + tokenizer_name: facebook/opt-66b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService" + args: {} + + - name: together/opt-1.3b + deprecated: true # Not available on Together yet + model_name: meta/opt-1.3b + tokenizer_name: facebook/opt-66b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.opt_window_service.OPTWindowService" + args: {} + + ## MistralAI + - name: together/mistral-7b-v0.1 + model_name: mistralai/mistral-7b-v0.1 + tokenizer_name: mistralai/Mistral-7B-v0.1 + max_sequence_length: 4095 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {} + + ## MosaicML + - name: together/mpt-7b + deprecated: true # Not available on Together yet + model_name: mosaicml/mpt-7b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/mpt-instruct-7b + deprecated: true # Not available on Together yet + model_name: mosaicml/mpt-instruct-7b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/mpt-30b + model_name: mosaicml/mpt-30b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/mpt-instruct-30b + model_name: mosaicml/mpt-instruct-30b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + ## StabilityAI + - name: together/stablelm-base-alpha-3b + deprecated: true # Removed from together + model_name: stabilityai/stablelm-base-alpha-3b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 
4096 + max_request_length: 4097 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService" + args: {} + + - name: together/stablelm-base-alpha-7b + deprecated: true # Removed from together + model_name: stabilityai/stablelm-base-alpha-7b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 4096 + max_request_length: 4097 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.StableLMAlphaWindowService" + args: {} + + ## Stanford + - name: together/alpaca-7b + model_name: stanford/alpaca-7b + tokenizer_name: hf-internal-testing/llama-tokenizer + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.llama_window_service.LlamaWindowService" + args: {} + + ## Tiiuae + - name: together/falcon-7b + model_name: tiiuae/falcon-7b + tokenizer_name: tiiuae/falcon-7b + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {} + + - name: together/falcon-7b-instruct + model_name: tiiuae/falcon-7b-instruct + tokenizer_name: tiiuae/falcon-7b + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {} + + - name: together/falcon-40b + model_name: tiiuae/falcon-40b + tokenizer_name: tiiuae/falcon-7b + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {} + + - name: together/falcon-40b-instruct + model_name: tiiuae/falcon-40b-instruct + tokenizer_name: tiiuae/falcon-7b + max_sequence_length: 2048 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.huggingface_window_service.HuggingFaceWindowService" + args: {} + + ## Together + # These are models fine-tuned by Together (and not simply hosted by Together). 
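+  # Note: for these entries the deployment name matches the model_name (both use the
+  # "together/" prefix), since Together is both the host and the creator of the model.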
+ - name: together/gpt-jt-6b-v1 + model_name: together/gpt-jt-6b-v1 + tokenizer_name: EleutherAI/gpt-j-6B + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptj_window_service.GPTJWindowService" + args: {} + + - name: together/gpt-neoxt-chat-base-20b + model_name: together/gpt-neoxt-chat-base-20b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/redpajama-incite-base-3b-v1 + model_name: together/redpajama-incite-base-3b-v1 + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/redpajama-incite-instruct-3b-v1 + model_name: together/redpajama-incite-instruct-3b-v1 + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/redpajama-incite-base-7b + model_name: together/redpajama-incite-base-7b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + - name: together/redpajama-incite-instruct-7b + model_name: together/redpajama-incite-instruct-7b + tokenizer_name: EleutherAI/gpt-neox-20b + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.gptneox_window_service.GPTNeoXWindowService" + args: {} + + ## Tsinghua + - name: together/glm + deprecated: true # Not available on Together yet + model_name: tsinghua/glm + tokenizer_name: TsinghuaKEG/ice + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.ice_window_service.ICEWindowService" + args: {} + + ## Yandex + - name: together/yalm + deprecated: true # Not available on Together yet + model_name: yandex/yalm + tokenizer_name: Yandex/yalm + max_sequence_length: 2048 + max_request_length: 2049 + client_spec: + class_name: "helm.proxy.clients.together_client.TogetherClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.yalm_window_service.YaLMWindowService" + args: {} + + + + # Writer + - name: writer/palmyra-base + model_name: writer/palmyra-base + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_sequence_and_generated_tokens_length: 2048 + client_spec: + class_name: "helm.proxy.clients.palmyra_client.PalmyraClient" + args: {} + 
window_service_spec: + class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService" + args: {} + + - name: writer/palmyra-large + model_name: writer/palmyra-large + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_sequence_and_generated_tokens_length: 2048 + client_spec: + class_name: "helm.proxy.clients.palmyra_client.PalmyraClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService" + args: {} + + - name: writer/palmyra-instruct-30 + model_name: writer/palmyra-instruct-30 + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_sequence_and_generated_tokens_length: 2048 + client_spec: + class_name: "helm.proxy.clients.palmyra_client.PalmyraClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService" + args: {} + + - name: writer/palmyra-e + model_name: writer/palmyra-e + tokenizer_name: huggingface/gpt2 + max_sequence_length: 2048 + max_sequence_and_generated_tokens_length: 2048 + client_spec: + class_name: "helm.proxy.clients.palmyra_client.PalmyraClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.palmyra_window_service.PalmyraWindowService" + args: {} + + - name: writer/silk-road + model_name: writer/silk-road + tokenizer_name: huggingface/gpt2 + max_sequence_length: 8192 + max_sequence_and_generated_tokens_length: 8192 + client_spec: + class_name: "helm.proxy.clients.palmyra_client.PalmyraClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService" + args: {} + + - name: writer/palmyra-x + model_name: writer/palmyra-x + tokenizer_name: huggingface/gpt2 + max_sequence_length: 8192 + max_sequence_and_generated_tokens_length: 8192 + client_spec: + class_name: "helm.proxy.clients.palmyra_client.PalmyraClient" + args: {} + window_service_spec: + class_name: "helm.benchmark.window_services.palmyra_window_service.LongerPalmyraWindowService" + args: {} \ No newline at end of file diff --git a/src/helm/config/model_metadata.yaml b/src/helm/config/model_metadata.yaml new file mode 100644 index 00000000000..e9c097ea165 --- /dev/null +++ b/src/helm/config/model_metadata.yaml @@ -0,0 +1,1351 @@ +# This file defines all the models officially supported by the Helm API. +# The model names here should match the model names in model_deployments.yaml. + +# If you want to add a new model, you can technically do it here but we recommend +# you to do it in private/model_metadata.yaml instead. + +models: + + - name: simple/model1 + display_name: Simple Model 1 + description: This is a test model. + creator_organization_name: Helm + access: open + release_date: 2023-01-01 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + # AI21 Labs + - name: ai21/j1-jumbo # DEPRECATED + display_name: J1-Jumbo v1 (178B) + description: Jurassic-1 Jumbo (178B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)). 
+ creator_organization_name: AI21 Labs + access: limited + num_parameters: 178000000000 + release_date: 2021-08-11 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: ai21/j1-large # DEPRECATED + display_name: J1-Large v1 (7.5B) + description: Jurassic-1 Large (7.5B parameters) ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)). + creator_organization_name: AI21 Labs + access: limited + num_parameters: 7500000000 + release_date: 2021-08-11 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: ai21/j1-grande # DEPRECATED + display_name: J1-Grande v1 (17B) + description: Jurassic-1 Grande (17B parameters) with a "few tweaks" to the training process ([docs](https://studio.ai21.com/docs/jurassic1-language-models/), [tech report](https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf)). + creator_organization_name: AI21 Labs + access: limited + num_parameters: 17000000000 + release_date: 2022-05-03 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: ai21/j1-grande-v2-beta # DEPRECATED + display_name: J1-Grande v2 beta (17B) + description: Jurassic-1 Grande v2 beta (17B parameters) + creator_organization_name: AI21 Labs + access: limited + num_parameters: 17000000000 + release_date: 2022-10-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: ai21/j2-jumbo + display_name: Jurassic-2 Jumbo (178B) + description: Jurassic-2 Jumbo (178B parameters) ([docs](https://www.ai21.com/blog/introducing-j2)) + creator_organization_name: AI21 Labs + access: limited + num_parameters: 178000000000 + release_date: 2023-03-09 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: ai21/j2-large + display_name: Jurassic-2 Large (7.5B) + description: Jurassic-2 Large (7.5B parameters) ([docs](https://www.ai21.com/blog/introducing-j2)) + creator_organization_name: AI21 Labs + access: limited + num_parameters: 7500000000 + release_date: 2023-03-09 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: ai21/j2-grande + display_name: Jurassic-2 Grande (17B) + description: Jurassic-2 Grande (17B parameters) ([docs](https://www.ai21.com/blog/introducing-j2)) + creator_organization_name: AI21 Labs + access: limited + num_parameters: 17000000000 + release_date: 2023-03-09 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + # TODO(1524): Change AI21 model names + # - j2-jumbo -> j2-ultra + # - j2-grande -> j2-mid + # - j2-large -> j2-light + + + + # Aleph Alpha + # Aleph Alpha's Luminous models: https://docs.aleph-alpha.com/docs/introduction/luminous + # TODO: add Luminous World when it's released + - name: AlephAlpha/luminous-base + display_name: Luminous Base (13B) + description: Luminous Base (13B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/)) + creator_organization_name: Aleph Alpha + access: limited + num_parameters: 13000000000 + # TODO: get exact release date + release_date: 2022-01-01 + # Does not support echo + tags: [TEXT_MODEL_TAG, IMAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: AlephAlpha/luminous-extended + display_name: Luminous Extended (30B) + description: Luminous Extended (30B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/)) + creator_organization_name: Aleph Alpha + access: limited + num_parameters: 30000000000 + 
release_date: 2022-01-01 + # Does not support echo + tags: [TEXT_MODEL_TAG, IMAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: AlephAlpha/luminous-supreme + display_name: Luminous Supreme (70B) + description: Luminous Supreme (70B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/)) + creator_organization_name: Aleph Alpha + access: limited + num_parameters: 70000000000 + release_date: 2022-01-01 + # Does not support echo. + # TODO: images will be supported in the near future. Add IMAGE_MODEL_TAG. + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + # TODO: Uncomment when luminous-world is released. + # - name: AlephAlpha/luminous-world # Not released yet. + # display_name: Luminous World (178B) + # description: Luminous World (178B parameters) ([docs](https://docs.aleph-alpha.com/docs/introduction/luminous/)) + # creator_organization_name: Aleph Alpha + # access: limited + # num_parameters: TBD + # release_date: TBD + # # Does not support echo. + # tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Anthropic + - name: anthropic/claude-v1.3 + display_name: Anthropic Claude v1.3 + description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf). + creator_organization_name: Anthropic + access: limited + num_parameters: 52000000000 + release_date: 2023-03-17 + tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: anthropic/claude-instant-v1 + display_name: Anthropic Claude Instant V1 + description: A lightweight version of Claude, a model trained using reinforcement learning from human feedback ([docs](https://www.anthropic.com/index/introducing-claude)). + creator_organization_name: Anthropic + access: limited + release_date: 2023-03-17 + tags: [ANTHROPIC_CLAUDE_1_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: anthropic/claude-2.0 + display_name: Anthropic Claude 2.0 + description: Claude 2.0 is a general purpose large language model developed by Anthropic. It uses a transformer architecture and is trained via unsupervised learning, RLHF, and Constitutional AI (including both a supervised and Reinforcement Learning (RL) phase). ([model card](https://efficient-manatee.files.svdcdn.com/production/images/Model-Card-Claude-2.pdf)) + creator_organization_name: Anthropic + access: limited + release_date: 2023-07-11 + tags: [ANTHROPIC_CLAUDE_2_MODEL_TAG, TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + # DEPRECATED: Please do not use. + - name: anthropic/stanford-online-all-v4-s3 + display_name: Anthropic-LM v4-s3 (52B) + description: A 52B parameter language model, trained using reinforcement learning from human feedback [paper](https://arxiv.org/pdf/2204.05862.pdf). + creator_organization_name: Anthropic + access: closed + num_parameters: 52000000000 + release_date: 2021-12-01 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG] + + + + # Berkeley + - name: berkeley/koala-13b # NOT SUPPORTED + display_name: Koala (13B) + description: Koala (13B) is a chatbot fine-tuned from Llama (13B) on dialogue data gathered from the web. 
([blog post](https://bair.berkeley.edu/blog/2023/04/03/koala/)) + creator_organization_name: UC Berkeley + access: open + num_parameters: 13000000000 + release_date: 2023-04-03 + tags: [] # TODO: add tags + + + + # BigScience + - name: bigscience/bloom + display_name: BLOOM (176B) + description: BLOOM (176B parameters) is an autoregressive model trained on 46 natural languages and 13 programming languages ([paper](https://arxiv.org/pdf/2211.05100.pdf)). + creator_organization_name: BigScience + access: open + num_parameters: 176000000000 + release_date: 2022-06-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG] + + - name: bigscience/bloomz # NOT SUPPORTED + display_name: BLOOMZ (176B) + description: BLOOMZ (176B parameters) is BLOOM that has been fine-tuned on natural language instructions ([details](https://huggingface.co/bigscience/bloomz)). + creator_organization_name: BigScience + access: open + num_parameters: 176000000000 + release_date: 2022-11-03 + tags: [] # TODO: add tags + + - name: bigscience/t0pp + display_name: T0pp (11B) + description: T0pp (11B parameters) is an encoder-decoder model trained on a large set of different tasks specified in natural language prompts ([paper](https://arxiv.org/pdf/2110.08207.pdf)). + creator_organization_name: BigScience + access: open + num_parameters: 11000000000 + release_date: 2021-10-15 + # Does not support echo. + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG] + + + + # BigCode + - name: bigcode/santacoder + display_name: SantaCoder (1.1B) + description: SantaCoder (1.1B parameters) model trained on the Python, Java, and JavaScript subset of The Stack (v1.1) ([model card](https://huggingface.co/bigcode/santacoder)). + creator_organization_name: BigCode + access: open + num_parameters: 1100000000 + release_date: 2023-01-09 # ArXiv submission date + tags: [CODE_MODEL_TAG] + + - name: bigcode/starcoder + display_name: StarCoder (15.5B) + description: The StarCoder (15.5B parameter) model trained on 80+ programming languages from The Stack (v1.2) ([model card](https://huggingface.co/bigcode/starcoder)). + creator_organization_name: BigCode + access: open + num_parameters: 15500000000 + release_date: 2023-05-09 # ArXiv submission date + tags: [CODE_MODEL_TAG] + + + + # Cerebras Systems + - name: cerebras/cerebras-gpt-6.7b # NOT SUPPORTED + display_name: Cerebras GPT (6.7B) + description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf)) + creator_organization_name: Cerebras + access: limited + num_parameters: 6700000000 + release_date: 2023-04-06 + tags: [] # TODO: add tags + + - name: cerebras/cerebras-gpt-13b # NOT SUPPORTED + display_name: Cerebras GPT (13B) + description: Cerebras GPT is a family of open compute-optimal language models scaled from 111M to 13B parameters trained on the Eleuther Pile. ([paper](https://arxiv.org/pdf/2304.03208.pdf)) + creator_organization_name: Cerebras + access: limited + num_parameters: 13000000000 + release_date: 2023-04-06 + tags: [] # TODO: add tags + + + + # Cohere + # Model versioning and the possible versions are not documented here: + # https://docs.cohere.ai/generate-reference#model-optional. + # So, instead, we got the names of the models from the Cohere Playground.
+ # + # Note that their tokenizer and model were trained on English text and + # they do not have a dedicated decode API endpoint, so the adaptation + # step for language modeling fails for certain Scenarios: + # the_pile:subset=ArXiv + # the_pile:subset=Github + # the_pile:subset=PubMed Central + + # TODO: Consider renaming to new model names. + - name: cohere/xlarge-20220609 + display_name: Cohere xlarge v20220609 (52.4B) + description: Cohere xlarge v20220609 (52.4B parameters) + creator_organization_name: Cohere + access: limited + num_parameters: 52400000000 + release_date: 2022-06-09 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: cohere/large-20220720 # DEPRECATED + display_name: Cohere large v20220720 (13.1B) + description: Cohere large v20220720 (13.1B parameters), which is deprecated by Cohere as of December 2, 2022. + creator_organization_name: Cohere + access: limited + num_parameters: 13100000000 + release_date: 2022-07-20 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: cohere/medium-20220720 + display_name: Cohere medium v20220720 (6.1B) + description: Cohere medium v20220720 (6.1B parameters) + creator_organization_name: Cohere + access: limited + num_parameters: 6100000000 + release_date: 2022-07-20 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: cohere/small-20220720 # DEPRECATED + display_name: Cohere small v20220720 (410M) + description: Cohere small v20220720 (410M parameters), which is deprecated by Cohere as of December 2, 2022. + creator_organization_name: Cohere + access: limited + num_parameters: 410000000 + release_date: 2022-07-20 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: cohere/xlarge-20221108 + display_name: Cohere xlarge v20221108 (52.4B) + description: Cohere xlarge v20221108 (52.4B parameters) + creator_organization_name: Cohere + access: limited + num_parameters: 52400000000 + release_date: 2022-11-08 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: cohere/medium-20221108 # DEPRECATED + display_name: Cohere medium v20221108 (6.1B) + description: Cohere medium v20221108 (6.1B parameters) + creator_organization_name: Cohere + access: limited + num_parameters: 6100000000 + release_date: 2022-11-08 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: cohere/command-medium-beta # DEPRECATED + display_name: Cohere Command beta (6.1B) + description: Cohere Command beta (6.1B parameters) is fine-tuned from the medium model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)). + creator_organization_name: Cohere + access: limited + num_parameters: 6100000000 + release_date: 2022-11-08 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: cohere/command-xlarge-beta # DEPRECATED + display_name: Cohere Command beta (52.4B) + description: Cohere Command beta (52.4B parameters) is fine-tuned from the XL model to respond well with instruction-like prompts ([details](https://docs.cohere.ai/docs/command-beta)). + creator_organization_name: Cohere + access: limited + num_parameters: 52400000000 + release_date: 2022-11-08 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + # TODO: Fill in the details. 
+ - name: cohere/command + display_name: Cohere Command TODO + description: Cohere Command TODO + creator_organization_name: Cohere + access: limited + release_date: 2022-11-08 # TODO + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: cohere/command-light + display_name: Cohere Command TODO + description: Cohere Command TODO + creator_organization_name: Cohere + access: limited + release_date: 2022-11-08 # TODO + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + + + # Databricks + - name: databricks/dolly-v2-3b + display_name: Dolly V2 (3B) + description: Dolly V2 (3B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-2.8b. + creator_organization_name: Databricks + access: open + num_parameters: 2517652480 + release_date: 2023-04-12 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: databricks/dolly-v2-7b + display_name: Dolly V2 (7B) + description: Dolly V2 (7B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-6.9b. + creator_organization_name: Databricks + access: open + num_parameters: 6444163072 + release_date: 2023-04-12 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: databricks/dolly-v2-12b + display_name: Dolly V2 (12B) + description: Dolly V2 (12B) is an instruction-following large language model trained on the Databricks machine learning platform. It is based on pythia-12b. + creator_organization_name: Databricks + access: open + num_parameters: 11327027200 + release_date: 2023-04-12 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # DeepMind + - name: deepmind/gopher # NOT SUPPORTED + display_name: Gopher (280B) + description: Gopher (280B parameters) ([paper](https://arxiv.org/pdf/2112.11446.pdf)). + creator_organization_name: DeepMind + access: closed + num_parameters: 280000000000 + release_date: 2021-12-08 + tags: [] # TODO: add tags + + - name: deepmind/chinchilla # NOT SUPPORTED + display_name: Chinchilla (70B) + description: Chinchilla (70B parameters) ([paper](https://arxiv.org/pdf/2203.15556.pdf)). + creator_organization_name: DeepMind + access: closed + num_parameters: 70000000000 + release_date: 2022-03-31 + tags: [] # TODO: add tags + + + + # EleutherAI + - name: eleutherai/gpt-j-6b # Served by GooseAI, HuggingFace and Together. + display_name: GPT-J (6B) + description: GPT-J (6B parameters) autoregressive language model trained on The Pile ([details](https://arankomatsuzaki.wordpress.com/2021/06/04/gpt-j/)). + creator_organization_name: EleutherAI + access: open + num_parameters: 6000000000 + release_date: 2021-06-04 + # TODO: The BUGGY_TEMP_0_TAG is a deployment related tag (Together). + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, BUGGY_TEMP_0_TAG] + + - name: eleutherai/gpt-neox-20b # Served by GooseAI and Together. + display_name: GPT-NeoX (20B) + description: GPT-NeoX (20B parameters) autoregressive language model trained on The Pile ([paper](https://arxiv.org/pdf/2204.06745.pdf)). + creator_organization_name: EleutherAI + access: open + num_parameters: 20000000000 + release_date: 2022-02-02 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG] + + - name: eleutherai/pythia-1b-v0 + display_name: Pythia (1B) + description: Pythia (1B parameters).
The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers. + creator_organization_name: EleutherAI + access: open + num_parameters: 805736448 + release_date: 2023-02-13 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: eleutherai/pythia-2.8b-v0 + display_name: Pythia (2.8B) + description: Pythia (2.8B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers. + creator_organization_name: EleutherAI + access: open + num_parameters: 2517652480 + release_date: 2023-02-13 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: eleutherai/pythia-6.9b + display_name: Pythia (6.9B) + description: Pythia (6.9B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers. + creator_organization_name: EleutherAI + access: open + num_parameters: 6444163072 + release_date: 2023-02-13 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: eleutherai/pythia-12b-v0 + display_name: Pythia (12B) + description: Pythia (12B parameters). The Pythia project combines interpretability analysis and scaling laws to understand how knowledge develops and evolves during training in autoregressive transformers. + creator_organization_name: EleutherAI + access: open + num_parameters: 11327027200 + release_date: 2023-02-13 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Google + - name: google/t5-11b + display_name: T5 (11B) + description: T5 (11B parameters) is an encoder-decoder model trained on a multi-task mixture, where each task is converted into a text-to-text format ([paper](https://arxiv.org/pdf/1910.10683.pdf)). + creator_organization_name: Google + access: open + num_parameters: 11000000000 + release_date: 2019-10-23 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG] + + - name: google/ul2 + display_name: UL2 (20B) + description: UL2 (20B parameters) is an encoder-decoder model trained on the C4 corpus. It's similar to T5 but trained with a different objective and slightly different scaling knobs ([paper](https://arxiv.org/pdf/2205.05131.pdf)). + creator_organization_name: Google + access: open + num_parameters: 20000000000 + release_date: 2022-05-10 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG, NLG_PREFIX_TAG] + + - name: google/flan-t5-xxl + display_name: Flan-T5 (11B) + description: Flan-T5 (11B parameters) is T5 fine-tuned on 1.8K tasks ([paper](https://arxiv.org/pdf/2210.11416.pdf)). + creator_organization_name: Google + access: open + num_parameters: 11000000000 + release_date: 2022-12-06 # Paper date + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: google/palm # NOT SUPPORTED + display_name: PaLM (540B) + description: Pathways Language Model (540B parameters) is trained using 6144 TPU v4 chips ([paper](https://arxiv.org/pdf/2204.02311.pdf)). + creator_organization_name: Google + access: closed + num_parameters: 540000000000 + release_date: 2023-03-01 # was first announced on 2022-04 but remained private. 
+ tags: [] # TODO: add tags + + + + # HazyResearch + - name: hazyresearch/h3-2.7b + display_name: H3 (2.7B) + description: H3 (2.7B parameters) is a decoder-only language model based on state space models ([paper](https://arxiv.org/abs/2212.14052)). + creator_organization_name: HazyResearch + access: open + num_parameters: 2700000000 + release_date: 2023-01-23 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # HuggingFace + - name: huggingface/idefics-9b + display_name: IDEFICS (9B) + description: IDEFICS (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics)) + creator_organization_name: HuggingFace + access: open + num_parameters: 9000000000 + release_date: 2023-08-22 + tags: [VISION_LANGUAGE_MODEL_TAG] + + - name: huggingface/idefics-9b-instruct + display_name: IDEFICS instruct (9B) + description: IDEFICS instruct (9B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics)) + creator_organization_name: HuggingFace + access: open + num_parameters: 9000000000 + release_date: 2023-08-22 + tags: [VISION_LANGUAGE_MODEL_TAG] + + - name: huggingface/idefics-80b + display_name: IDEFICS (80B) + description: IDEFICS (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics)) + creator_organization_name: HuggingFace + access: open + num_parameters: 80000000000 + release_date: 2023-08-22 + tags: [VISION_LANGUAGE_MODEL_TAG] + + - name: huggingface/idefics-80b-instruct + display_name: IDEFICS instruct (80B) + description: IDEFICS instruct (80B parameters) is an open-source model based on DeepMind's Flamingo. ([blog](https://huggingface.co/blog/idefics)) + creator_organization_name: HuggingFace + access: open + num_parameters: 80000000000 + release_date: 2023-08-22 + tags: [VISION_LANGUAGE_MODEL_TAG] + + + + # Lightning AI + - name: lightningai/lit-gpt + display_name: Lit-GPT + description: Lit-GPT is an optimized collection of open-source LLMs for finetuning and inference. It supports – Falcon, Llama 2, Vicuna, LongChat, and other top-performing open-source large language models. + creator_organization_name: Lightning AI + access: open + release_date: 2023-04-04 + tags: [TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # LMSYS + - name: lmsys/vicuna-7b-v1.3 + display_name: Vicuna v1.3 (7B) + description: Vicuna v1.3 (7B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT. + creator_organization_name: LMSYS + access: open + num_parameters: 7000000000 + release_date: 2023-06-22 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: lmsys/vicuna-13b-v1.3 + display_name: Vicuna v1.3 (13B) + description: Vicuna v1.3 (13B) is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT. + creator_organization_name: LMSYS + access: open + num_parameters: 13000000000 + release_date: 2023-06-22 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + + + # Meta + - name: meta/opt-iml-175b # NOT SUPPORTED + display_name: OPT-IML (175B) + description: OPT-IML (175B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)). 
+ creator_organization_name: Meta + access: open + num_parameters: 175000000000 + release_date: 2022-12-22 + tags: [] # TODO: add tags + + - name: meta/opt-iml-30b # NOT SUPPORTED + display_name: OPT-IML (30B) + description: OPT-IML (30B parameters) is a suite of decoder-only transformer LMs that are multi-task fine-tuned on 2000 datasets ([paper](https://arxiv.org/pdf/2212.12017.pdf)). + creator_organization_name: Meta + access: open + num_parameters: 30000000000 + release_date: 2022-12-22 + tags: [] # TODO: add tags + + - name: meta/opt-175b + display_name: OPT (175B) + description: Open Pre-trained Transformers (175B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)). + creator_organization_name: Meta + access: open + num_parameters: 175000000000 + release_date: 2022-05-02 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG] + + - name: meta/opt-66b + display_name: OPT (66B) + description: Open Pre-trained Transformers (66B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)). + creator_organization_name: Meta + access: open + num_parameters: 66000000000 + release_date: 2022-05-02 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG] + + - name: meta/opt-6.7b + display_name: OPT (6.7B) + description: Open Pre-trained Transformers (6.7B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)). + creator_organization_name: Meta + access: open + num_parameters: 6700000000 + release_date: 2022-05-02 + # TODO: The BUGGY_TEMP_0_TAG is a deployment related tag (Together). + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, BUGGY_TEMP_0_TAG] + + - name: meta/opt-1.3b + display_name: OPT (1.3B) + description: Open Pre-trained Transformers (1.3B parameters) is a suite of decoder-only pre-trained transformers that are fully and responsibly shared with interested researchers ([paper](https://arxiv.org/pdf/2205.01068.pdf)). + creator_organization_name: Meta + access: open + num_parameters: 1300000000 + release_date: 2022-05-02 + # TODO: The BUGGY_TEMP_0_TAG is a deployment related tag (Together). + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, BUGGY_TEMP_0_TAG] + + - name: meta/galactica-120b # NOT SUPPORTED + display_name: Galactica (120B) + description: Galactica (120B parameters) is trained on 48 million papers, textbooks, lecture notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)). + creator_organization_name: Meta + access: open + num_parameters: 120000000000 + release_date: 2022-11-15 + tags: [] # TODO: add tags + + - name: meta/galactica-30b # NOT SUPPORTED + display_name: Galactica (30B) + description: Galactica (30B parameters) is trained on 48 million papers, textbooks, lecture notes, compounds and proteins, scientific websites, etc. ([paper](https://galactica.org/static/paper.pdf)).
+ creator_organization_name: Meta + access: open + num_parameters: 30000000000 + release_date: 2022-11-15 + tags: [] # TODO: add tags + + - name: meta/llama-7b + display_name: LLaMA (7B) + description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters. + creator_organization_name: Meta + access: open + num_parameters: 7000000000 + release_date: 2023-02-24 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: meta/llama-13b + display_name: LLaMA (13B) + description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters. + creator_organization_name: Meta + access: open + num_parameters: 13000000000 + release_date: 2023-02-24 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: meta/llama-30b + display_name: LLaMA (30B) + description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters. + creator_organization_name: Meta + access: open + num_parameters: 30000000000 + release_date: 2023-02-24 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: meta/llama-65b + display_name: LLaMA (65B) + description: LLaMA is a collection of foundation language models ranging from 7B to 65B parameters. + creator_organization_name: Meta + access: open + num_parameters: 65000000000 + release_date: 2023-02-24 + # TODO(#1828): Upgrade to FULL_FUNCTIONALITY_TEXT_MODEL_TAG + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: meta/llama-2-7b + display_name: Llama 2 (7B) + description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1. + creator_organization_name: Meta + access: open + num_parameters: 7000000000 + release_date: 2023-07-18 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: meta/llama-2-13b + display_name: Llama 2 (13B) + description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1. + creator_organization_name: Meta + access: open + num_parameters: 13000000000 + release_date: 2023-07-18 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: meta/llama-2-70b + display_name: Llama 2 (70B) + description: Llama 2 pretrained models are trained on 2 trillion tokens, and have double the context length of Llama 1. + creator_organization_name: Meta + access: open + num_parameters: 70000000000 + release_date: 2023-07-18 + # TODO(#1828): Upgrade to FULL_FUNCTIONALITY_TEXT_MODEL_TAG + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Microsoft/NVIDIA + - name: microsoft/TNLGv2_530B + display_name: TNLG v2 (530B) + description: TNLG v2 (530B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)). + creator_organization_name: Microsoft/NVIDIA + access: closed + num_parameters: 530000000000 + release_date: 2022-01-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: microsoft/TNLGv2_7B + display_name: TNLG v2 (6.7B) + description: TNLG v2 (6.7B parameters) autoregressive language model trained on a filtered subset of the Pile and CommonCrawl ([paper](https://arxiv.org/pdf/2201.11990.pdf)).
+ creator_organization_name: Microsoft/NVIDIA + access: closed + num_parameters: 6700000000 + release_date: 2022-01-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Mistral AI + - name: mistralai/mistral-7b-v0.1 + display_name: Mistral v0.1 (7B) + description: Mistral 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA) and Sliding-Window Attention (SWA). + creator_organization_name: Mistral AI + access: open + num_parameters: 7300000000 + release_date: 2023-09-27 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + + + # MosaicML + - name: mosaicml/mpt-7b + display_name: MPT (7B) + description: MPT (7B) is a Transformer trained from scratch on 1T tokens of text and code. + creator_organization_name: MosaicML + access: open + num_parameters: 6700000000 + release_date: 2023-05-05 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: mosaicml/mpt-7b-chat # NOT SUPPORTED + display_name: MPT-Chat (7B) + description: MPT-Chat (7B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (7B), a Transformer trained from scratch on 1T tokens of text and code. + creator_organization_name: MosaicML + access: open + num_parameters: 6700000000 + release_date: 2023-05-05 + tags: [] # TODO: add tags + + - name: mosaicml/mpt-instruct-7b + display_name: MPT-Instruct (7B) + description: MPT-Instruct (7B) is a model for short-form instruction following. It is built by finetuning MPT (7B), a Transformer trained from scratch on 1T tokens of text and code. + creator_organization_name: MosaicML + access: open + num_parameters: 6700000000 + release_date: 2023-05-05 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: mosaicml/mpt-30b + display_name: MPT (30B) + description: MPT (30B) is a Transformer trained from scratch on 1T tokens of text and code. + creator_organization_name: MosaicML + access: open + num_parameters: 30000000000 + release_date: 2023-06-22 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: mosaicml/mpt-30b-chat # NOT SUPPORTED + display_name: MPT-Chat (30B) + description: MPT-Chat (30B) is a chatbot-like model for dialogue generation. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code. + creator_organization_name: MosaicML + access: open + num_parameters: 30000000000 + release_date: 2023-06-22 + tags: [] # TODO: add tags + + - name: mosaicml/mpt-instruct-30b + display_name: MPT-Instruct (30B) + description: MPT-Instruct (30B) is a model for short-form instruction following. It is built by finetuning MPT (30B), a Transformer trained from scratch on 1T tokens of text and code. + creator_organization_name: MosaicML + access: open + num_parameters: 30000000000 + release_date: 2023-06-22 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Neurips + - name: neurips/local + display_name: Neurips Local + description: Neurips Local + creator_organization_name: Neurips + access: open + release_date: 2023-06-01 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # NVIDIA + - name: nvidia/megatron-gpt2 + display_name: Megatron GPT2 + description: GPT-2 implemented in Megatron-LM ([paper](https://arxiv.org/abs/1909.08053)).
+ creator_organization_name: NVIDIA + access: open + release_date: 2019-09-17 # paper date + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, BUGGY_TEMP_0_TAG] + + + + # OpenAI + + ## GPT 2 Models + # Not served by OpenAI, instead served by HuggingFace. + + - name: openai/gpt2 + display_name: GPT-2 (1.5B) + description: GPT-2 (1.5B parameters) is a transformer model trained on a large corpus of English text in a self-supervised fashion ([paper](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)). + creator_organization_name: OpenAI + access: open + num_parameters: 1500000000 + release_date: 2019-02-14 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + ## GPT 3 Models + # The list of models can be found here: https://beta.openai.com/docs/engines/gpt-3 + # DEPRECATED: Announced on July 06 2023 that these models will be shut down on January 04 2024. + + - name: openai/davinci # DEPRECATED + display_name: davinci (175B) + description: Original GPT-3 (175B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 175000000000 + release_date: 2020-05-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: openai/curie # DEPRECATED + display_name: curie (6.7B) + description: Original GPT-3 (6.7B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 6700000000 + release_date: 2020-05-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: openai/babbage # DEPRECATED + display_name: babbage (1.3B) + description: Original GPT-3 (1.3B parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 1300000000 + release_date: 2020-05-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: openai/ada # DEPRECATED + display_name: ada (350M) + description: Original GPT-3 (350M parameters) autoregressive language model ([paper](https://arxiv.org/pdf/2005.14165.pdf), [docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 350000000 + release_date: 2020-05-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: openai/text-davinci-003 # DEPRECATED + display_name: text-davinci-003 + description: text-davinci-003 model that involves reinforcement learning (PPO) with reward models. Derived from text-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 175000000000 + release_date: 2022-11-28 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + # TODO: text-davinci-002 supports insertion. Support insertion in our framework. + # https://github.com/stanford-crfm/benchmarking/issues/359 + - name: openai/text-davinci-002 # DEPRECATED + display_name: text-davinci-002 + description: text-davinci-002 model that involves supervised fine-tuning on human-written demonstrations. 
Derived from code-davinci-002 ([docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 175000000000 + release_date: 2022-01-27 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: openai/text-davinci-001 # DEPRECATED + display_name: text-davinci-001 + description: text-davinci-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 175000000000 + release_date: 2022-01-27 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: openai/text-curie-001 # DEPRECATED + display_name: text-curie-001 + description: text-curie-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 6700000000 + release_date: 2022-01-27 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: openai/text-babbage-001 # DEPRECATED + display_name: text-babbage-001 + description: text-babbage-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 1300000000 + release_date: 2022-01-27 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: openai/text-ada-001 # DEPRECATED + display_name: text-ada-001 + description: text-ada-001 model that involves supervised fine-tuning on human-written demonstrations ([docs](https://beta.openai.com/docs/model-index-for-researchers)). + creator_organization_name: OpenAI + access: limited + num_parameters: 350000000 + release_date: 2022-01-27 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + ## GPT 3.5 Turbo Models + # ChatGPT: https://openai.com/blog/chatgpt + + - name: openai/gpt-3.5-turbo-0301 + display_name: gpt-3.5-turbo-0301 + description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-03-01. + creator_organization_name: OpenAI + access: limited + release_date: 2023-03-01 + tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: openai/gpt-3.5-turbo-0613 + display_name: gpt-3.5-turbo-0613 + description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13. + creator_organization_name: OpenAI + access: limited + release_date: 2023-06-13 + tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + # Claimed length is 16,384; we round down to 16,000 for the same reasons as explained + # in the openai/gpt-3.5-turbo-0613 comment + - name: openai/gpt-3.5-turbo-16k-0613 + display_name: gpt-3.5-turbo-16k-0613 + description: Sibling model of text-davinci-003 is optimized for chat but works well for traditional completions tasks as well. Snapshot from 2023-06-13 with a longer context length of 16,384 tokens. 
+ creator_organization_name: OpenAI + access: limited + release_date: 2023-06-13 + tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + + ## GPT 4 Models + + - name: openai/gpt-4-0314 + display_name: gpt-4-0314 + description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from March 14th 2023. + creator_organization_name: OpenAI + access: limited + release_date: 2023-03-14 + tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: openai/gpt-4-32k-0314 + display_name: gpt-4-32k-0314 + description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from March 14th 2023. + creator_organization_name: OpenAI + access: limited + release_date: 2023-03-14 + tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: openai/gpt-4-0613 + display_name: gpt-4-0613 + description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 from 2023-06-13. + creator_organization_name: OpenAI + access: limited + release_date: 2023-06-13 + tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + - name: openai/gpt-4-32k-0613 + display_name: gpt-4-32k-0613 + description: GPT-4 is a large multimodal model (currently only accepting text inputs and emitting text outputs) that is optimized for chat but works well for traditional completions tasks. Snapshot of gpt-4 with a longer context length of 32,768 tokens from 2023-06-13. + creator_organization_name: OpenAI + access: limited + release_date: 2023-06-13 + tags: [TEXT_MODEL_TAG, OPENAI_CHATGPT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + + ## Codex Models + # DEPRECATED: Codex models have been shut down on March 23 2023. + + - name: openai/code-davinci-002 # DEPRECATED + display_name: code-davinci-002 + description: Codex-style model that is designed for pure code-completion tasks ([docs](https://beta.openai.com/docs/models/codex)). + creator_organization_name: OpenAI + access: limited + release_date: 2021-07-01 # TODO: Find correct date (this is for v1) + tags: [CODE_MODEL_TAG] + + - name: openai/code-davinci-001 # DEPRECATED + display_name: code-davinci-001 + description: code-davinci-001 model + creator_organization_name: OpenAI + access: limited + release_date: 2021-07-01 # Paper date + tags: [CODE_MODEL_TAG] + + - name: openai/code-cushman-001 # DEPRECATED + display_name: code-cushman-001 (12B) + description: Codex-style model that is a stronger, multilingual version of the Codex (12B) model in the [Codex paper](https://arxiv.org/pdf/2107.03374.pdf). 
+ creator_organization_name: OpenAI + access: limited + num_parameters: 12000000000 + release_date: 2021-07-01 # Paper date + tags: [CODE_MODEL_TAG] + + + ## Text Similarity Models + # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings + # The number of parameters is guessed based on the number of parameters of the + # corresponding GPT-3 model. + # DEPRECATED: Announced on July 06 2023 that first generation embeddings models + # will be shut down on January 04 2024. + + - name: openai/text-similarity-davinci-001 # DEPRECATED + display_name: text-similarity-davinci-001 + description: Embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/introducing-text-and-code-embeddings)). + creator_organization_name: OpenAI + access: limited + num_parameters: 175000000000 + release_date: 2022-01-25 # Blog post date + tags: [TEXT_SIMILARITY_MODEL_TAG] + + - name: openai/text-similarity-curie-001 # DEPRECATED + display_name: text-similarity-curie-001 + description: Embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/introducing-text-and-code-embeddings)). + creator_organization_name: OpenAI + access: limited + num_parameters: 6700000000 + release_date: 2022-01-25 # Blog post date + tags: [TEXT_SIMILARITY_MODEL_TAG] + + - name: openai/text-similarity-babbage-001 # DEPRECATED + display_name: text-similarity-babbage-001 + description: Embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/introducing-text-and-code-embeddings)). + creator_organization_name: OpenAI + access: limited + num_parameters: 1300000000 + release_date: 2022-01-25 # Blog post date + tags: [TEXT_SIMILARITY_MODEL_TAG] + + - name: openai/text-similarity-ada-001 # DEPRECATED + display_name: text-similarity-ada-001 + description: Embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/introducing-text-and-code-embeddings)). + creator_organization_name: OpenAI + access: limited + num_parameters: 350000000 + release_date: 2022-01-25 # Blog post date + tags: [TEXT_SIMILARITY_MODEL_TAG] + + - name: openai/text-embedding-ada-002 + display_name: text-embedding-ada-002 + description: An improved embedding model that is designed for text similarity tasks ([docs](https://openai.com/blog/new-and-improved-embedding-model)). + creator_organization_name: OpenAI + access: limited + release_date: 2022-12-15 # Blog post date + tags: [TEXT_SIMILARITY_MODEL_TAG] + + + + # Salesforce + - name: salesforce/codegen # NOT SUPPORTED + display_name: CodeGen (16B) + description: CodeGen (16B parameters) is an open dense code model trained for multi-turn program synthesis ([paper](https://arxiv.org/pdf/2203.13474.pdf)). + creator_organization_name: Salesforce + access: open + num_parameters: 16000000000 + release_date: 2022-03-25 + tags: [] # TODO: add tags + + + + # Stability AI + - name: stabilityai/stablelm-base-alpha-3b + display_name: StableLM-Base-Alpha (3B) + description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models.
+ creator_organization_name: Stability AI + access: open + num_parameters: 3000000000 + release_date: 2023-04-20 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: stabilityai/stablelm-base-alpha-7b + display_name: StableLM-Base-Alpha (7B) + description: StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models pre-trained on a diverse collection of English datasets with a sequence length of 4096 to push beyond the context window limitations of existing open-source language models. + creator_organization_name: Stability AI + access: open + num_parameters: 7000000000 + release_date: 2023-04-20 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Stanford + - name: stanford/alpaca-7b + display_name: Alpaca (7B) + description: Alpaca 7B is a model fine-tuned from the LLaMA 7B model on 52K instruction-following demonstrations. + creator_organization_name: Stanford + access: open + num_parameters: 7000000000 + release_date: 2023-03-13 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG] + + + + # TII UAE + - name: tiiuae/falcon-7b + display_name: Falcon (7B) + description: Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora. + creator_organization_name: TII UAE + access: open + num_parameters: 7000000000 + release_date: 2023-03-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: tiiuae/falcon-7b-instruct + display_name: Falcon-Instruct (7B) + description: Falcon-7B-Instruct is a 7B parameters causal decoder-only model built by TII based on Falcon-7B and finetuned on a mixture of chat/instruct datasets. + creator_organization_name: TII UAE + access: open + num_parameters: 7000000000 + release_date: 2023-03-15 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: tiiuae/falcon-40b + display_name: Falcon (40B) + description: Falcon-40B is a 40B parameters causal decoder-only model built by TII and trained on 1,500B tokens of RefinedWeb enhanced with curated corpora. + creator_organization_name: TII UAE + access: open + num_parameters: 40000000000 + release_date: 2023-05-25 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: tiiuae/falcon-40b-instruct + display_name: Falcon-Instruct (40B) + description: Falcon-40B-Instruct is a 40B parameters causal decoder-only model built by TII based on Falcon-40B and finetuned on a mixture of chat/instruct datasets. + creator_organization_name: TII UAE + access: open + num_parameters: 40000000000 + release_date: 2023-05-25 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Together + - name: together/gpt-jt-6b-v1 + display_name: GPT-JT (6B) + description: GPT-JT (6B parameters) is a fork of GPT-J ([blog post](https://www.together.xyz/blog/releasing-v1-of-gpt-jt-powered-by-open-source-ai)). + creator_organization_name: Together + access: open + num_parameters: 6700000000 + release_date: 2022-11-29 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: together/gpt-neoxt-chat-base-20b + display_name: GPT-NeoXT-Chat-Base (20B) + description: GPT-NeoXT-Chat-Base (20B) is fine-tuned from GPT-NeoX, serving as a base model for developing open-source chatbots.
+ creator_organization_name: Together + access: open + num_parameters: 20000000000 + release_date: 2023-03-08 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, CHATML_MODEL_TAG] + + - name: together/redpajama-incite-base-3b-v1 + display_name: RedPajama-INCITE-Base-v1 (3B) + description: RedPajama-INCITE-Base-v1 (3B parameters) is a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible. + creator_organization_name: Together + access: open + num_parameters: 3000000000 + release_date: 2023-05-05 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: together/redpajama-incite-instruct-3b-v1 + display_name: RedPajama-INCITE-Instruct-v1 (3B) + description: RedPajama-INCITE-Instruct-v1 (3B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible. + creator_organization_name: Together + access: open + num_parameters: 3000000000 + release_date: 2023-05-05 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: together/redpajama-incite-chat-3b-v1 # NOT SUPPORTED + display_name: RedPajama-INCITE-Chat-v1 (3B) + description: RedPajama-INCITE-Chat-v1 (3B parameters) is a model fine-tuned on OASST1 and Dolly2 to enhance chatting ability. It is built from RedPajama-INCITE-Base-v1 (3B), a 3 billion base model that aims to replicate the LLaMA recipe as closely as possible. + creator_organization_name: Together + access: open + num_parameters: 3000000000 + release_date: 2023-05-05 + tags: [] # TODO: add tags + + - name: together/redpajama-incite-base-7b + display_name: RedPajama-INCITE-Base (7B) + description: RedPajama-INCITE-Base (7B parameters) is a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible. + creator_organization_name: Together + access: open + num_parameters: 7000000000 + release_date: 2023-05-05 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: together/redpajama-incite-instruct-7b + display_name: RedPajama-INCITE-Instruct (7B) + description: RedPajama-INCITE-Instruct (7B parameters) is a model fine-tuned for few-shot applications on the data of GPT-JT. It is built from RedPajama-INCITE-Base (7B), a 7 billion base model that aims to replicate the LLaMA recipe as closely as possible. + creator_organization_name: Together + access: open + num_parameters: 7000000000 + release_date: 2023-05-05 + tags: [TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Tsinghua + - name: tsinghua/glm + display_name: GLM (130B) + description: GLM (130B parameters) is an open bilingual (English & Chinese) bidirectional dense model that was trained using the General Language Model (GLM) procedure ([paper](https://arxiv.org/pdf/2210.02414.pdf)). + creator_organization_name: Tsinghua + access: open + num_parameters: 130000000000 + release_date: 2022-08-04 + # Inference with echo=True is not feasible -- in the prompt encoding phase, they use + # bidirectional attention and do not perform predictions on them. + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG] + + - name: tsinghua/codegeex # NOT SUPPORTED + display_name: CodeGeeX (13B) + description: CodeGeeX (13B parameters) is an open dense code model trained on more than 20 programming languages on a corpus of more than 850B tokens ([blog](http://keg.cs.tsinghua.edu.cn/codegeex/)).
+ creator_organization_name: Tsinghua + access: open + num_parameters: 13000000000 + release_date: 2022-09-19 + tags: [] # TODO: add tags + + + + # Writer + - name: writer/palmyra-base + display_name: Palmyra Base (5B) + description: Palmyra Base (5B) + creator_organization_name: Writer + access: limited + num_parameters: 5000000000 + release_date: 2022-10-13 + # Does not support echo + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: writer/palmyra-large + display_name: Palmyra Large (20B) + description: Palmyra Large (20B) + creator_organization_name: Writer + access: limited + num_parameters: 20000000000 + release_date: 2022-12-23 + # Does not support echo + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: writer/palmyra-instruct-30 + display_name: InstructPalmyra (30B) + description: InstructPalmyra (30B parameters) is trained using reinforcement learning techniques based on feedback from humans. + creator_organization_name: Writer + access: limited + num_parameters: 30000000000 + release_date: 2023-02-16 + # Does not support echo + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: writer/palmyra-e + display_name: Palmyra E (30B) + description: Palmyra E (30B) + creator_organization_name: Writer + access: limited + num_parameters: 30000000000 + release_date: 2023-03-03 + # Does not support echo + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: writer/silk-road + display_name: Silk Road (35B) + description: Silk Road (35B) + creator_organization_name: Writer + access: limited + num_parameters: 35000000000 + release_date: 2023-04-13 + # Does not support echo + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + - name: writer/palmyra-x + display_name: Palmyra X (43B) + description: Palmyra-X (43B parameters) is trained to adhere to instructions using human feedback and utilizes a technique called multiquery attention. Furthermore, a new feature called 'self-instruct' has been introduced, which includes the implementation of an early stopping criteria specifically designed for minimal instruction tuning ([paper](https://dev.writer.com/docs/becoming-self-instruct-introducing-early-stopping-criteria-for-minimal-instruct-tuning)). + creator_organization_name: Writer + access: limited + num_parameters: 43000000000 + release_date: 2023-06-11 + # Does not support echo + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG] + + + + # Yandex + - name: yandex/yalm + display_name: YaLM (100B) + description: YaLM (100B parameters) is an autoregressive language model trained on English and Russian text ([GitHub](https://github.com/yandex/YaLM-100B)). 
+ creator_organization_name: Yandex + access: open + num_parameters: 100000000000 + release_date: 2022-06-23 + tags: [TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG] \ No newline at end of file diff --git a/src/helm/config/tokenizer_configs.yaml b/src/helm/config/tokenizer_configs.yaml new file mode 100644 index 00000000000..c7c0d1446ec --- /dev/null +++ b/src/helm/config/tokenizer_configs.yaml @@ -0,0 +1,202 @@ +tokenizer_configs: + + - name: simple/model1 + tokenizer_spec: + class_name: "helm.proxy.tokenizers.simple_tokenizer.SimpleTokenizer" + end_of_text_token: "" + prefix_token: "" + + # AI21 + - name: ai21/j1 + tokenizer_spec: + class_name: "helm.proxy.tokenizers.ai21_tokenizer.AI21Tokenizer" + end_of_text_token: " " + prefix_token: "" + + # AlephAlpha + - name: AlephAlpha/luminous-base + tokenizer_spec: + class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer" + end_of_text_token: "" + prefix_token: "" + - name: AlephAlpha/luminous-extended + tokenizer_spec: + class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer" + end_of_text_token: "" + prefix_token: "" + - name: AlephAlpha/luminous-supreme + tokenizer_spec: + class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer" + end_of_text_token: "" + prefix_token: "" + - name: AlephAlpha/luminous-world + tokenizer_spec: + class_name: "helm.proxy.tokenizers.aleph_alpha_tokenizer.AlephAlphaTokenizer" + end_of_text_token: "" + prefix_token: "" + + # Anthropic + - name: anthropic/claude + tokenizer_spec: + class_name: "helm.proxy.tokenizers.anthropic_tokenizer.AnthropicTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + + # Bigcode + - name: bigcode/santacoder + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + - name: bigcode/starcoder + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + + # Bigscience + - name: bigscience/bloom + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + - name: bigscience/T0pp + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + + # Cohere + - name: cohere/cohere + tokenizer_spec: + class_name: "helm.proxy.tokenizers.cohere_tokenizer.CohereTokenizer" + end_of_text_token: "" + prefix_token: ":" + + # EleutherAI + - name: EleutherAI/gpt-j-6B + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + - name: EleutherAI/gpt-neox-20b + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + + # Facebook + - name: facebook/opt-66b + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + + # Google + - name: google/t5-11b + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + - name: google/flan-t5-xxl + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + 
end_of_text_token: "" + prefix_token: "" + - name: google/ul2 + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + + # Hf-internal-testing + - name: hf-internal-testing/llama-tokenizer + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + + # HuggingFaceM4 + - name: HuggingFaceM4/idefics-9b + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + - name: HuggingFaceM4/idefics-9b-instruct + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + - name: HuggingFaceM4/idefics-80b + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + - name: HuggingFaceM4/idefics-80b-instruct + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + + # Huggingface + - name: huggingface/gpt2 + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + + # Lighting AI + - name: lightningai/lit-gpt + tokenizer_spec: + class_name: "helm.proxy.tokenizers.lit_gpt_tokenizer.LitGPTTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + + # Meta-llama + - name: meta-llama/Llama-2-7b-hf + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + + # Mistralai + - name: mistralai/Mistral-7B-v0.1 + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "" + prefix_token: "" + + # Neurips + - name: neurips/local + tokenizer_spec: + class_name: "helm.proxy.tokenizers.http_model_tokenizer.HTTPModelTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + + # Openai + - name: openai/cl100k_base + tokenizer_spec: + class_name: "helm.proxy.tokenizers.tiktoken_tokenizer.TiktokenTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "<|endoftext|>" + + # Tiiuae + - name: tiiuae/falcon-7b + tokenizer_spec: + class_name: "helm.proxy.tokenizers.huggingface_tokenizer.HuggingFaceTokenizer" + end_of_text_token: "<|endoftext|>" + prefix_token: "" + + # TsinghuaKEG + - name: TsinghuaKEG/ice + tokenizer_spec: + class_name: "helm.proxy.tokenizers.ice_tokenizer.ICETokenizer" + end_of_text_token: "" + prefix_token: "" + + # Yandex + - name: Yandex/yalm + tokenizer_spec: + class_name: "helm.proxy.tokenizers.yalm_tokenizer.YaLMTokenizer" + end_of_text_token: "" + prefix_token: "" \ No newline at end of file diff --git a/src/helm/proxy/clients/aleph_alpha_client.py b/src/helm/proxy/clients/aleph_alpha_client.py index a988938ae33..ae7116cef2d 100644 --- a/src/helm/proxy/clients/aleph_alpha_client.py +++ b/src/helm/proxy/clients/aleph_alpha_client.py @@ -2,8 +2,6 @@ import requests from typing import Any, Dict, List -from aleph_alpha_client import Client as AlephAlphaPythonClient - from helm.common.cache import CacheConfig from helm.common.request import wrap_request_time, Request, RequestResult, Sequence, Token from helm.proxy.tokenizers.tokenizer import Tokenizer @@ -16,7 +14,6 @@ class 
AlephAlphaClient(CachingClient): def __init__(self, api_key: str, tokenizer: Tokenizer, cache_config: CacheConfig): super().__init__(cache_config=cache_config, tokenizer=tokenizer) self.api_key: str = api_key - self._aleph_alpha_client = AlephAlphaPythonClient(token=api_key) def _send_request(self, endpoint: str, raw_request: Dict[str, Any]) -> Dict[str, Any]: response = requests.request( diff --git a/src/helm/proxy/clients/anthropic_client.py b/src/helm/proxy/clients/anthropic_client.py index 0cfde926c58..6f6dd8f1c7d 100644 --- a/src/helm/proxy/clients/anthropic_client.py +++ b/src/helm/proxy/clients/anthropic_client.py @@ -249,7 +249,7 @@ def make_request(self, request: Request) -> RequestResult: if request.embedding: return EMBEDDING_UNAVAILABLE_REQUEST_RESULT # Validate the fields of `Request` - if request.model != "anthropic/stanford-online-all-v4-s3": + if request.model_engine != "stanford-online-all-v4-s3": raise ValueError(f"Invalid model: {request.model}") if request.max_tokens > AnthropicLegacyClient.MAX_COMPLETION_LENGTH: raise ValueError( diff --git a/src/helm/proxy/clients/auto_client.py b/src/helm/proxy/clients/auto_client.py index 84239fb4b05..bcb00e7b8a5 100644 --- a/src/helm/proxy/clients/auto_client.py +++ b/src/helm/proxy/clients/auto_client.py @@ -1,11 +1,10 @@ import os from dataclasses import replace -from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional from retrying import Attempt, RetryError -from helm.benchmark.model_deployment_registry import get_model_deployment +from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment from helm.benchmark.tokenizer_config_registry import get_tokenizer_config from helm.common.cache import CacheConfig, MongoCacheConfig, SqliteCacheConfig from helm.common.hierarchical_logger import hlog @@ -24,7 +23,6 @@ from helm.proxy.tokenizers.tokenizer import Tokenizer from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer -from .http_model_client import HTTPModelClient if TYPE_CHECKING: import helm.proxy.clients.huggingface_client @@ -35,11 +33,7 @@ class AuthenticationError(NonRetriableException): class AutoClient(Client): - """Automatically dispatch to the proper `Client` based on the organization. - - The modules for each client are lazily imported when the respective client is created. - This greatly speeds up the import time of this module, and allows the client modules to - use optional dependencies.""" + """Automatically dispatch to the proper `Client` based on the model deployment name.""" def __init__(self, credentials: Mapping[str, Any], cache_path: str, mongo_uri: str = ""): self.credentials = credentials @@ -62,177 +56,77 @@ def _build_cache_config(self, organization: str) -> CacheConfig: # TODO: Allow setting CacheConfig.follower_cache_path from a command line flag. return SqliteCacheConfig(client_cache_path) - def _get_client(self, model: str) -> Client: + def _provide_api_key(self, host_organization: str, model_deployment_name: Optional[str] = None) -> Optional[str]: + api_key_name = host_organization + "ApiKey" + if api_key_name in self.credentials: + hlog(f"Using host_organization api key defined in credentials.conf: {api_key_name}") + return self.credentials[api_key_name] + if "deployments" not in self.credentials: + hlog( + "WARNING: Could not find key 'deployments' in credentials.conf, " + f"therefore the API key {api_key_name} should be specified." 
+ ) + return None + deployment_api_keys = self.credentials["deployments"] + if model_deployment_name is None: + hlog( + f"WARNING: Could not find key '{api_key_name}' in credentials.conf " + "and no model_deployment_name provided" + ) + return None + if model_deployment_name not in deployment_api_keys: + hlog(f"WARNING: Could not find key '{model_deployment_name}' under key 'deployments' in credentials.conf") + return None + return deployment_api_keys[model_deployment_name] + + def _get_client(self, model_deployment_name: str) -> Client: """Return a client based on the model, creating it if necessary.""" - client: Optional[Client] = self.clients.get(model) - - if client is None: - organization: str = model.split("/")[0] - cache_config: CacheConfig = self._build_cache_config(organization) - tokenizer: Tokenizer = self._get_tokenizer(organization) - - # TODO: Migrate all clients to use model deployments - model_deployment = get_model_deployment(model) - if model_deployment: - - def provide_api_key(): - if "deployments" not in self.credentials: - raise AuthenticationError("Could not find key 'deployments' in credentials.conf") - deployment_api_keys = self.credentials["deployments"] - if model not in deployment_api_keys: - raise AuthenticationError( - f"Could not find key '{model}' under key 'deployments' in credentials.conf" - ) - return deployment_api_keys[model] - - # Perform dependency injection to fill in remaining arguments. - # Dependency injection is needed here for these reasons: - # - # 1. Different clients have different parameters. Dependency injection provides arguments - # that match the parameters of the client. - # 2. Some arguments, such as the tokenizer, are not static data objects that can be - # in the users configuration file. Instead, they have to be constructed dynamically at - # runtime. - # 3. The providers must be lazily-evaluated, because eager evaluation can result in an - # exception. For instance, some clients do not require an API key, so trying to fetch - # the API key from configuration eagerly will result in an exception because the user - # will not have configured an API key. 
- client_spec = inject_object_spec_args( - model_deployment.client_spec, - constant_bindings={"cache_config": cache_config}, - provider_bindings={"api_key": provide_api_key}, - ) - client = create_object(client_spec) - elif organization == "neurips": - client = HTTPModelClient(tokenizer=tokenizer, cache_config=cache_config) - elif organization == "openai": - from helm.proxy.clients.openai_client import OpenAIClient - - org_id = self.credentials.get("openaiOrgId", None) - api_key = self.credentials.get("openaiApiKey", None) - client = OpenAIClient( - tokenizer=tokenizer, - cache_config=cache_config, - api_key=api_key, - org_id=org_id, - ) - elif organization == "AlephAlpha": - from helm.proxy.clients.aleph_alpha_client import AlephAlphaClient - - client = AlephAlphaClient( - tokenizer=tokenizer, - api_key=self.credentials["alephAlphaKey"], - cache_config=cache_config, - ) - elif organization == "ai21": - from helm.proxy.clients.ai21_client import AI21Client - - client = AI21Client( - tokenizer=tokenizer, - api_key=self.credentials["ai21ApiKey"], - cache_config=cache_config, - ) - elif organization == "cohere": - from helm.proxy.clients.cohere_client import CohereClient - - client = CohereClient( - tokenizer=tokenizer, - api_key=self.credentials["cohereApiKey"], - cache_config=cache_config, - ) - elif organization == "gooseai": - from helm.proxy.clients.goose_ai_client import GooseAIClient - - org_id = self.credentials.get("gooseaiOrgId", None) - client = GooseAIClient( - tokenizer=tokenizer, - api_key=self.credentials["gooseaiApiKey"], - cache_config=cache_config, - org_id=org_id, - ) - elif organization == "huggingface": - from helm.proxy.clients.huggingface_client import HuggingFaceClient - - client = HuggingFaceClient(tokenizer=tokenizer, cache_config=cache_config) - elif organization == "anthropic": - from helm.proxy.clients.anthropic_client import AnthropicClient - - client = AnthropicClient( - api_key=self.credentials.get("anthropicApiKey", None), - tokenizer=tokenizer, - cache_config=cache_config, - ) - elif organization == "microsoft": - from helm.proxy.clients.microsoft_client import MicrosoftClient - - org_id = self.credentials.get("microsoftOrgId", None) - lock_file_path: str = os.path.join(self.cache_path, f"{organization}.lock") - client = MicrosoftClient( - api_key=self.credentials.get("microsoftApiKey", None), - tokenizer=tokenizer, - lock_file_path=lock_file_path, - cache_config=cache_config, - org_id=org_id, - ) - elif organization == "google": - from helm.proxy.clients.google_client import GoogleClient - - client = GoogleClient( - tokenizer=tokenizer, - cache_config=cache_config, - ) - elif organization in [ - "together", - "databricks", - "eleutherai", - "lmsys", - "meta", - "mistralai", - "mosaicml", - "stabilityai", - "stanford", - "tiiuae", - ]: - from helm.proxy.clients.together_client import TogetherClient - - client = TogetherClient( - api_key=self.credentials.get("togetherApiKey", None), - tokenizer=tokenizer, - cache_config=cache_config, - ) - elif organization == "simple": - from helm.proxy.clients.simple_client import SimpleClient - - client = SimpleClient(tokenizer=tokenizer, cache_config=cache_config) - elif organization == "writer": - from helm.proxy.clients.palmyra_client import PalmyraClient - - client = PalmyraClient( - api_key=self.credentials["writerApiKey"], - tokenizer=tokenizer, - cache_config=cache_config, - ) - elif organization == "nvidia": - from helm.proxy.clients.megatron_client import MegatronClient - - client = 
MegatronClient(tokenizer=tokenizer, cache_config=cache_config) - - elif organization == "lightningai": - from helm.proxy.clients.lit_gpt_client import LitGPTClient - - client = LitGPTClient( - tokenizer=tokenizer, - cache_config=cache_config, - checkpoint_dir=Path(os.environ.get("LIT_GPT_CHECKPOINT_DIR", "")), - precision=os.environ.get("LIT_GPT_PRECISION", "bf16-true"), - ) - elif organization == "HuggingFaceM4": - from helm.proxy.clients.vision_language.idefics_client import IDEFICSClient - - client = IDEFICSClient(tokenizer=tokenizer, cache_config=cache_config) - else: - raise ValueError(f"Could not find client for model: {model}") - self.clients[model] = client + # First try to find the client in the cache + client: Optional[Client] = self.clients.get(model_deployment_name) + if client is not None: + return client + + # Otherwise, create the client + model_deployment: ModelDeployment = get_model_deployment(model_deployment_name) + if model_deployment: + # Perform dependency injection to fill in remaining arguments. + # Dependency injection is needed here for these reasons: + # + # 1. Different clients have different parameters. Dependency injection provides arguments + # that match the parameters of the client. + # 2. Some arguments, such as the tokenizer, are not static data objects that can be + # in the users configuration file. Instead, they have to be constructed dynamically at + # runtime. + # 3. The providers must be lazily-evaluated, because eager evaluation can result in an + # exception. For instance, some clients do not require an API key, so trying to fetch + # the API key from configuration eagerly will result in an exception because the user + # will not have configured an API key. + + # Prepare a cache + host_organization: str = model_deployment.host_organization + cache_config: CacheConfig = self._build_cache_config(host_organization) + + client_spec = inject_object_spec_args( + model_deployment.client_spec, + constant_bindings={"cache_config": cache_config}, + provider_bindings={ + "api_key": lambda: self._provide_api_key(host_organization, model_deployment_name), + "tokenizer": lambda: self._get_tokenizer( + tokenizer_name=model_deployment.tokenizer_name or model_deployment.name + ), + "org_id": lambda: self.credentials.get( + host_organization + "OrgId", None + ), # OpenAI, GooseAI, Microsoft + "lock_file_path": lambda: os.path.join(self.cache_path, f"{host_organization}.lock"), # Microsoft + }, + ) + client = create_object(client_spec) + else: + raise ValueError(f"Could not find client for model deployment: {model_deployment_name}") + + # Cache the client + self.clients[model_deployment_name] = client + return client def make_request(self, request: Request) -> RequestResult: @@ -246,14 +140,15 @@ def make_request(self, request: Request) -> RequestResult: def make_request_with_retry(client: Client, request: Request) -> RequestResult: return client.make_request(request) - client: Client = self._get_client(request.model) + client: Client = self._get_client(request.model_deployment) try: return make_request_with_retry(client=client, request=request) except RetryError as e: last_attempt: Attempt = e.last_attempt retry_error: str = ( - f"Failed to make request to {request.model} after retrying {last_attempt.attempt_number} times" + f"Failed to make request to {request.model_deployment} after retrying " + f"{last_attempt.attempt_number} times" ) hlog(retry_error) @@ -270,90 +165,19 @@ def _get_tokenizer(self, tokenizer_name: str) -> Tokenizer: organization: str = 
tokenizer_name.split("/")[0] cache_config: CacheConfig = self._build_cache_config(organization) - # TODO: Migrate all clients to use tokenizer configs tokenizer_config = get_tokenizer_config(tokenizer_name) if tokenizer_config: tokenizer_spec = inject_object_spec_args( - tokenizer_config.tokenizer_spec, constant_bindings={"cache_config": cache_config} - ) - return create_object(tokenizer_spec) - elif organization in [ - "gooseai", - "huggingface", - "microsoft", - "google", - "writer", # Palmyra - "nvidia", - "EleutherAI", - "facebook", - "meta-llama", - "hf-internal-testing", - "mistralai", - "HuggingFaceM4", - # Together - "together", - "databricks", - "eleutherai", - "lmsys", - "meta", - "mosaicml", - "stabilityai", - "stanford", - "tiiuae", - "bigcode", - "bigscience", - ]: - from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer - - tokenizer = HuggingFaceTokenizer(cache_config=cache_config) - elif organization == "neurips": - from helm.proxy.tokenizers.http_model_tokenizer import HTTPModelTokenizer - - tokenizer = HTTPModelTokenizer(cache_config=cache_config) - elif organization == "openai": - from helm.proxy.tokenizers.tiktoken_tokenizer import TiktokenTokenizer - - tokenizer = TiktokenTokenizer(cache_config=cache_config) - elif organization == "AlephAlpha": - from helm.proxy.tokenizers.aleph_alpha_tokenizer import AlephAlphaTokenizer - - tokenizer = AlephAlphaTokenizer(api_key=self.credentials["alephAlphaKey"], cache_config=cache_config) - elif organization == "ai21": - from helm.proxy.tokenizers.ai21_tokenizer import AI21Tokenizer - - tokenizer = AI21Tokenizer(api_key=self.credentials["ai21ApiKey"], cache_config=cache_config) - elif organization == "cohere": - from helm.proxy.tokenizers.cohere_tokenizer import CohereTokenizer - - tokenizer = CohereTokenizer(api_key=self.credentials["cohereApiKey"], cache_config=cache_config) - elif organization == "anthropic": - from helm.proxy.tokenizers.anthropic_tokenizer import AnthropicTokenizer - - tokenizer = AnthropicTokenizer(cache_config=cache_config) - elif organization == "simple": - from helm.proxy.tokenizers.simple_tokenizer import SimpleTokenizer - - tokenizer = SimpleTokenizer() - elif organization == "lightningai": - from helm.proxy.tokenizers.lit_gpt_tokenizer import LitGPTTokenizer - - tokenizer = LitGPTTokenizer( - cache_config=cache_config, - checkpoint_dir=Path(os.environ.get("LIT_GPT_CHECKPOINT_DIR", "")), + tokenizer_config.tokenizer_spec, + constant_bindings={"cache_config": cache_config}, + provider_bindings={ + "api_key": lambda: self._provide_api_key(organization), + }, ) - elif organization == "TsinghuaKEG": - from helm.proxy.tokenizers.ice_tokenizer import ICETokenizer - - tokenizer = ICETokenizer(cache_config=cache_config) - elif organization == "Yandex": - from helm.proxy.tokenizers.yalm_tokenizer import YaLMTokenizer - - tokenizer = YaLMTokenizer(cache_config=cache_config) - - if tokenizer is None: - raise ValueError(f"Could not find tokenizer for model: {tokenizer_name}") + tokenizer = create_object(tokenizer_spec) # Cache the tokenizer + assert isinstance(tokenizer, Tokenizer) # To make mypy happy self.tokenizers[tokenizer_name] = tokenizer return tokenizer diff --git a/src/helm/proxy/clients/cohere_client.py b/src/helm/proxy/clients/cohere_client.py index c92fcc4330f..2d626b99150 100644 --- a/src/helm/proxy/clients/cohere_client.py +++ b/src/helm/proxy/clients/cohere_client.py @@ -11,7 +11,7 @@ Sequence, Token, ) -from helm.proxy.models import get_models_by_organization +from 
helm.benchmark.model_deployment_registry import get_model_deployments_by_host_organization from helm.proxy.tokenizers.tokenizer import Tokenizer from .client import CachingClient, truncate_sequence from .cohere_utils import get_cohere_url, DEFAULT_COHERE_API_VERSION @@ -45,7 +45,7 @@ def make_request(self, request: Request) -> RequestResult: assert request.max_tokens > 0, "max_tokens can only be 0 if echo_prompt=True" # model: "Currently available models are small, medium, large, xlarge" - assert request.model in get_models_by_organization("cohere") + assert request.model_deployment in get_model_deployments_by_host_organization("cohere") # temperature: "min value of 0.0, max value of 5.0" assert 0.0 <= request.temperature <= 5.0, f"Invalid temperature: {request.temperature}. Valid range: [0,5]" # num_generations: "min value of 1, max value of 5" diff --git a/src/helm/proxy/clients/huggingface_client.py b/src/helm/proxy/clients/huggingface_client.py index 498a810d608..b4e25987e6d 100644 --- a/src/helm/proxy/clients/huggingface_client.py +++ b/src/helm/proxy/clients/huggingface_client.py @@ -203,9 +203,9 @@ def make_request(self, request: Request) -> RequestResult: if self._pretrained_model_name_or_path: pretrained_model_name_or_path = self._pretrained_model_name_or_path else: - pretrained_model_name_or_path = resolve_alias(request.model) + pretrained_model_name_or_path = resolve_alias(request.model_deployment) huggingface_model: HuggingFaceServer = HuggingFaceServerFactory.get_server( - helm_model_name=request.model, + helm_model_name=request.model_deployment, pretrained_model_name_or_path=pretrained_model_name_or_path, revision=self._revision, ) diff --git a/src/helm/proxy/clients/palmyra_client.py b/src/helm/proxy/clients/palmyra_client.py index 46a17e961ec..550b4b56984 100644 --- a/src/helm/proxy/clients/palmyra_client.py +++ b/src/helm/proxy/clients/palmyra_client.py @@ -100,7 +100,10 @@ def do_it(): return RequestResult(success=False, cached=False, error=error, completions=[], embedding=[]) if _is_content_moderation_failure(response): - hlog(f"WARNING: Returning empty request for {request.model} due to content moderation filter") + hlog( + f"WARNING: Returning empty request for {request.model_deployment} " + "due to content moderation filter" + ) return RequestResult( success=False, cached=False, diff --git a/src/helm/proxy/clients/test_auto_client.py b/src/helm/proxy/clients/test_auto_client.py index 6fffdf35ba0..98c13b6870d 100644 --- a/src/helm/proxy/clients/test_auto_client.py +++ b/src/helm/proxy/clients/test_auto_client.py @@ -27,6 +27,7 @@ def make_request_and_check_result(self, request, expected_result): def test_make_request_databricks(self): request = Request( model="databricks/dolly-v2-3b", + model_deployment="together/dolly-v2-3b", prompt="Elephants are one of the most", temperature=0.0, max_tokens=10, @@ -69,6 +70,7 @@ def test_make_request_databricks(self): ) request = Request( model="databricks/dolly-v2-3b", + model_deployment="together/dolly-v2-3b", prompt="Elephants are one of the most", temperature=0.0, max_tokens=10, diff --git a/src/helm/proxy/clients/test_client.py b/src/helm/proxy/clients/test_client.py index 8ca194de198..256282d835b 100644 --- a/src/helm/proxy/clients/test_client.py +++ b/src/helm/proxy/clients/test_client.py @@ -19,13 +19,31 @@ def truncate_sequence_helper(tokens: List[str], request: Request, expected_token def test_truncate_sequence(): # echo_prompt = True, nothing gets truncated - truncate_sequence_helper(["a", "b", "c"], Request(prompt="abc", 
echo_prompt=True), ["a", "b", "c"]) + truncate_sequence_helper( + ["a", "b", "c"], + Request( + model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", prompt="abc", echo_prompt=True + ), + ["a", "b", "c"], + ) # Nothing gets truncated - truncate_sequence_helper(["hello", " world"], Request(stop_sequences=["#"]), ["hello", " world"]) + truncate_sequence_helper( + ["hello", " world"], + Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", stop_sequences=["#"]), + ["hello", " world"], + ) # Truncate using stop sequences - truncate_sequence_helper(["hello", " world", "\n", "what"], Request(stop_sequences=["\n"]), ["hello", " world"]) + truncate_sequence_helper( + ["hello", " world", "\n", "what"], + Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", stop_sequences=["\n"]), + ["hello", " world"], + ) # Truncate using max tokens - truncate_sequence_helper(["a", "b", "c"], Request(max_tokens=2), ["a", "b"]) + truncate_sequence_helper( + ["a", "b", "c"], + Request(model="openai/text-davinci-002", model_deployment="openai/text-davinci-002", max_tokens=2), + ["a", "b"], + ) diff --git a/src/helm/proxy/clients/test_huggingface_client.py b/src/helm/proxy/clients/test_huggingface_client.py index f5c59f2d8f3..09efeca3b27 100644 --- a/src/helm/proxy/clients/test_huggingface_client.py +++ b/src/helm/proxy/clients/test_huggingface_client.py @@ -29,30 +29,30 @@ def teardown_method(self, method): def test_tokenize(self): request = TokenizationRequest(text="I am a computer scientist.") - result: TokenizationRequestResult = self.client.tokenize(request) + result: TokenizationRequestResult = self.client.tokenizer.tokenize(request) assert not result.cached, "First time making the tokenize request. Result should not be cached" - result: TokenizationRequestResult = self.client.tokenize(request) + result: TokenizationRequestResult = self.client.tokenizer.tokenize(request) assert result.cached, "Result should be cached" assert result.raw_tokens == ["I", " am", " a", " computer", " scientist", "."] def test_encode(self): request = TokenizationRequest(text="I am a computer scientist.", encode=True, truncation=True, max_length=1) - result: TokenizationRequestResult = self.client.tokenize(request) + result: TokenizationRequestResult = self.client.tokenizer.tokenize(request) assert not result.cached, "First time making the tokenize request. Result should not be cached" - result: TokenizationRequestResult = self.client.tokenize(request) + result: TokenizationRequestResult = self.client.tokenizer.tokenize(request) assert result.cached, "Result should be cached" assert result.raw_tokens == [40] request = TokenizationRequest(text="I am a computer scientist.", encode=True, truncation=True, max_length=1024) - result = self.client.tokenize(request) + result = self.client.tokenizer.tokenize(request) assert not result.cached, "First time making this particular request. Result should not be cached" assert result.raw_tokens == [40, 716, 257, 3644, 11444, 13] def test_decode(self): request = DecodeRequest(tokens=[40, 716, 257, 3644, 11444, 13]) - result: DecodeRequestResult = self.client.decode(request) + result: DecodeRequestResult = self.client.tokenizer.decode(request) assert not result.cached, "First time making the decode request. 
Result should not be cached" - result: DecodeRequestResult = self.client.decode(request) + result: DecodeRequestResult = self.client.tokenizer.decode(request) assert result.cached, "Result should be cached" assert result.text == "I am a computer scientist." @@ -60,7 +60,8 @@ def test_gpt2(self): prompt: str = "I am a computer scientist." result: RequestResult = self.client.make_request( Request( - model="huggingface/gpt2", + model="openai/gpt2", + model_deployment="huggingface/gpt2", prompt=prompt, num_completions=3, top_k_per_token=5, @@ -77,7 +78,8 @@ def test_gpt2(self): def test_gptj_6b(self): result: RequestResult = self.client.make_request( Request( - model="huggingface/gpt-j-6b", + model="eleutherai/gpt-j-6b", + model_deployment="huggingface/gpt-j-6b", prompt="I am a computer scientist.", num_completions=3, top_k_per_token=5, diff --git a/src/helm/proxy/clients/test_together_client.py b/src/helm/proxy/clients/test_together_client.py index 59eebab9b9d..312fed545e0 100644 --- a/src/helm/proxy/clients/test_together_client.py +++ b/src/helm/proxy/clients/test_together_client.py @@ -27,6 +27,7 @@ def teardown_method(self, method): ( Request( model="together/redpajama-incite-base-3b-v1", + model_deployment="together/redpajama-incite-base-3b-v1", ), { "best_of": 1, @@ -45,6 +46,7 @@ def teardown_method(self, method): ( Request( model="meta/llama-7b", + model_deployment="together/llama-7b", prompt="I am a computer scientist.", temperature=0, num_completions=4, @@ -71,6 +73,7 @@ def teardown_method(self, method): ( Request( model="stanford/alpaca-7b", + model_deployment="together/alpaca-7b", stop_sequences=["\n"], ), { @@ -95,4 +98,4 @@ def test_convert_to_raw_request(self, test_input, expected): def test_api_key_error(self): with pytest.raises(TogetherClientError): - self.client.make_request(Request(model="together/bloom")) + self.client.make_request(Request(model="bigscience/bloom", model_deployment="together/bloom")) diff --git a/src/helm/proxy/clients/together_client.py b/src/helm/proxy/clients/together_client.py index ad3365af5a9..24cb5b6a387 100644 --- a/src/helm/proxy/clients/together_client.py +++ b/src/helm/proxy/clients/together_client.py @@ -16,6 +16,10 @@ "h3-2.7b": "h3-2.7b-h3", "opt-1.3b": "opt-1.3b-ft-tp1", "opt-6.7b": "opt-6.7b-ft-tp1", + "mpt-7b": "togethercomputer/mpt-7b", + "mpt-instruct-7b": "togethercomputer/mpt-7b-instruct", + "stablelm-base-alpha-3b": "stabilityai/stablelm-base-alpha-3b", + "stablelm-base-alpha-7b": "stabilityai/stablelm-base-alpha-7b", # Production models "redpajama-incite-base-3b-v1": "togethercomputer/RedPajama-INCITE-Base-3B-v1", "redpajama-incite-instruct-3b-v1": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1", @@ -29,6 +33,8 @@ "falcon-7b-instruct": "togethercomputer/falcon-7b-instruct", "falcon-40b": "togethercomputer/falcon-40b", "falcon-40b-instruct": "togethercomputer/falcon-40b-instruct", + "gpt-jt-6b-v1": "togethercomputer/GPT-JT-6B-v1", + "gpt-neoxt-chat-base-20b": "togethercomputer/GPT-NeoXT-Chat-Base-20B", "llama-7b": "huggyllama/llama-7b", "llama-13b": "huggyllama/llama-13b", "llama-30b": "huggyllama/llama-30b", @@ -37,16 +43,12 @@ "llama-2-13b": "togethercomputer/llama-2-13b", "llama-2-70b": "togethercomputer/llama-2-70b", "mistral-7b-v0.1": "mistralai/Mistral-7B-v0.1", - "mpt-7b": "togethercomputer/mpt-7b", - "mpt-instruct-7b": "togethercomputer/mpt-7b-instruct", "mpt-30b": "togethercomputer/mpt-30b", "mpt-instruct-30b": "togethercomputer/mpt-30b-instruct", "pythia-1b-v0": "EleutherAI/pythia-1b-v0", "pythia-2.8b-v0": 
"EleutherAI/pythia-2.8b-v0", "pythia-6.9b": "EleutherAI/pythia-6.9b", "pythia-12b-v0": "EleutherAI/pythia-12b-v0", - "stablelm-base-alpha-3b": "stabilityai/stablelm-base-alpha-3b", - "stablelm-base-alpha-7b": "stabilityai/stablelm-base-alpha-7b", "vicuna-7b-v1.3": "lmsys/vicuna-7b-v1.3", "vicuna-13b-v1.3": "lmsys/vicuna-13b-v1.3", } @@ -55,7 +57,7 @@ HELM users use a shorter model name (e.g. together/flan-t5-xxl) whereas the Together client sends and caches requests using a longer model name that is suffixed with the implementation framework -(e.g. flan-t5-xxl-hf). This allows trackcing exactly which +(e.g. flan-t5-xxl-hf). This allows tracking exactly which implementation was used in the cached results, since some results may be different depending on the implementation (e.g. efficiency metrics). This also allows future migration of results in the case of changes of diff --git a/src/helm/proxy/clients/vision_language/idefics_client.py b/src/helm/proxy/clients/vision_language/idefics_client.py index 90d290667cd..38aa6d93ab8 100644 --- a/src/helm/proxy/clients/vision_language/idefics_client.py +++ b/src/helm/proxy/clients/vision_language/idefics_client.py @@ -78,10 +78,10 @@ def _get_model(self, checkpoint: str) -> LoadedIDEFICSModelProcessor: return loaded_model_processor def make_request(self, request: Request) -> RequestResult: - assert request.model in _models, f"Not a valid model for this client: {request.model}" + assert request.model_deployment in _models, f"Not a valid model for this client: {request.model_deployment}" assert request.multimodal_prompt is not None, "Multimodal prompt is required" - loaded_model_processor: LoadedIDEFICSModelProcessor = self._get_model(request.model) + loaded_model_processor: LoadedIDEFICSModelProcessor = self._get_model(request.model_deployment) model = loaded_model_processor.model processor = loaded_model_processor.processor diff --git a/src/helm/proxy/critique/model_critique_client.py b/src/helm/proxy/critique/model_critique_client.py index 7c4caaeca65..f26b79897da 100644 --- a/src/helm/proxy/critique/model_critique_client.py +++ b/src/helm/proxy/critique/model_critique_client.py @@ -2,6 +2,7 @@ import string import dataclasses +from helm.benchmark.run_specs import get_default_model_deployment_for_model from helm.common.critique_request import ( CritiqueRequest, CritiqueRequestResult, @@ -26,6 +27,10 @@ class ModelCritiqueClient(CritiqueClient): def __init__(self, client: Client, model_name): self._client = client self._model_name = model_name + self._model_deployment_name = ( + get_default_model_deployment_for_model(model_name, warn_arg_deprecated=False, ignore_deprecated=True) + or self._model_name + ) def _interpolate_fields(self, text: str, fields: Dict[str, str]) -> str: for key, value in fields.items(): @@ -75,6 +80,7 @@ def _task_to_requests(self, task: CritiqueTaskTemplate, fields: Dict[str, str]) request = Request( model=self._model_name, + model_deployment=self._model_deployment_name, prompt=prompt, max_tokens=max_tokens, echo_prompt=False, diff --git a/src/helm/proxy/example_queries.py b/src/helm/proxy/example_queries.py index 10009f8c249..bad6b6fc39b 100644 --- a/src/helm/proxy/example_queries.py +++ b/src/helm/proxy/example_queries.py @@ -63,13 +63,13 @@ def dedent(text: str) -> str: """ temperature: 0 stop_sequences: [.] 
- model: ${model} # Try out multiple models + model_deployment: ${model_deployment} # Try out multiple models """ ), environments=dedent( """ occupation: [mathematician, lawyer, doctor] - model: [openai/davinci, ai21/j1-jumbo] + model_deployment: [openai/davinci, ai21/j1-jumbo] """ ), ), @@ -88,12 +88,12 @@ def dedent(text: str) -> str: temperature: 0.5 stop_sequences: [\\n] num_completions: 5 - model: ${model} # Try out GPT-3 and Jurassic + model_deployment: ${model_deployment} # Try out GPT-3 and Jurassic """ ), environments=dedent( """ - model: [openai/davinci, ai21/j1-jumbo] + model_deployment: [openai/davinci, ai21/j1-jumbo] """ ), ), @@ -122,12 +122,12 @@ def dedent(text: str) -> str: temperature: 0 max_tokens: 1 top_k_per_token: 4 - model: ${model} # Try out GPT-3 and Jurassic + model_deployment: ${model_deployment} # Try out GPT-3 and Jurassic """ ), environments=dedent( """ - model: [openai/davinci, ai21/j1-jumbo] + model_deployment: [openai/davinci, ai21/j1-jumbo] """ ), ), @@ -135,7 +135,7 @@ def dedent(text: str) -> str: prompt="Takes two vectors a and b and returns their Euclidean distance", settings=dedent( """ - model: openai/code-davinci-001 # Codex for code generation + model_deployment: openai/code-davinci-001 # Codex for code generation """ ), environments="", @@ -144,14 +144,14 @@ def dedent(text: str) -> str: prompt="The quick brown fox", settings=dedent( """ - model: ${model} + model_deployment: ${model_deployment} temperature: 0.3 stop_sequences: [\\n] """ ), environments=dedent( """ - model: [ + model_deployment: [ "openai/davinci", "openai/text-davinci-002", "openai/text-davinci-003", "ai21/j1-grande-v2-beta", "together/gpt-j-6b", "together/gpt-jt-6b-v1", diff --git a/src/helm/proxy/models.py b/src/helm/proxy/models.py deleted file mode 100644 index 262b6ffcffc..00000000000 --- a/src/helm/proxy/models.py +++ /dev/null @@ -1,951 +0,0 @@ -from dataclasses import dataclass, field -from typing import Dict, List - -# Different modalities -TEXT_MODEL_TAG: str = "text" -IMAGE_MODEL_TAG: str = "image" -CODE_MODEL_TAG: str = "code" -EMBEDDING_MODEL_TAG: str = "embedding" - -# Some model APIs have limited functionalities -FULL_FUNCTIONALITY_TEXT_MODEL_TAG: str = "full_functionality_text" -LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG: str = "limited_functionality_text" - -# ChatML format -CHATML_MODEL_TAG: str = "chatml" - -# OpenAI Chat format -OPENAI_CHATGPT_MODEL_TAG: str = "openai_chatgpt" - -# For Anthropic models -ANTHROPIC_CLAUDE_1_MODEL_TAG: str = "claude_1" -ANTHROPIC_CLAUDE_2_MODEL_TAG: str = "claude_2" - -# For OpenAI models with wider context windows -# TODO(#1455): Simplify context window tags. -WIDER_CONTEXT_WINDOW_TAG: str = "openai_wider_context_window" # huggingface/gpt2 tokenizer, 4000 tokens -GPT_TURBO_CONTEXT_WINDOW_TAG: str = "gpt_turbo_context_window" # cl100k_base tokenizer, 4000 tokens -GPT_TURBO_16K_CONTEXT_WINDOW_TAG: str = "gpt_turbo_16k_context_window" # cl100k_base tokenizer, 8000 tokens -GPT4_CONTEXT_WINDOW_TAG: str = "gpt4_context_window" # cl100k_base tokenizer, 8192 tokens -GPT4_32K_CONTEXT_WINDOW_TAG: str = "gpt4_32k_context_window" # cl100k_base tokenizer, 32768 tokens - -# For AI21 Jurassic-2 models with wider context windows -AI21_WIDER_CONTEXT_WINDOW_TAG: str = "ai21_wider_context_window" - -# For AI21 Jurassic-2 Jumbo -# AI21 has recommended using a sequence length of 6000 tokens to avoid OOMs. 
-AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG: str = "ai21_jurassic_2_jumbo_context_window" # 6000 - -# To fetch models that use these tokenizers -GPT2_TOKENIZER_TAG: str = "gpt2_tokenizer" -AI21_TOKENIZER_TAG: str = "ai21_tokenizer" -COHERE_TOKENIZER_TAG: str = "cohere_tokenizer" -OPT_TOKENIZER_TAG: str = "opt_tokenizer" -GPTJ_TOKENIZER_TAG: str = "gptj_tokenizer" -GPT4_TOKENIZER_TAG: str = "gpt4_tokenizer" -GPTNEO_TOKENIZER_TAG: str = "gptneo_tokenizer" - -# Models which emit garbage tokens when temperature=0. -BUGGY_TEMP_0_TAG: str = "buggy_temp_0" - -# Models that are used for ablations and fine-grained analyses. -# These models are selected specifically because of their low marginal cost to evaluate. -ABLATION_MODEL_TAG: str = "ablation" - -# Some models (e.g., T5) have stripped newlines. -# So we cannot use \n as a stop sequence for these models. -NO_NEWLINES_TAG: str = "no_newlines" - -# Some models (e.g., UL2) require a prefix (e.g., [NLG]) in the -# prompts to indicate the mode before doing inference. -NLG_PREFIX_TAG: str = "nlg_prefix_tag" - -# Some models can follow instructions. -INSTRUCTION_FOLLOWING_MODEL_TAG: str = "instruction_following" - -# For Vision-langauge models (VLMs) -VISION_LANGUAGE_MODEL_TAG: str = "vision_language" - - -@dataclass -class Model: - """ - Represents a model that we can make requests to. Conceptually, an instance - of `Model` is tied more to the hosting implementation (where can we send - requests) rather than the conceptual model. These are the same for closed - models, but different for open-source models. Note: for all the metadata - and documentation about the model itself, see `ModelField` in `schema.py`. - """ - - # Model group, used to determine quotas (e.g. "huggingface"). - # This group is only for user accounts, not benchmarking, and should probably - # called something else. - group: str - - # Name of the specific model (e.g. "huggingface/gpt-j-6b") - # The name is / or - # / - # There is also `` (see `ModelField`). - name: str - - # Tags corresponding to the properties of the model. - tags: List[str] = field(default_factory=list) - - @property - def organization(self) -> str: - """ - Extracts the organization from the model name. - Example: 'ai21/j1-jumbo' => 'ai21' - """ - return self.name.split("/")[0] - - @property - def engine(self) -> str: - """ - Extracts the model engine from the model name. - Example: 'ai21/j1-jumbo' => 'j1-jumbo' - """ - return self.name.split("/")[1] - - -# For the list of available models, see the following docs: -# Note that schema.yaml has much of this information now. -# Over time, we should add more information there. - -ALL_MODELS = [ - # Local Model - Model( - group="neurips", - name="neurips/local", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - # AI21: https://studio.ai21.com/pricing - Model( - group="jurassic", - name="ai21/j1-jumbo", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG], - ), - # From AI21: "the new model is a mid-point in terms of size, cost and performance between Jumbo and Large. - # We also implemented a few tweaks to its training process. Internal benchmarks suggest it can really - # help the unit economics on your end compared to Jumbo, without compromising too much on quality." 
- Model( - group="jurassic", - name="ai21/j1-grande", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG], - ), - Model( - group="jurassic", - name="ai21/j1-grande-v2-beta", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG], - ), - Model( - group="jurassic", - name="ai21/j1-large", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG], - ), - # AI21 Jurassic-2 Models: https://www.ai21.com/blog/introducing-j2 - Model( - group="jurassic", - name="ai21/j2-jumbo", - tags=[ - TEXT_MODEL_TAG, - AI21_JURASSIC_2_JUMBO_CONTEXT_WINDOW_TAG, - FULL_FUNCTIONALITY_TEXT_MODEL_TAG, - AI21_TOKENIZER_TAG, - ], - ), - Model( - group="jurassic", - name="ai21/j2-grande", - tags=[TEXT_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG], - ), - Model( - group="jurassic", - name="ai21/j2-large", - tags=[TEXT_MODEL_TAG, AI21_WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, AI21_TOKENIZER_TAG], - ), - # Aleph Alpha's Luminous models: https://docs.aleph-alpha.com/docs/introduction/luminous - Model( - group="luminous", - name="AlephAlpha/luminous-base", - # Does not support echo - tags=[TEXT_MODEL_TAG, IMAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="luminous", - name="AlephAlpha/luminous-extended", - # Does not support echo - tags=[TEXT_MODEL_TAG, IMAGE_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="luminous", - name="AlephAlpha/luminous-supreme", - # Does not support echo. - # TODO: images will be supported in the near future. Add IMAGE_MODEL_TAG. - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # TODO: coming soon. Uncomment out the following when Luminous World is released. - # Model( - # group="luminous", - # name="AlephAlpha/luminous-world", - # tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - # ), - # Anthropic - Model( - group="anthropic", - name="anthropic/stanford-online-all-v4-s3", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG, ABLATION_MODEL_TAG], - ), - Model( - group="anthropic", - name="anthropic/claude-2.0", - tags=[ - ANTHROPIC_CLAUDE_2_MODEL_TAG, - TEXT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - GPT2_TOKENIZER_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="anthropic", - name="anthropic/claude-v1.3", - tags=[ - ANTHROPIC_CLAUDE_1_MODEL_TAG, - TEXT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - GPT2_TOKENIZER_TAG, - ABLATION_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="anthropic", - name="anthropic/claude-instant-v1", - tags=[ - ANTHROPIC_CLAUDE_1_MODEL_TAG, - TEXT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - GPT2_TOKENIZER_TAG, - ABLATION_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - # BigScience - Model( - group="together", - name="together/bloom", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG], - ), - Model( - group="together", - name="together/t0pp", - # Does not support echo=True - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG], - ), - # Cohere models - # Model versioning and the possible versions are not documented here: - # https://docs.cohere.ai/generate-reference#model-optional. - # So, instead, we got the names of the models from the Cohere Playground. 
- # - # Note that their tokenizer and model were trained on English text and - # they do not have a dedicated decode API endpoint, so the adaptation - # step for language modeling fails for certain Scenarios: - # the_pile:subset=ArXiv - # the_pile:subset=Github - # the_pile:subset=PubMed Central - Model( - group="cohere", - name="cohere/xlarge-20220609", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG], - ), - Model( - group="cohere", - name="cohere/xlarge-20221108", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG], - ), - Model( - group="cohere", - name="cohere/large-20220720", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG], - ), - Model( - group="cohere", - name="cohere/medium-20220720", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG], - ), - Model( - group="cohere", - name="cohere/medium-20221108", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG], - ), - Model( - group="cohere", - name="cohere/small-20220720", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG], - ), - Model( - group="cohere", - name="cohere/command-medium-beta", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG], - ), - Model( - group="cohere", - name="cohere/command-xlarge-beta", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, COHERE_TOKENIZER_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG], - ), - # EleutherAI - Model( - group="together", - name="together/gpt-j-6b", - tags=[ - TEXT_MODEL_TAG, - FULL_FUNCTIONALITY_TEXT_MODEL_TAG, - ABLATION_MODEL_TAG, - GPTJ_TOKENIZER_TAG, - BUGGY_TEMP_0_TAG, - ], - ), - Model( - group="together", - name="together/gpt-neox-20b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, GPTNEO_TOKENIZER_TAG], - ), - Model( - group="together", - name="eleutherai/pythia-1b-v0", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="eleutherai/pythia-2.8b-v0", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="eleutherai/pythia-6.9b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="eleutherai/pythia-12b-v0", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # Meta - Model( - group="together", - name="meta/llama-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="meta/llama-13b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="meta/llama-30b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="meta/llama-65b", - # TODO(#1828): Upgrade to FULL_FUNCTIONALITY_TEXT_MODEL_TAG - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="meta/llama-2-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="meta/llama-2-13b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="meta/llama-2-70b", - # TODO(#1828): Upgrade to FULL_FUNCTIONALITY_TEXT_MODEL_TAG - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # Stanford - Model( - group="together", - name="stanford/alpaca-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, 
INSTRUCTION_FOLLOWING_MODEL_TAG], - ), - # LMSYS - Model( - group="together", - name="lmsys/vicuna-7b-v1.3", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG], - ), - Model( - group="together", - name="lmsys/vicuna-13b-v1.3", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG], - ), - # Mistral AI - Model( - group="mistralai", - name="mistralai/mistral-7b-v0.1", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, INSTRUCTION_FOLLOWING_MODEL_TAG], - ), - # MosaicML - Model( - group="together", - name="mosaicml/mpt-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="mosaicml/mpt-instruct-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="mosaicml/mpt-30b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="mosaicml/mpt-instruct-30b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # TII UAE - Model( - group="together", - name="tiiuae/falcon-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="tiiuae/falcon-7b-instruct", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="tiiuae/falcon-40b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="tiiuae/falcon-40b-instruct", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # GooseAI supported models - Model( - group="gooseai", - name="gooseai/gpt-neo-20b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTNEO_TOKENIZER_TAG], - ), - Model( - group="gooseai", - name="gooseai/gpt-j-6b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG], - ), - # HuggingFace - Model( - group="huggingface", - name="huggingface/gpt2", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="huggingface", - name="huggingface/gpt-j-6b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG], - ), - Model( - group="huggingface", - name="huggingface/santacoder", - tags=[CODE_MODEL_TAG], - ), - Model( - group="huggingface", - name="huggingface/starcoder", - tags=[CODE_MODEL_TAG], - ), - # Google - Model( - group="together", - name="together/t5-11b", - # Does not support echo=True - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG], - ), - Model( - group="together", - name="together/flan-t5-xxl", - # Does not support echo=True - tags=[ - TEXT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - ABLATION_MODEL_TAG, - NO_NEWLINES_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="together", - name="together/ul2", - # Does not support echo=True - tags=[ - TEXT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - ABLATION_MODEL_TAG, - NO_NEWLINES_TAG, - NLG_PREFIX_TAG, - ], - ), - # H3 model - Model( - group="together", - name="together/h3-2.7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - # OPT - Model( - group="together", - name="together/opt-175b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, OPT_TOKENIZER_TAG], - ), - Model( - group="together", - name="together/opt-66b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, OPT_TOKENIZER_TAG], - ), - Model( - 
group="together", - name="together/opt-6.7b", - tags=[ - TEXT_MODEL_TAG, - FULL_FUNCTIONALITY_TEXT_MODEL_TAG, - ABLATION_MODEL_TAG, - OPT_TOKENIZER_TAG, - BUGGY_TEMP_0_TAG, - ], - ), - Model( - group="together", - name="together/opt-1.3b", - tags=[ - TEXT_MODEL_TAG, - FULL_FUNCTIONALITY_TEXT_MODEL_TAG, - ABLATION_MODEL_TAG, - OPT_TOKENIZER_TAG, - BUGGY_TEMP_0_TAG, - ], - ), - # Microsoft/NVIDIA - Model( - group="microsoft", - name="microsoft/TNLGv2_530B", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="microsoft", - name="microsoft/TNLGv2_7B", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - # OpenAI: https://beta.openai.com/docs/engines/gpt-3 - Model( - group="gpt3", - name="openai/davinci", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="gpt3", - name="openai/curie", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="gpt3", - name="openai/babbage", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="gpt3", - name="openai/ada", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - # TODO: text-davinci-002 supports insertion. Support insertion in our framework. - # https://github.com/stanford-crfm/benchmarking/issues/359 - Model( - group="gpt3", - name="openai/text-davinci-003", - tags=[ - TEXT_MODEL_TAG, - WIDER_CONTEXT_WINDOW_TAG, - FULL_FUNCTIONALITY_TEXT_MODEL_TAG, - GPT2_TOKENIZER_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="gpt3", - name="openai/text-davinci-002", - tags=[TEXT_MODEL_TAG, WIDER_CONTEXT_WINDOW_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="gpt3", - name="openai/text-davinci-001", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="gpt3", - name="openai/text-curie-001", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="gpt3", - name="openai/text-babbage-001", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="gpt3", - name="openai/text-ada-001", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="codex", - name="openai/code-davinci-002", - tags=[CODE_MODEL_TAG, WIDER_CONTEXT_WINDOW_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="codex", - name="openai/code-davinci-001", - tags=[CODE_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - Model( - group="codex", - name="openai/code-cushman-001", - tags=[CODE_MODEL_TAG, GPT2_TOKENIZER_TAG], - ), - # GPT-4 - Model( - group="gpt4", - name="openai/gpt-4-0314", - tags=[ - TEXT_MODEL_TAG, - GPT4_CONTEXT_WINDOW_TAG, - GPT4_TOKENIZER_TAG, - OPENAI_CHATGPT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="gpt4", - name="openai/gpt-4-32k-0314", - tags=[ - TEXT_MODEL_TAG, - GPT4_32K_CONTEXT_WINDOW_TAG, - GPT4_TOKENIZER_TAG, - OPENAI_CHATGPT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="gpt4", - name="openai/gpt-4-0613", - tags=[ - TEXT_MODEL_TAG, - GPT4_CONTEXT_WINDOW_TAG, - GPT4_TOKENIZER_TAG, - OPENAI_CHATGPT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="gpt4", - name="openai/gpt-4-32k-0613", - tags=[ - 
TEXT_MODEL_TAG, - GPT4_32K_CONTEXT_WINDOW_TAG, - GPT4_TOKENIZER_TAG, - OPENAI_CHATGPT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - # ChatGPT: https://openai.com/blog/chatgpt - Model( - group="gpt3", - name="openai/gpt-3.5-turbo-0301", - # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable - # sequence length is smaller at 4087 with one user input message and one assistant - # output message because ChatGPT uses special tokens for message roles and boundaries. - # We use a rounded-down sequence length of 4000 to account for these special tokens. - tags=[ - TEXT_MODEL_TAG, - GPT_TURBO_CONTEXT_WINDOW_TAG, - GPT4_TOKENIZER_TAG, - OPENAI_CHATGPT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="gpt3", - name="openai/gpt-3.5-turbo-0613", - # The claimed sequence length is 4096, but as of 2023-03-07, the empirical usable - # sequence length is smaller at 4087 with one user input message and one assistant - # output message because ChatGPT uses special tokens for message roles and boundaries. - # We use a rounded-down sequence length of 4000 to account for these special tokens. - tags=[ - TEXT_MODEL_TAG, - GPT_TURBO_CONTEXT_WINDOW_TAG, - GPT4_TOKENIZER_TAG, - OPENAI_CHATGPT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - Model( - group="gpt3", - name="openai/gpt-3.5-turbo-16k-0613", - # Claimed length is 16,384; we round down to 16,000 for the same reasons as explained - # in the openai/gpt-3.5-turbo-0613 comment - tags=[ - TEXT_MODEL_TAG, - GPT_TURBO_16K_CONTEXT_WINDOW_TAG, - GPT4_TOKENIZER_TAG, - OPENAI_CHATGPT_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - ], - ), - # OpenAI similarity embedding models: https://beta.openai.com/docs/guides/embeddings - Model( - group="gpt3", - name="openai/text-similarity-davinci-001", - tags=[EMBEDDING_MODEL_TAG], - ), - Model( - group="gpt3", - name="openai/text-similarity-curie-001", - tags=[EMBEDDING_MODEL_TAG], - ), - Model( - group="gpt3", - name="openai/text-similarity-babbage-001", - tags=[EMBEDDING_MODEL_TAG], - ), - Model( - group="gpt3", - name="openai/text-similarity-ada-001", - tags=[EMBEDDING_MODEL_TAG], - ), - Model( - group="gpt3", - name="openai/text-embedding-ada-002", - tags=[EMBEDDING_MODEL_TAG], - ), - # Together - Model( - group="together", - name="together/gpt-jt-6b-v1", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPTJ_TOKENIZER_TAG], - ), - Model( - group="together", - name="together/gpt-neoxt-chat-base-20b", - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, CHATML_MODEL_TAG, GPTNEO_TOKENIZER_TAG], - ), - Model( - group="together", - name="together/redpajama-incite-base-3b-v1", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="together/redpajama-incite-instruct-3b-v1", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="together/redpajama-incite-base-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="together/redpajama-incite-instruct-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # Tsinghua - Model( - group="together", - name="together/glm", - # Inference with echo=True is not feasible -- in the prompt encoding phase, they use - # bidirectional attention and do not perform predictions on them. 
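# The context-window comments removed above carry a small piece of arithmetic:
# the advertised window (4096 for gpt-3.5-turbo, 16384 for the 16k variant) is
# not fully usable once chat-role and boundary special tokens are counted, so
# the registry rounds down to 4000 and 16000. A minimal sketch of that prompt
# budgeting, using a hypothetical helper that is not part of HELM:

def prompt_token_budget(rounded_window: int, max_output_tokens: int) -> int:
    """How many prompt tokens fit once room is reserved for the completion."""
    # The rounded-down window already absorbs the special-token overhead.
    return max(0, rounded_window - max_output_tokens)

# Example: with the 4000-token rounded window and 500 output tokens requested,
# at most 3500 prompt tokens should be sent.
assert prompt_token_budget(4000, 500) == 3500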
- tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG, NO_NEWLINES_TAG], - ), - # Writer - Model( - group="palmyra", - name="writer/palmyra-base", - # Does not support echo - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="palmyra", - name="writer/palmyra-large", - # Does not support echo - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="palmyra", - name="writer/palmyra-instruct-30", - # Does not support echo - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="palmyra", - name="writer/palmyra-e", - # Does not support echo - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="palmyra", - name="writer/silk-road", - # Does not support echo - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="palmyra", - name="writer/palmyra-x", - # Does not support echo - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # Yandex - Model( - group="together", - name="together/yalm", - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, ABLATION_MODEL_TAG], - ), - # Google - Model( - group="google", - name="google/palm", - tags=[TEXT_MODEL_TAG, LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # NVIDIA - Model( - group="nvidia", - name="nvidia/megatron-gpt2", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG, GPT2_TOKENIZER_TAG, BUGGY_TEMP_0_TAG], - ), - # Databricks - Model( - group="together", - name="databricks/dolly-v2-3b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="databricks/dolly-v2-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="databricks/dolly-v2-12b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - # Stability AI - Model( - group="together", - name="stabilityai/stablelm-base-alpha-3b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="together", - name="stabilityai/stablelm-base-alpha-7b", - tags=[TEXT_MODEL_TAG, FULL_FUNCTIONALITY_TEXT_MODEL_TAG], - ), - Model( - group="lightningai", - name="lightningai/lit-gpt", - tags=[ - TEXT_MODEL_TAG, - INSTRUCTION_FOLLOWING_MODEL_TAG, - LIMITED_FUNCTIONALITY_TEXT_MODEL_TAG, - GPT2_TOKENIZER_TAG, - ], - ), - # Vision-language models (VLMs) - Model( - group="idefics", - name="HuggingFaceM4/idefics-9b", - tags=[VISION_LANGUAGE_MODEL_TAG], - ), - Model( - group="idefics", - name="HuggingFaceM4/idefics-9b-instruct", - tags=[VISION_LANGUAGE_MODEL_TAG], - ), - Model( - group="idefics", - name="HuggingFaceM4/idefics-80b", - tags=[VISION_LANGUAGE_MODEL_TAG], - ), - Model( - group="idefics", - name="HuggingFaceM4/idefics-80b-instruct", - tags=[VISION_LANGUAGE_MODEL_TAG], - ), - # For debugging - Model( - group="simple", - name="simple/model1", - ), -] - -MODEL_NAME_TO_MODEL: Dict[str, Model] = {model.name: model for model in ALL_MODELS} - - -def get_model(model_name: str) -> Model: - """Get the `Model` given the name.""" - if model_name not in MODEL_NAME_TO_MODEL: - raise ValueError(f"No model with name: {model_name}") - - return MODEL_NAME_TO_MODEL[model_name] - - -def get_model_group(model_name: str) -> str: - """Get the model's group given the name.""" - model: Model = get_model(model_name) - return model.group - - -def get_all_models() -> List[str]: - """Get all model names.""" - return list(MODEL_NAME_TO_MODEL.keys()) - - -def get_models_by_organization(organization: str) -> 
List[str]: - """ - Gets models by organization e.g., ai21 => ai21/j1-jumbo, ai21/j1-grande, ai21-large. - """ - return [model.name for model in ALL_MODELS if model.organization == organization] - - -def get_model_names_with_tag(tag: str) -> List[str]: - """Get all the name of the models with tag `tag`.""" - return [model.name for model in ALL_MODELS if tag in model.tags] - - -def get_all_text_models() -> List[str]: - """Get all text model names.""" - return get_model_names_with_tag(TEXT_MODEL_TAG) - - -def get_all_code_models() -> List[str]: - """Get all code model names.""" - return get_model_names_with_tag(CODE_MODEL_TAG) - - -def get_all_instruction_following_models() -> List[str]: - """Get all instruction-following model names.""" - return get_model_names_with_tag(INSTRUCTION_FOLLOWING_MODEL_TAG) diff --git a/src/helm/proxy/services/server_service.py b/src/helm/proxy/services/server_service.py index 1361859767e..24f891fac2c 100644 --- a/src/helm/proxy/services/server_service.py +++ b/src/helm/proxy/services/server_service.py @@ -2,9 +2,6 @@ import signal from typing import List, Optional -from helm.benchmark.model_metadata_registry import maybe_register_model_metadata_from_base_path -from helm.benchmark.model_deployment_registry import maybe_register_model_deployments_from_base_path -from helm.benchmark.tokenizer_config_registry import maybe_register_tokenizer_configs_from_base_path from helm.common.critique_request import CritiqueRequest, CritiqueRequestResult from helm.common.authentication import Authentication from helm.common.general import ensure_directory_exists, parse_hocon, get_credentials @@ -22,7 +19,8 @@ from helm.proxy.clients.auto_client import AutoClient from helm.proxy.clients.toxicity_classifier_client import ToxicityClassifierClient from helm.proxy.example_queries import example_queries -from helm.proxy.models import ALL_MODELS, get_model_group +from helm.benchmark.model_metadata_registry import ALL_MODELS_METADATA +from helm.benchmark.model_deployment_registry import get_model_deployment_host_organization from helm.proxy.query import Query, QueryResult from helm.proxy.retry import retry_request from helm.proxy.token_counters.auto_token_counter import AutoTokenCounter @@ -48,10 +46,6 @@ def __init__(self, base_path: str = "prod_env", root_mode=False, mongo_uri: str ensure_directory_exists(cache_path) accounts_path = os.path.join(base_path, ACCOUNTS_FILE) - maybe_register_model_metadata_from_base_path(base_path) - maybe_register_model_deployments_from_base_path(base_path) - maybe_register_tokenizer_configs_from_base_path(base_path) - self.client = AutoClient(credentials, cache_path, mongo_uri) self.token_counter = AutoTokenCounter(self.client.get_huggingface_client()) self.accounts = Accounts(accounts_path, root_mode=root_mode) @@ -59,7 +53,7 @@ def __init__(self, base_path: str = "prod_env", root_mode=False, mongo_uri: str self.toxicity_classifier_client: Optional[ToxicityClassifierClient] = None def get_general_info(self) -> GeneralInfo: - return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=ALL_MODELS) + return GeneralInfo(version=VERSION, example_queries=example_queries, all_models=ALL_MODELS_METADATA) def get_window_service_info(self, model_name) -> WindowServiceInfo: # The import statement is placed here to avoid two problems, please refer to the link for details @@ -95,9 +89,9 @@ def make_request(self, auth: Authentication, request: Request) -> RequestResult: # https://github.com/stanford-crfm/benchmarking/issues/56 
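# The hunk that continues below swaps get_model_group(request.model) for
# get_model_deployment_host_organization(request.model_deployment): quota
# checks and usage accounting are now keyed by the organization that hosts the
# deployment rather than by the old model "group". A self-contained sketch of
# that flow with stand-in classes (not the real HELM Accounts/AutoClient):

from dataclasses import dataclass, field
from typing import Dict


@dataclass
class FakeAccounts:
    used: Dict[str, int] = field(default_factory=dict)

    def check_can_use(self, api_key: str, host_organization: str) -> None:
        pass  # the real Accounts raises if the key has no quota for that host

    def use(self, api_key: str, host_organization: str, count: int) -> None:
        self.used[host_organization] = self.used.get(host_organization, 0) + count


def fake_host_organization(model_deployment: str) -> str:
    # The real get_model_deployment_host_organization() consults the deployment
    # registry; this stand-in just takes the prefix of the deployment name.
    return model_deployment.split("/")[0]


accounts = FakeAccounts()
host = fake_host_organization("together/redpajama-incite-base-3b-v1")
accounts.check_can_use("my-api-key", host)
accounts.use("my-api-key", host, count=42)  # tokens are counted after the request
assert accounts.used == {"together": 42}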
self.accounts.authenticate(auth) - model_group: str = get_model_group(request.model) + host_organization: str = get_model_deployment_host_organization(request.model_deployment) # Make sure we can use - self.accounts.check_can_use(auth.api_key, model_group) + self.accounts.check_can_use(auth.api_key, host_organization) # Use! request_result: RequestResult = self.client.make_request(request) @@ -106,7 +100,7 @@ def make_request(self, auth: Authentication, request: Request) -> RequestResult: if not request_result.cached: # Count the number of tokens used count: int = self.token_counter.count_tokens(request, request_result.completions) - self.accounts.use(auth.api_key, model_group, count) + self.accounts.use(auth.api_key, host_organization, count) return request_result diff --git a/src/helm/proxy/services/service.py b/src/helm/proxy/services/service.py index f169008ff2c..af3b500c09d 100644 --- a/src/helm/proxy/services/service.py +++ b/src/helm/proxy/services/service.py @@ -14,7 +14,7 @@ DecodeRequestResult, ) from helm.common.request import Request, RequestResult -from helm.proxy.models import Model +from helm.benchmark.model_metadata_registry import ModelMetadata from helm.proxy.query import Query, QueryResult from helm.proxy.accounts import Authentication, Account @@ -29,7 +29,7 @@ class GeneralInfo: version: str example_queries: List[Query] - all_models: List[Model] + all_models: List[ModelMetadata] def expand_environments(environments: Dict[str, List[str]]): @@ -69,6 +69,8 @@ def synthesize_request(prompt: str, settings: str, environment: Dict[str, str]) request: Dict[str, Any] = {} request["prompt"] = substitute_text(prompt, environment) request.update(parse_hocon(substitute_text(settings, environment))) + if "model_deployment" not in request and "model" not in request: + request["model_deployment"] = "openai/text-davinci-002" return Request(**request) diff --git a/src/helm/proxy/services/test_remote_service.py b/src/helm/proxy/services/test_remote_service.py index 63a267d5608..e2f4306f7a3 100644 --- a/src/helm/proxy/services/test_remote_service.py +++ b/src/helm/proxy/services/test_remote_service.py @@ -85,7 +85,7 @@ def create_root_account() -> str: @staticmethod def query(url: str, auth: Authentication, prompt: str): - request = Request(prompt=prompt, model="simple/model1") + request = Request(prompt=prompt, model="simple/model1", model_deployment="simple/model1") response: RequestResult = RemoteService(base_url=url).make_request(auth, request) response_text: str = response.completions[0].text # With the toy model (simple/model1), we should expect the same response as the prompt @@ -121,7 +121,7 @@ def teardown_class(cls): shutil.rmtree(cls.base_path) def test_make_request(self): - request = Request(prompt="1 2 3", model="simple/model1") + request = Request(prompt="1 2 3", model="simple/model1", model_deployment="simple/model1") response: RequestResult = self.service.make_request(self.auth, request) assert response.success @@ -132,7 +132,7 @@ def test_tokenize(self): def test_make_request_plus_sign(self): # Ensure + in prompt doesn't get replaced by a blank space - request = Request(prompt="+", model="simple/model1") + request = Request(prompt="+", model="simple/model1", model_deployment="simple/model1") response: RequestResult = self.service.make_request(self.auth, request) assert response.completions[0].text == "+" assert response.success diff --git a/src/helm/proxy/services/test_service.py b/src/helm/proxy/services/test_service.py index 1d3f2583f9b..6c0fd19b0ce 100644 --- 
a/src/helm/proxy/services/test_service.py +++ b/src/helm/proxy/services/test_service.py @@ -3,6 +3,7 @@ import shutil import tempfile +from helm.benchmark.model_deployment_registry import ModelDeployment, get_model_deployment from helm.common.authentication import Authentication from helm.common.request import Request from helm.proxy.accounts import AuthenticationError, Accounts @@ -34,7 +35,9 @@ def test_expand_query(self): def test_make_request(self): num_completions = 2 - request = Request(prompt="1 2 3", model="simple/model1", num_completions=num_completions) + request = Request( + prompt="1 2 3", model="simple/model1", model_deployment="simple/model1", num_completions=num_completions + ) result = self.service.make_request(self.auth, request) assert len(result.completions) == num_completions @@ -211,7 +214,7 @@ def helper_prod_test_service(request: Request, expected_text: str): # Models that we want to test -prod_models = ["openai/davinci", "ai21/j1-jumbo"] +prod_model_deployments = ["openai/davinci", "ai21/j1-jumbo"] # TODO: put a flag on this so that it's easy to use pytest to still run these slow tests @@ -220,8 +223,17 @@ def helper_prod_test_service(request: Request, expected_text: str): def test_prod_continue(): # Test that we're continuing prompt = "Paris is the capital of" - for model in prod_models: - request = Request(prompt=prompt, model=model, max_tokens=1, num_completions=1, temperature=0) + for model_deployment_name in prod_model_deployments: + model_deployment: ModelDeployment = get_model_deployment(model_deployment_name) + model_name: str = model_deployment.model_name or model_deployment.name + request = Request( + prompt=prompt, + model=model_name, + model_deployment=model_deployment_name, + max_tokens=1, + num_completions=1, + temperature=0, + ) helper_prod_test_service(request, " France") @@ -229,6 +241,15 @@ def test_prod_continue(): def test_prod_echo(): # If we're echoing the prompt, make sure we're getting the same thing back prompt = "I like pickles." - for model in prod_models: - request = Request(prompt=prompt, model=model, max_tokens=0, num_completions=1, echo_prompt=True) + for model_deployment_name in prod_model_deployments: + model_deployment: ModelDeployment = get_model_deployment(model_deployment_name) + model_name: str = model_deployment.model_name or model_deployment.name + request = Request( + prompt=prompt, + model=model_name, + model_deployment=model_deployment_name, + max_tokens=0, + num_completions=1, + echo_prompt=True, + ) helper_prod_test_service(request, prompt) diff --git a/src/helm/proxy/static/index.js b/src/helm/proxy/static/index.js index 26ad8b8416b..dc97d42ab1b 100644 --- a/src/helm/proxy/static/index.js +++ b/src/helm/proxy/static/index.js @@ -237,7 +237,7 @@ $(function () { // // get_num_bytes() and convert_tokens_to_text() in src/helm/benchmark/basic_metrics.py are adapted from this function. const groups = []; - for (let i = 0; i < tokens.length; ) { + for (let i = 0; i < tokens.length;) { // Aggregate consecutive tokens while they're "bytes:..." 
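# The test_service.py hunks above resolve a deployment name to its underlying
# model name before building a Request (model_name if declared, else the
# deployment's own name). A small stand-in showing that fallback rule;
# StubModelDeployment is illustrative, not the real registry class:

from dataclasses import dataclass
from typing import Optional


@dataclass
class StubModelDeployment:
    name: str
    model_name: Optional[str] = None


def resolve_model_name(deployment: StubModelDeployment) -> str:
    # Prefer the explicit model_name; fall back to the deployment name itself.
    return deployment.model_name or deployment.name


assert resolve_model_name(StubModelDeployment(name="openai/davinci")) == "openai/davinci"
assert (
    resolve_model_name(StubModelDeployment(name="huggingface/gpt2", model_name="openai/gpt2"))
    == "openai/gpt2"
)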
const group = { tokens: [] }; if (tokens[i].text.startsWith("bytes:")) { diff --git a/src/helm/proxy/test_models.py b/src/helm/proxy/test_models.py deleted file mode 100644 index c966815927c..00000000000 --- a/src/helm/proxy/test_models.py +++ /dev/null @@ -1,27 +0,0 @@ -from .models import get_model, get_model_group, get_models_by_organization, get_all_code_models, Model - - -def test_get_model(): - model: Model = get_model("ai21/j1-jumbo") - assert model.organization == "ai21" - assert model.engine == "j1-jumbo" - - -def test_get_model_with_invalid_model_name(): - try: - get_model("invalid/model") - assert False, "Expected to throw ValueError" - except ValueError: - pass - - -def test_get_model_group(): - assert get_model_group("openai/text-curie-001") == "gpt3" - - -def test_get_models_by_organization(): - assert get_models_by_organization("simple") == ["simple/model1"] - - -def test_all_code_models(): - assert "openai/code-davinci-002" in get_all_code_models() diff --git a/src/helm/proxy/token_counters/auto_token_counter.py b/src/helm/proxy/token_counters/auto_token_counter.py index 31d93d3d638..60604f2aa83 100644 --- a/src/helm/proxy/token_counters/auto_token_counter.py +++ b/src/helm/proxy/token_counters/auto_token_counter.py @@ -38,5 +38,5 @@ def count_tokens(self, request: Request, completions: List[Sequence]) -> int: """ Counts tokens based on the organization. """ - token_counter: TokenCounter = self.get_token_counter(request.model_organization) + token_counter: TokenCounter = self.get_token_counter(request.model_host) return token_counter.count_tokens(request, completions) diff --git a/src/helm/proxy/token_counters/openai_token_counter.py b/src/helm/proxy/token_counters/openai_token_counter.py index e3083cea5cd..01ca7d35426 100644 --- a/src/helm/proxy/token_counters/openai_token_counter.py +++ b/src/helm/proxy/token_counters/openai_token_counter.py @@ -15,7 +15,7 @@ def count_tokens(self, request: Request, completions: List[Sequence]) -> int: Counts the total number of tokens using the suggestion here: https://community.openai.com/t/how-do-i-calculate-the-pricing-for-generation-of-text/11662/5 """ - tokenized_prompt: TokenizationRequestResult = self.huggingface_client.tokenize( + tokenized_prompt: TokenizationRequestResult = self.huggingface_client.tokenizer.tokenize( TokenizationRequest(request.prompt) ) # Number of tokens in the prompt + number of tokens in all the completions diff --git a/src/helm/proxy/token_counters/test_ai21_token_counter.py b/src/helm/proxy/token_counters/test_ai21_token_counter.py index 9ae4541fcb2..026943dfc8e 100644 --- a/src/helm/proxy/token_counters/test_ai21_token_counter.py +++ b/src/helm/proxy/token_counters/test_ai21_token_counter.py @@ -10,11 +10,13 @@ def setup_method(self, method): def test_count_tokens(self): request = Request( + model="openai/text-davinci-002", + model_deployment="openai/text-davinci-002", prompt="The Center for Research on Foundation Models (CRFM) is " "an interdisciplinary initiative born out of the Stanford " "Institute for Human-Centered Artificial Intelligence (HAI) " "that aims to make fundamental advances in the study, development, " - "and deployment of foundation models." 
+ "and deployment of foundation models.", ) completions: List[Sequence] = [ Sequence( diff --git a/src/helm/proxy/token_counters/test_openai_token_counter.py b/src/helm/proxy/token_counters/test_openai_token_counter.py index de9fcc3ef35..3f7bbfaebae 100644 --- a/src/helm/proxy/token_counters/test_openai_token_counter.py +++ b/src/helm/proxy/token_counters/test_openai_token_counter.py @@ -32,7 +32,11 @@ def teardown_method(self, method): os.remove(self.cache_path) def test_count_tokens(self): - request = Request(prompt=TestOpenAITokenCounter.TEST_PROMPT) + request = Request( + model="openai/text-davinci-002", + model_deployment="openai/text-davinci-002", + prompt=TestOpenAITokenCounter.TEST_PROMPT, + ) completions: List[Sequence] = [ Sequence( text=" The CRFM is dedicated to advancing our knowledge of the foundations of artificial intelligence " diff --git a/src/helm/proxy/tokenizers/aleph_alpha_tokenizer.py b/src/helm/proxy/tokenizers/aleph_alpha_tokenizer.py index a43c63b8414..313cc0a4be4 100644 --- a/src/helm/proxy/tokenizers/aleph_alpha_tokenizer.py +++ b/src/helm/proxy/tokenizers/aleph_alpha_tokenizer.py @@ -31,7 +31,7 @@ class AlephAlphaTokenizer(CachingTokenizer): def __init__(self, api_key: str, cache_config: CacheConfig) -> None: super().__init__(cache_config) self.api_key: str = api_key - self._aleph_alpha_client = AlephAlphaPythonClient(token=api_key) + self._aleph_alpha_client = AlephAlphaPythonClient(token=api_key) if api_key else None self._tokenizer_name_to_tokenizer: Dict[str, InternalTokenizer] = {} def _get_tokenizer(self, tokenizer_name: str) -> InternalTokenizer: @@ -40,6 +40,8 @@ def _get_tokenizer(self, tokenizer_name: str) -> InternalTokenizer: # Check if the tokenizer is cached if tokenizer_name not in self._tokenizer_name_to_tokenizer: + if self._aleph_alpha_client is None: + raise ValueError("Aleph Alpha API key not set.") self._tokenizer_name_to_tokenizer[tokenizer_name] = self._aleph_alpha_client.tokenizer(tokenizer_name) hlog(f"Initialized tokenizer: {tokenizer_name}") return self._tokenizer_name_to_tokenizer[tokenizer_name] diff --git a/src/helm/proxy/clients/test_anthropic_client.py b/src/helm/proxy/tokenizers/test_anthropic_tokenizer.py similarity index 71% rename from src/helm/proxy/clients/test_anthropic_client.py rename to src/helm/proxy/tokenizers/test_anthropic_tokenizer.py index d1a039ef07e..3556978b5ae 100644 --- a/src/helm/proxy/clients/test_anthropic_client.py +++ b/src/helm/proxy/tokenizers/test_anthropic_tokenizer.py @@ -10,11 +10,10 @@ TokenizationRequest, TokenizationRequestResult, ) -from helm.proxy.tokenizers.anthropic_tokenizer import AnthropicTokenizer -from .anthropic_client import AnthropicClient +from .anthropic_tokenizer import AnthropicTokenizer -class TestAnthropicClient: +class TestAnthropicTokenizer: TEST_PROMPT: str = "I am a computer scientist." 
TEST_ENCODED: List[int] = [45, 1413, 269, 6797, 22228, 18] TEST_TOKENS: List[str] = ["I", " am", " a", " computer", " scientist", "."] @@ -22,42 +21,39 @@ class TestAnthropicClient: def setup_method(self, method): cache_file = tempfile.NamedTemporaryFile(delete=False) self.cache_path: str = cache_file.name - self.client = AnthropicClient( - tokenizer=AnthropicTokenizer(SqliteCacheConfig(self.cache_path)), - cache_config=SqliteCacheConfig(self.cache_path), - ) + self.tokenizer = AnthropicTokenizer(SqliteCacheConfig(self.cache_path)) def teardown_method(self, method): os.remove(self.cache_path) def test_tokenize(self): request = TokenizationRequest(text=self.TEST_PROMPT) - result: TokenizationRequestResult = self.client.tokenize(request) + result: TokenizationRequestResult = self.tokenizer.tokenize(request) assert not result.cached, "First time making the tokenize request. Result should not be cached" assert result.raw_tokens == self.TEST_TOKENS - result: TokenizationRequestResult = self.client.tokenize(request) + result: TokenizationRequestResult = self.tokenizer.tokenize(request) assert result.cached, "Result should be cached" assert result.raw_tokens == self.TEST_TOKENS def test_encode(self): request = TokenizationRequest(text=self.TEST_PROMPT, encode=True, truncation=True, max_length=1) - result: TokenizationRequestResult = self.client.tokenize(request) + result: TokenizationRequestResult = self.tokenizer.tokenize(request) assert not result.cached, "First time making the tokenize request. Result should not be cached" assert result.raw_tokens == [self.TEST_ENCODED[0]] - result: TokenizationRequestResult = self.client.tokenize(request) + result: TokenizationRequestResult = self.tokenizer.tokenize(request) assert result.cached, "Result should be cached" assert result.raw_tokens == [self.TEST_ENCODED[0]] request = TokenizationRequest(text=self.TEST_PROMPT, encode=True, truncation=True, max_length=1024) - result = self.client.tokenize(request) + result = self.tokenizer.tokenize(request) assert not result.cached, "First time making this particular request. Result should not be cached" assert result.raw_tokens == self.TEST_ENCODED def test_decode(self): request = DecodeRequest(tokens=self.TEST_ENCODED) - result: DecodeRequestResult = self.client.decode(request) + result: DecodeRequestResult = self.tokenizer.decode(request) assert not result.cached, "First time making the decode request. Result should not be cached" assert result.text == self.TEST_PROMPT - result: DecodeRequestResult = self.client.decode(request) + result: DecodeRequestResult = self.tokenizer.decode(request) assert result.cached, "Result should be cached" assert result.text == self.TEST_PROMPT
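# End-to-end usage mirrored by the renamed test above: AnthropicTokenizer is
# now exercised directly (no AnthropicClient wrapper), with results cached in
# SQLite so a repeated request comes back with cached=True. Import paths are
# assumed from the rest of the codebase; the prompt and cache file are
# illustrative.

import tempfile

from helm.common.cache import SqliteCacheConfig
from helm.common.tokenization_request import TokenizationRequest, DecodeRequest
from helm.proxy.tokenizers.anthropic_tokenizer import AnthropicTokenizer

PROMPT = "I am a computer scientist."
cache_file = tempfile.NamedTemporaryFile(delete=False)
tokenizer = AnthropicTokenizer(SqliteCacheConfig(cache_file.name))

first = tokenizer.tokenize(TokenizationRequest(text=PROMPT))
second = tokenizer.tokenize(TokenizationRequest(text=PROMPT))
assert not first.cached and second.cached      # second hit is served from the cache
assert first.raw_tokens == second.raw_tokens   # e.g. ["I", " am", " a", ...]

# encode=True returns token ids, which round-trip through decode()
encoded = tokenizer.tokenize(
    TokenizationRequest(text=PROMPT, encode=True, truncation=True, max_length=1024)
)
decoded = tokenizer.decode(DecodeRequest(tokens=encoded.raw_tokens))
assert decoded.text == PROMPT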