Support configuring precision and quantization in HuggingFaceClient #1912

Merged · 3 commits · Dec 11, 2023
@@ -10,11 +10,11 @@ def __init__(
         service: TokenizerService,
         tokenizer_name: str,
         pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
         max_sequence_length: Optional[int] = None,
         max_request_length: Optional[int] = None,
         end_of_text_token: Optional[str] = None,
         prefix_token: Optional[str] = None,
+        **kwargs
     ):
         super().__init__(service)
         self._tokenizer_name = tokenizer_name
@@ -27,7 +27,7 @@ def __init__(
         with HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=tokenizer_name,
             pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
-            revision=revision,
+            **kwargs,
         ) as tokenizer:
             self._max_sequence_length = max_sequence_length or tokenizer.model_max_length
             self._max_request_length = max_request_length or self._max_sequence_length
51 changes: 34 additions & 17 deletions src/helm/proxy/clients/huggingface_client.py
@@ -39,23 +39,20 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa
 class HuggingFaceServer:
     """A thin wrapper around a Hugging Face AutoModelForCausalLM for HuggingFaceClient to call."""

-    def __init__(self, pretrained_model_name_or_path: str, revision: Optional[str] = None):
+    def __init__(self, pretrained_model_name_or_path: str, **kwargs):
Contributor:
Should we add a comment here describing common kwargs that should be specified, such as revision, precision, ...?

Collaborator (Author):
I would prefer to do this in the docs instead.
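For illustration only (not part of the diff), a minimal sketch of the kind of kwargs this enables. The model name and BlackHoleCacheConfig are placeholder assumptions; revision and torch_dtype are standard from_pretrained() arguments.

# Sketch only: exercises the new **kwargs plumbing end to end.
from helm.common.cache import BlackHoleCacheConfig  # assumed no-op cache config
from helm.proxy.clients.huggingface_client import HuggingFaceClient

client = HuggingFaceClient(
    cache_config=BlackHoleCacheConfig(),
    pretrained_model_name_or_path="EleutherAI/pythia-70m",  # placeholder model
    revision="main",              # forwarded to AutoModelForCausalLM.from_pretrained()
    torch_dtype="torch.float16",  # string form; converted to torch.float16 at construction time
)
# Quantization flags such as load_in_8bit=True could be passed the same way
# (assuming bitsandbytes is installed); they are forwarded untouched.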

         if torch.cuda.is_available():
             hlog("CUDA is available, initializing with a GPU...")
             self.device: str = "cuda:0"
         else:
             self.device = "cpu"
-        model_kwargs = {}
-        if revision:
-            model_kwargs["revision"] = revision
         with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
             # WARNING this may fail if your GPU does not have enough memory
             self.model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_name_or_path, trust_remote_code=True, **model_kwargs
+                pretrained_model_name_or_path, trust_remote_code=True, **kwargs
             ).to(self.device)
         with htrack_block(f"Loading Hugging Face tokenizer for model {pretrained_model_name_or_path}"):
             self.wrapped_tokenizer: WrappedPreTrainedTokenizer = HuggingFaceTokenizer.create_tokenizer(
-                pretrained_model_name_or_path, revision
+                pretrained_model_name_or_path, **kwargs
             )

     def serve_request(self, raw_request: Dict[str, Any]):
@@ -189,34 +186,54 @@ class HuggingFaceServerFactory:
     _servers_lock: Lock = Lock()

     @staticmethod
-    def get_server(helm_model_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None) -> Any:
+    def get_server(helm_model_name: str, pretrained_model_name_or_path: str, **kwargs) -> Any:
         """
         Checks if the desired HuggingFaceModel is cached. Creates the HuggingFaceModel if it's not cached.
         Returns the HuggingFaceModel.
         """
         with HuggingFaceServerFactory._servers_lock:
             if helm_model_name not in HuggingFaceServerFactory._servers:
                 with htrack_block(
-                    f"Loading {pretrained_model_name_or_path} (revision={revision}) "
+                    f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
                     f"for HELM model {helm_model_name} with Hugging Face Transformers"
                 ):
                     HuggingFaceServerFactory._servers[helm_model_name] = HuggingFaceServer(
-                        pretrained_model_name_or_path, revision
+                        pretrained_model_name_or_path, **kwargs
                     )

         return HuggingFaceServerFactory._servers[helm_model_name]


+TORCH_DTYPE_KEY = "torch_dtype"
+TORCH_DTYPE_VALUE_PREFIX = "torch."
+
+
+def _process_huggingface_client_kwargs(raw_kwargs: Dict[str, Any]):
+    """Process the kwargs for HuggingFaceClient.
+
+    The kwargs passed to HuggingFaceClient will eventually be passed to AutoModel.from_pretrained().
+    Since the kwargs from HuggingFaceClient may be derived from configuration YAML,
+    they may contain primitive types instead of the unserializable types that
+    AutoModel.from_pretrained() expects (e.g. torch_dtype). This function converts values of
+    primitive types to values of the unserializable types."""
+    processed_kwargs = deepcopy(raw_kwargs)
+
+    # Convert torch_dtype string value to actual dtypes
+    # e.g. the string "torch.bfloat16" is converted to torch.bfloat16
+    torch_dtype = processed_kwargs.get(TORCH_DTYPE_KEY)
+    if torch_dtype and isinstance(torch_dtype, str):
+        if not torch_dtype.startswith(TORCH_DTYPE_VALUE_PREFIX):
+            raise ValueError(f'Unknown dtype "{torch_dtype}"; expected a string such as "torch.bfloat16"')
+        processed_kwargs[TORCH_DTYPE_KEY] = getattr(torch, torch_dtype[len(TORCH_DTYPE_VALUE_PREFIX) :])
+
+    return processed_kwargs
+
+
 class HuggingFaceClient(CachingClient):
-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
-    ):
+    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
         super().__init__(cache_config=cache_config)
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
-        self._revision = revision
+        self._kwargs = _process_huggingface_client_kwargs(kwargs)

     def make_request(self, request: Request) -> RequestResult:
         # Embedding not supported for this model
@@ -243,7 +260,7 @@ def make_request(self, request: Request) -> RequestResult:
         huggingface_model: HuggingFaceServer = HuggingFaceServerFactory.get_server(
             helm_model_name=request.model_deployment,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            revision=self._revision,
+            **self._kwargs,
         )

         try:
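To make the dtype handling in _process_huggingface_client_kwargs above concrete, a small sketch (the helper is private, so importing it here is for illustration only; load_in_8bit is just an arbitrary extra key used to show pass-through):

import torch
from helm.proxy.clients.huggingface_client import _process_huggingface_client_kwargs

raw = {"revision": "main", "torch_dtype": "torch.bfloat16", "load_in_8bit": True}
processed = _process_huggingface_client_kwargs(raw)

assert processed["torch_dtype"] is torch.bfloat16  # "torch.bfloat16" -> torch.bfloat16
assert processed["revision"] == "main"             # other keys pass through unchanged
assert raw["torch_dtype"] == "torch.bfloat16"      # deepcopy: the caller's dict is not mutated

# A string without the "torch." prefix raises:
#   _process_huggingface_client_kwargs({"torch_dtype": "bfloat16"})  -> ValueError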
28 changes: 9 additions & 19 deletions src/helm/proxy/tokenizers/huggingface_tokenizer.py
@@ -49,28 +49,18 @@ class HuggingFaceTokenizer(CachingTokenizer):
     _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
     _tokenizers_lock: Lock = Lock()

-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
-    ):
+    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
         super().__init__(cache_config=cache_config)
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
-        self._revision = revision
+        self._kwargs = kwargs

     @staticmethod
-    def create_tokenizer(
-        pretrained_model_name_or_path: str, revision: Optional[str] = None
-    ) -> WrappedPreTrainedTokenizer:
+    def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPreTrainedTokenizer:
         """Loads tokenizer using files from disk if they exist. Otherwise, downloads from HuggingFace."""
         # To avoid deadlocks when using HuggingFace tokenizers with multiple processes
         # TODO: Figure out if we actually need this.
         os.environ["TOKENIZERS_PARALLELISM"] = "False"

-        tokenizer_kwargs = {}
-        if revision is not None:
-            tokenizer_kwargs["revision"] = revision
         try:
             # From the Hugging Face documentation, "local_files_only(defaults to False) —
             # Whether or not to only look at local files".
@@ -83,20 +73,20 @@ def create_tokenizer(
             # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **tokenizer_kwargs
+                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
                 )
             )
         except OSError:
             hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **tokenizer_kwargs
+                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
                 )
             )

     @staticmethod
     def get_tokenizer(
-        helm_tokenizer_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None
+        helm_tokenizer_name: str, pretrained_model_name_or_path: str, **kwargs
     ) -> WrappedPreTrainedTokenizer:
         """
         Checks if the desired tokenizer is cached. Creates the tokenizer if it's not cached.
@@ -105,12 +95,12 @@ def get_tokenizer(
         with HuggingFaceTokenizer._tokenizers_lock:
             if helm_tokenizer_name not in HuggingFaceTokenizer._tokenizers:
                 with htrack_block(
-                    f"Loading {pretrained_model_name_or_path} (revision={revision}) "
+                    f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
                     f"for HELM tokenizer {helm_tokenizer_name} with Hugging Face Transformers"
                 ):
                     # Keep the tokenizer in memory, so we don't recreate it for future requests
                     HuggingFaceTokenizer._tokenizers[helm_tokenizer_name] = HuggingFaceTokenizer.create_tokenizer(
-                        pretrained_model_name_or_path, revision
+                        pretrained_model_name_or_path, **kwargs
                     )
         return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]

@@ -124,7 +114,7 @@ def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrain
         return HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=request["tokenizer"],
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            revision=self._revision,
+            **self._kwargs,
         )

     def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
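Finally, a hedged sketch of the tokenizer-side counterpart (the tokenizer name and revision are placeholder values; WrappedPreTrainedTokenizer is used as a context manager, matching the usage shown in the first hunk above):

from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# Loads and caches the tokenizer, forwarding extra kwargs to AutoTokenizer.from_pretrained().
wrapped = HuggingFaceTokenizer.get_tokenizer(
    helm_tokenizer_name="EleutherAI/pythia-70m",            # placeholder HELM tokenizer name
    pretrained_model_name_or_path="EleutherAI/pythia-70m",  # placeholder Hugging Face repo
    revision="main",
)
with wrapped as tokenizer:
    print(tokenizer.tokenize("Hello world"))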