diff --git a/src/helm/benchmark/window_services/huggingface_window_service.py b/src/helm/benchmark/window_services/huggingface_window_service.py
index 18f3b9fad1f..dccb0268ed9 100644
--- a/src/helm/benchmark/window_services/huggingface_window_service.py
+++ b/src/helm/benchmark/window_services/huggingface_window_service.py
@@ -10,11 +10,11 @@ def __init__(
         service: TokenizerService,
         tokenizer_name: str,
         pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
         max_sequence_length: Optional[int] = None,
         max_request_length: Optional[int] = None,
         end_of_text_token: Optional[str] = None,
         prefix_token: Optional[str] = None,
+        **kwargs
     ):
         super().__init__(service)
         self._tokenizer_name = tokenizer_name
@@ -27,7 +27,7 @@ def __init__(
         with HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=tokenizer_name,
             pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
-            revision=revision,
+            **kwargs,
         ) as tokenizer:
             self._max_sequence_length = max_sequence_length or tokenizer.model_max_length
             self._max_request_length = max_request_length or self._max_sequence_length
diff --git a/src/helm/proxy/clients/huggingface_client.py b/src/helm/proxy/clients/huggingface_client.py
index 94e453e25db..afb914d6bb0 100644
--- a/src/helm/proxy/clients/huggingface_client.py
+++ b/src/helm/proxy/clients/huggingface_client.py
@@ -39,23 +39,20 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa
 class HuggingFaceServer:
     """A thin wrapper around a Hugging Face AutoModelForCausalLM for HuggingFaceClient to call."""
 
-    def __init__(self, pretrained_model_name_or_path: str, revision: Optional[str] = None):
+    def __init__(self, pretrained_model_name_or_path: str, **kwargs):
         if torch.cuda.is_available():
             hlog("CUDA is available, initializing with a GPU...")
             self.device: str = "cuda:0"
         else:
             self.device = "cpu"
-        model_kwargs = {}
-        if revision:
-            model_kwargs["revision"] = revision
         with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
             # WARNING this may fail if your GPU does not have enough memory
             self.model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_name_or_path, trust_remote_code=True, **model_kwargs
+                pretrained_model_name_or_path, trust_remote_code=True, **kwargs
             ).to(self.device)
         with htrack_block(f"Loading Hugging Face tokenizer for model {pretrained_model_name_or_path}"):
             self.wrapped_tokenizer: WrappedPreTrainedTokenizer = HuggingFaceTokenizer.create_tokenizer(
-                pretrained_model_name_or_path, revision
+                pretrained_model_name_or_path, **kwargs
             )
 
     def serve_request(self, raw_request: Dict[str, Any]):
@@ -189,7 +186,7 @@ class HuggingFaceServerFactory:
     _servers_lock: Lock = Lock()
 
     @staticmethod
-    def get_server(helm_model_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None) -> Any:
+    def get_server(helm_model_name: str, pretrained_model_name_or_path: str, **kwargs) -> Any:
         """
         Checks if the desired HuggingFaceModel is cached. Creates the HuggingFaceModel if it's not cached.
         Returns the HuggingFaceModel.
@@ -197,26 +194,46 @@ def get_server(helm_model_name: str, pretrained_model_name_or_path: str, revisio
         with HuggingFaceServerFactory._servers_lock:
             if helm_model_name not in HuggingFaceServerFactory._servers:
                 with htrack_block(
-                    f"Loading {pretrained_model_name_or_path} (revision={revision}) "
+                    f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
                     f"for HELM model {helm_model_name} with Hugging Face Transformers"
                 ):
                     HuggingFaceServerFactory._servers[helm_model_name] = HuggingFaceServer(
-                        pretrained_model_name_or_path, revision
+                        pretrained_model_name_or_path, **kwargs
                     )
 
         return HuggingFaceServerFactory._servers[helm_model_name]
 
 
+TORCH_DTYPE_KEY = "torch_dtype"
+TORCH_DTYPE_VALUE_PREFIX = "torch."
+
+
+def _process_huggingface_client_kwargs(raw_kwargs: Dict[str, Any]):
+    """Process the kwargs for HuggingFaceClient.
+
+    The kwargs passed to HuggingFaceClient will eventually be passed to AutoModel.from_pretrained().
+    Since the kwargs from HuggingFaceClient may be derived from configuration YAML,
+    they may contain primitive types instead of the unserializable types that
+    AutoModel.from_pretrained() expects (e.g. torch_dtype). This function converts values of
+    primitive types to values of the unserializable types."""
+    processed_kwargs = deepcopy(raw_kwargs)
+
+    # Convert torch_dtype string value to actual dtypes
+    # e.g. the string "torch.bfloat16" is converted to torch.bfloat16
+    torch_dtype = processed_kwargs.get(TORCH_DTYPE_KEY)
+    if torch_dtype and isinstance(torch_dtype, str):
+        if not torch_dtype.startswith(TORCH_DTYPE_VALUE_PREFIX):
+            raise ValueError(f'Unknown dtype "{torch_dtype}"; expected a string such as "torch.bfloat16"')
+        processed_kwargs[TORCH_DTYPE_KEY] = getattr(torch, torch_dtype[len(TORCH_DTYPE_VALUE_PREFIX) :])
+
+    return processed_kwargs
+
+
 class HuggingFaceClient(CachingClient):
-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
-    ):
+    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
         super().__init__(cache_config=cache_config)
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
-        self._revision = revision
+        self._kwargs = _process_huggingface_client_kwargs(kwargs)
 
     def make_request(self, request: Request) -> RequestResult:
         # Embedding not supported for this model
@@ -243,7 +260,7 @@ def make_request(self, request: Request) -> RequestResult:
         huggingface_model: HuggingFaceServer = HuggingFaceServerFactory.get_server(
             helm_model_name=request.model_deployment,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            revision=self._revision,
+            **self._kwargs,
         )
 
         try:
diff --git a/src/helm/proxy/tokenizers/huggingface_tokenizer.py b/src/helm/proxy/tokenizers/huggingface_tokenizer.py
index d61eb072888..b92d6c068b2 100644
--- a/src/helm/proxy/tokenizers/huggingface_tokenizer.py
+++ b/src/helm/proxy/tokenizers/huggingface_tokenizer.py
@@ -49,28 +49,18 @@ class HuggingFaceTokenizer(CachingTokenizer):
     _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
     _tokenizers_lock: Lock = Lock()
 
-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
-    ):
+    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
         super().__init__(cache_config=cache_config)
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
-        self._revision = revision
+        self._kwargs = kwargs
 
     @staticmethod
-    def create_tokenizer(
-        pretrained_model_name_or_path: str, revision: Optional[str] = None
-    ) -> WrappedPreTrainedTokenizer:
+    def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPreTrainedTokenizer:
         """Loads tokenizer using files from disk if they exist. Otherwise, downloads from HuggingFace."""
         # To avoid deadlocks when using HuggingFace tokenizers with multiple processes
         # TODO: Figure out if we actually need this.
         os.environ["TOKENIZERS_PARALLELISM"] = "False"
 
-        tokenizer_kwargs = {}
-        if revision is not None:
-            tokenizer_kwargs["revision"] = revision
         try:
             # From the Hugging Face documentation, "local_files_only(defaults to False) —
             # Whether or not to only look at local files".
@@ -83,20 +73,20 @@ def create_tokenizer(
             # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **tokenizer_kwargs
+                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
                 )
             )
         except OSError:
             hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **tokenizer_kwargs
+                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
                 )
             )
 
     @staticmethod
     def get_tokenizer(
-        helm_tokenizer_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None
+        helm_tokenizer_name: str, pretrained_model_name_or_path: str, **kwargs
    ) -> WrappedPreTrainedTokenizer:
         """
         Checks if the desired tokenizer is cached. Creates the tokenizer if it's not cached.
@@ -105,12 +95,12 @@ def get_tokenizer(
         with HuggingFaceTokenizer._tokenizers_lock:
             if helm_tokenizer_name not in HuggingFaceTokenizer._tokenizers:
                 with htrack_block(
-                    f"Loading {pretrained_model_name_or_path} (revision={revision}) "
+                    f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
                     f"for HELM tokenizer {helm_tokenizer_name} with Hugging Face Transformers"
                 ):
                     # Keep the tokenizer in memory, so we don't recreate it for future requests
                     HuggingFaceTokenizer._tokenizers[helm_tokenizer_name] = HuggingFaceTokenizer.create_tokenizer(
-                        pretrained_model_name_or_path, revision
+                        pretrained_model_name_or_path, **kwargs
                     )
 
         return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]
@@ -124,7 +114,7 @@ def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrain
         return HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=request["tokenizer"],
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            revision=self._revision,
+            **self._kwargs,
         )
 
     def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
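Below the patch, a minimal sketch (not part of the diff) of how the new keyword-argument plumbing could be exercised. The model name, dtype string, and BlackHoleCacheConfig are assumptions chosen for illustration; any keyword argument accepted by AutoModelForCausalLM.from_pretrained() / AutoTokenizer.from_pretrained(), such as revision, can be passed the same way.

    # Sketch only: exercises _process_huggingface_client_kwargs() and the HuggingFaceClient
    # constructor introduced above; all concrete values here are hypothetical.
    import torch

    from helm.common.cache import BlackHoleCacheConfig  # assumed no-op cache config from HELM
    from helm.proxy.clients.huggingface_client import (
        HuggingFaceClient,
        _process_huggingface_client_kwargs,
    )

    # kwargs as they might arrive from YAML configuration: primitive types only.
    raw_kwargs = {"revision": "main", "torch_dtype": "torch.bfloat16"}
    processed = _process_huggingface_client_kwargs(raw_kwargs)
    assert processed["torch_dtype"] is torch.bfloat16  # string converted to a real torch dtype
    assert processed["revision"] == "main"  # other kwargs pass through unchanged

    # HuggingFaceClient applies the same conversion in __init__, then forwards the kwargs
    # to AutoModelForCausalLM.from_pretrained() via HuggingFaceServerFactory.get_server()
    # when the first request is made; constructing the client does not load the model.
    client = HuggingFaceClient(
        cache_config=BlackHoleCacheConfig(),
        pretrained_model_name_or_path="EleutherAI/pythia-70m",  # hypothetical model
        revision="main",
        torch_dtype="torch.bfloat16",
    )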