Support configuring precision and quantization in HuggingFaceClient #1912

Merged · 3 commits · Dec 11, 2023
@@ -10,11 +10,11 @@ def __init__(
         service: TokenizerService,
         tokenizer_name: str,
         pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
         max_sequence_length: Optional[int] = None,
         max_request_length: Optional[int] = None,
         end_of_text_token: Optional[str] = None,
         prefix_token: Optional[str] = None,
+        **kwargs
     ):
         super().__init__(service)
         self._tokenizer_name = tokenizer_name
@@ -27,7 +27,7 @@ def __init__(
         with HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=tokenizer_name,
             pretrained_model_name_or_path=pretrained_model_name_or_path or tokenizer_name,
-            revision=revision,
+            **kwargs,
         ) as tokenizer:
             self._max_sequence_length = max_sequence_length or tokenizer.model_max_length
             self._max_request_length = max_request_length or self._max_sequence_length
51 changes: 34 additions & 17 deletions src/helm/proxy/clients/huggingface_client.py
@@ -39,23 +39,20 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa
 class HuggingFaceServer:
     """A thin wrapper around a Hugging Face AutoModelForCausalLM for HuggingFaceClient to call."""

-    def __init__(self, pretrained_model_name_or_path: str, revision: Optional[str] = None):
+    def __init__(self, pretrained_model_name_or_path: str, **kwargs):
Contributor:
Should we add a comment here describing common kwargs that should be specified, such as revision, precision, ...?

Collaborator (Author):
I would prefer to do this in the docs instead.
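For illustration only (not part of the diff), a minimal sketch of the kind of kwargs this enables. The model name and BlackHoleCacheConfig are placeholder assumptions; revision and torch_dtype are standard from_pretrained() arguments.

# Sketch only: exercises the new **kwargs plumbing end to end.
from helm.common.cache import BlackHoleCacheConfig  # assumed no-op cache config
from helm.proxy.clients.huggingface_client import HuggingFaceClient

client = HuggingFaceClient(
    cache_config=BlackHoleCacheConfig(),
    pretrained_model_name_or_path="EleutherAI/pythia-70m",  # placeholder model
    revision="main",              # forwarded to AutoModelForCausalLM.from_pretrained()
    torch_dtype="torch.float16",  # string form; converted to torch.float16 at construction time
)
# Quantization flags such as load_in_8bit=True could be passed the same way
# (assuming bitsandbytes is installed); they are forwarded untouched.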

         if torch.cuda.is_available():
             hlog("CUDA is available, initializing with a GPU...")
             self.device: str = "cuda:0"
         else:
             self.device = "cpu"
-        model_kwargs = {}
-        if revision:
-            model_kwargs["revision"] = revision
         with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
             # WARNING this may fail if your GPU does not have enough memory
             self.model = AutoModelForCausalLM.from_pretrained(
-                pretrained_model_name_or_path, trust_remote_code=True, **model_kwargs
+                pretrained_model_name_or_path, trust_remote_code=True, **kwargs
             ).to(self.device)
         with htrack_block(f"Loading Hugging Face tokenizer for model {pretrained_model_name_or_path}"):
             self.wrapped_tokenizer: WrappedPreTrainedTokenizer = HuggingFaceTokenizer.create_tokenizer(
-                pretrained_model_name_or_path, revision
+                pretrained_model_name_or_path, **kwargs
             )

     def serve_request(self, raw_request: Dict[str, Any]):
@@ -189,34 +186,54 @@ class HuggingFaceServerFactory:
     _servers_lock: Lock = Lock()

     @staticmethod
-    def get_server(helm_model_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None) -> Any:
+    def get_server(helm_model_name: str, pretrained_model_name_or_path: str, **kwargs) -> Any:
         """
         Checks if the desired HuggingFaceModel is cached. Creates the HuggingFaceModel if it's not cached.
         Returns the HuggingFaceModel.
         """
         with HuggingFaceServerFactory._servers_lock:
             if helm_model_name not in HuggingFaceServerFactory._servers:
                 with htrack_block(
-                    f"Loading {pretrained_model_name_or_path} (revision={revision}) "
+                    f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
                     f"for HELM model {helm_model_name} with Hugging Face Transformers"
                 ):
                     HuggingFaceServerFactory._servers[helm_model_name] = HuggingFaceServer(
-                        pretrained_model_name_or_path, revision
+                        pretrained_model_name_or_path, **kwargs
                     )

         return HuggingFaceServerFactory._servers[helm_model_name]


+TORCH_DTYPE_KEY = "torch_dtype"
+TORCH_DTYPE_VALUE_PREFIX = "torch."
+
+
+def _process_huggingface_client_kwargs(raw_kwargs: Dict[str, Any]):
+    """Process the kwargs for HuggingFaceClient.
+
+    The kwargs passed to HuggingFaceClient will eventually be passed to AutoModel.from_pretrained().
+    Since the kwargs from HuggingFaceClient may be derived from configuration YAML,
+    they may contain primitive types instead of the unserializable types that
+    AutoModel.from_pretrained() expects (e.g. torch_dtype). This function converts values of
+    primitive types to values of the unserializable types."""
+    processed_kwargs = deepcopy(raw_kwargs)
+
+    # Convert torch_dtype string value to actual dtypes
+    # e.g. the string "torch.bfloat16" is converted to torch.bfloat16
+    torch_dtype = processed_kwargs.get(TORCH_DTYPE_KEY)
+    if torch_dtype and isinstance(torch_dtype, str):
+        if not torch_dtype.startswith(TORCH_DTYPE_VALUE_PREFIX):
+            raise ValueError(f'Unknown dtype "{torch_dtype}"; expected a string such as "torch.bfloat16"')
+        processed_kwargs[TORCH_DTYPE_KEY] = getattr(torch, torch_dtype[len(TORCH_DTYPE_VALUE_PREFIX) :])
+
+    return processed_kwargs
+
+
 class HuggingFaceClient(CachingClient):
-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
-    ):
+    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
         super().__init__(cache_config=cache_config)
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
-        self._revision = revision
+        self._kwargs = _process_huggingface_client_kwargs(kwargs)

     def make_request(self, request: Request) -> RequestResult:
         # Embedding not supported for this model
@@ -243,7 +260,7 @@ def make_request(self, request: Request) -> RequestResult:
         huggingface_model: HuggingFaceServer = HuggingFaceServerFactory.get_server(
             helm_model_name=request.model_deployment,
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            revision=self._revision,
+            **self._kwargs,
         )

         try:
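To make the dtype handling in _process_huggingface_client_kwargs above concrete, a small sketch (the helper is private, so importing it here is for illustration only; load_in_8bit is just an arbitrary extra key used to show pass-through):

import torch
from helm.proxy.clients.huggingface_client import _process_huggingface_client_kwargs

raw = {"revision": "main", "torch_dtype": "torch.bfloat16", "load_in_8bit": True}
processed = _process_huggingface_client_kwargs(raw)

assert processed["torch_dtype"] is torch.bfloat16  # "torch.bfloat16" -> torch.bfloat16
assert processed["revision"] == "main"             # other keys pass through unchanged
assert raw["torch_dtype"] == "torch.bfloat16"      # deepcopy: the caller's dict is not mutated

# A string without the "torch." prefix raises:
#   _process_huggingface_client_kwargs({"torch_dtype": "bfloat16"})  -> ValueError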
28 changes: 9 additions & 19 deletions src/helm/proxy/tokenizers/huggingface_tokenizer.py
@@ -49,28 +49,18 @@ class HuggingFaceTokenizer(CachingTokenizer):
     _tokenizers: Dict[str, WrappedPreTrainedTokenizer] = {}
     _tokenizers_lock: Lock = Lock()

-    def __init__(
-        self,
-        cache_config: CacheConfig,
-        pretrained_model_name_or_path: Optional[str] = None,
-        revision: Optional[str] = None,
-    ):
+    def __init__(self, cache_config: CacheConfig, pretrained_model_name_or_path: Optional[str] = None, **kwargs):
         super().__init__(cache_config=cache_config)
         self._pretrained_model_name_or_path = pretrained_model_name_or_path
-        self._revision = revision
+        self._kwargs = kwargs

     @staticmethod
-    def create_tokenizer(
-        pretrained_model_name_or_path: str, revision: Optional[str] = None
-    ) -> WrappedPreTrainedTokenizer:
+    def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPreTrainedTokenizer:
         """Loads tokenizer using files from disk if they exist. Otherwise, downloads from HuggingFace."""
         # To avoid deadlocks when using HuggingFace tokenizers with multiple processes
         # TODO: Figure out if we actually need this.
         os.environ["TOKENIZERS_PARALLELISM"] = "False"

-        tokenizer_kwargs = {}
-        if revision is not None:
-            tokenizer_kwargs["revision"] = revision
         try:
             # From the Hugging Face documentation, "local_files_only(defaults to False) —
             # Whether or not to only look at local files".
@@ -83,20 +73,20 @@ def create_tokenizer(
             # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **tokenizer_kwargs
+                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
                 )
             )
         except OSError:
             hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
             return WrappedPreTrainedTokenizer(
                 AutoTokenizer.from_pretrained(
-                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **tokenizer_kwargs
+                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
                 )
             )

     @staticmethod
     def get_tokenizer(
-        helm_tokenizer_name: str, pretrained_model_name_or_path: str, revision: Optional[str] = None
+        helm_tokenizer_name: str, pretrained_model_name_or_path: str, **kwargs
     ) -> WrappedPreTrainedTokenizer:
         """
         Checks if the desired tokenizer is cached. Creates the tokenizer if it's not cached.
@@ -105,12 +95,12 @@ def get_tokenizer(
         with HuggingFaceTokenizer._tokenizers_lock:
             if helm_tokenizer_name not in HuggingFaceTokenizer._tokenizers:
                 with htrack_block(
-                    f"Loading {pretrained_model_name_or_path} (revision={revision}) "
+                    f"Loading {pretrained_model_name_or_path} (kwargs={kwargs}) "
                     f"for HELM tokenizer {helm_tokenizer_name} with Hugging Face Transformers"
                 ):
                     # Keep the tokenizer in memory, so we don't recreate it for future requests
                     HuggingFaceTokenizer._tokenizers[helm_tokenizer_name] = HuggingFaceTokenizer.create_tokenizer(
-                        pretrained_model_name_or_path, revision
+                        pretrained_model_name_or_path, **kwargs
                     )
         return HuggingFaceTokenizer._tokenizers[helm_tokenizer_name]

@@ -124,7 +114,7 @@ def _get_tokenizer_for_request(self, request: Dict[str, Any]) -> WrappedPreTrain
         return HuggingFaceTokenizer.get_tokenizer(
             helm_tokenizer_name=request["tokenizer"],
             pretrained_model_name_or_path=pretrained_model_name_or_path,
-            revision=self._revision,
+            **self._kwargs,
         )

     def _tokenize_do_it(self, request: Dict[str, Any]) -> Dict[str, Any]:
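Finally, a hedged sketch of the tokenizer-side counterpart (the tokenizer name and revision are placeholder values; WrappedPreTrainedTokenizer is used as a context manager, matching the usage shown in the first hunk above):

from helm.proxy.tokenizers.huggingface_tokenizer import HuggingFaceTokenizer

# Loads and caches the tokenizer, forwarding extra kwargs to AutoTokenizer.from_pretrained().
wrapped = HuggingFaceTokenizer.get_tokenizer(
    helm_tokenizer_name="EleutherAI/pythia-70m",            # placeholder HELM tokenizer name
    pretrained_model_name_or_path="EleutherAI/pythia-70m",  # placeholder Hugging Face repo
    revision="main",
)
with wrapped as tokenizer:
    print(tokenizer.tokenize("Hello world"))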