# Copyright 2022 MosaicML Examples authors
# SPDX-License-Identifier: Apache-2.0

import os
from typing import Union

from composer import algorithms
from composer.callbacks import (HealthChecker, LRMonitor, MemoryMonitor,
                                OptimizerMonitor, RuntimeEstimator,
                                SpeedMonitor)
from composer.core import Evaluator
from composer.datasets.in_context_learning_evaluation import \
    get_icl_task_dataloader
from composer.loggers import WandBLogger
from composer.optim import DecoupledAdamW
from composer.optim.scheduler import (ConstantWithWarmupScheduler,
                                      CosineAnnealingWithWarmupScheduler,
                                      LinearWithWarmupScheduler)
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

from examples.common.fdiff import FDiffMetrics
from examples.common.generate_callback import Generate
from examples.common.monolithic_ckpt_callback import MonolithicCheckpointSaver
from examples.common.optim import (DecoupledAdaLRLion, DecoupledClipLion,
                                   DecoupledLionW)
from examples.common.resumption_callbacks import GlobalLRScaling, LayerFreezing
from examples.common.scheduled_gc_callback import ScheduledGarbageCollector
from examples.common.text_data import build_text_dataloader

# A tokenizer from either the slow or fast HuggingFace implementation.
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]

def build_callback(name, kwargs):
    if name == 'lr_monitor':
        return LRMonitor()
    elif name == 'memory_monitor':
        return MemoryMonitor()
    elif name == 'speed_monitor':
        return SpeedMonitor(window_size=kwargs.get('window_size', 1),
                            gpu_flops_available=kwargs.get(
                                'gpu_flops_available', None))
    elif name == 'fdiff':
        return FDiffMetrics(**kwargs)
    elif name == 'runtime_estimator':
        return RuntimeEstimator()
    elif name == 'optimizer_monitor':
        return OptimizerMonitor(log_optimizer_metrics=kwargs.get(
            'log_optimizer_metrics', True))
    elif name == 'health_checker':
        return HealthChecker(**kwargs)
    elif name == 'generate_callback':
        prompts = kwargs.pop('prompts')
        return Generate(prompts=list(prompts), **kwargs)
    elif name == 'global_lr_scaling':
        return GlobalLRScaling(**kwargs)
    elif name == 'layer_freezing':
        return LayerFreezing(**kwargs)
    elif name == 'mono_ckpt_saver':
        return MonolithicCheckpointSaver(**kwargs)
    elif name == 'scheduled_gc':
        return ScheduledGarbageCollector(**kwargs)
    else:
        raise ValueError(f'Not sure how to build callback: {name}')
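
# Illustrative usage sketch (not part of the original file): a training
# script would typically map a `callbacks` section of its OmegaConf config
# through this builder. The `callbacks` key is an assumed config shape,
# not something this module requires.
#
#   callbacks = [
#       build_callback(name, callback_cfg or {})
#       for name, callback_cfg in (cfg.get('callbacks') or {}).items()
#   ]
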
def build_logger(name, kwargs):
    if name == 'wandb':
        return WandBLogger(**kwargs)
    else:
        raise ValueError(f'Not sure how to build logger: {name}')
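
# Illustrative sketch (assumed config shape): with a config section such as
# `loggers: {wandb: {project: my-project}}`, loggers would be built as
#
#   loggers = [
#       build_logger(name, logger_cfg)
#       for name, logger_cfg in (cfg.get('loggers') or {}).items()
#   ]
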
def build_algorithm(name, kwargs):
    if name == 'gradient_clipping':
        return algorithms.GradientClipping(**kwargs)
    elif name == 'alibi':
        return algorithms.Alibi(**kwargs)
    elif name == 'fused_layernorm':
        return algorithms.FusedLayerNorm(**kwargs)
    elif name == 'gated_linear_units':
        return algorithms.GatedLinearUnits(**kwargs)
    elif name == 'low_precision_layernorm':
        return algorithms.LowPrecisionLayerNorm(**kwargs)
    else:
        raise ValueError(f'Not sure how to build algorithm: {name}')
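
# Illustrative sketch (assumed config shape): surgery algorithms are listed
# by name, with kwargs forwarded to the Composer class, e.g.
# `algorithms: {gradient_clipping: {clipping_type: norm, clipping_threshold: 1.0}}`:
#
#   algos = [
#       build_algorithm(name, algo_cfg)
#       for name, algo_cfg in (cfg.get('algorithms') or {}).items()
#   ]
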
def build_optimizer(cfg, model):
    if cfg.name == 'decoupled_adamw':
        return DecoupledAdamW(model.parameters(),
                              lr=cfg.lr,
                              betas=cfg.betas,
                              eps=cfg.eps,
                              weight_decay=cfg.weight_decay)
    elif cfg.name == 'decoupled_lionw':
        return DecoupledLionW(model.parameters(),
                              lr=cfg.lr,
                              betas=cfg.betas,
                              weight_decay=cfg.weight_decay)
    elif cfg.name == 'clip_lion':
        return DecoupledClipLion(model.parameters(),
                                 lr=cfg.lr,
                                 betas=cfg.betas,
                                 weight_decay=cfg.weight_decay,
                                 outlier_threshold=cfg.outlier_threshold)
    elif cfg.name == 'adalr_lion':
        return DecoupledAdaLRLion(model.parameters(),
                                  lr=cfg.lr,
                                  betas=cfg.betas,
                                  weight_decay=cfg.weight_decay,
                                  outlier_threshold=cfg.outlier_threshold,
                                  timeout=cfg.timeout,
                                  lr_penalty=cfg.lr_penalty,
                                  min_scale=cfg.min_scale)
    else:
        raise ValueError(f'Not sure how to build optimizer: {cfg.name}')
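
# Illustrative sketch: a minimal config for the `decoupled_adamw` branch
# above. The hyperparameter values are placeholders, not recommendations.
#
#   optimizer = build_optimizer(
#       om.create({
#           'name': 'decoupled_adamw',
#           'lr': 1.0e-4,
#           'betas': [0.9, 0.95],
#           'eps': 1.0e-8,
#           'weight_decay': 0.0,
#       }), model)
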
def build_scheduler(cfg):
    if cfg.name == 'constant_with_warmup':
        return ConstantWithWarmupScheduler(t_warmup=cfg.t_warmup)
    elif cfg.name == 'cosine_with_warmup':
        return CosineAnnealingWithWarmupScheduler(t_warmup=cfg.t_warmup,
                                                  alpha_f=cfg.alpha_f)
    elif cfg.name == 'linear_decay_with_warmup':
        return LinearWithWarmupScheduler(t_warmup=cfg.t_warmup,
                                         alpha_f=cfg.alpha_f)
    else:
        raise ValueError(f'Not sure how to build scheduler: {cfg.name}')
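
# Illustrative sketch: `t_warmup` accepts Composer time strings such as
# '100ba' (batches) or '1ep' (epochs); `alpha_f` is the final LR multiplier.
# The values shown are placeholders.
#
#   scheduler = build_scheduler(
#       om.create({
#           'name': 'cosine_with_warmup',
#           't_warmup': '100ba',
#           'alpha_f': 0.1,
#       }))
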
def build_tokenizer(om_tokenizer_config: DictConfig) -> Tokenizer:
    os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'

    resolved_om_tokenizer_config = om.to_container(om_tokenizer_config,
                                                   resolve=True)
    tokenizer_kwargs = resolved_om_tokenizer_config.get(  # type: ignore
        'kwargs', {})
    tokenizer_name = resolved_om_tokenizer_config['name']  # type: ignore
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name,
                                              **tokenizer_kwargs)

    # HuggingFace does not respect the model_max_length kwarg, and overrides
    # it with min(kwargs['model_max_length'], original_config['model_max_length']),
    # so we explicitly set it here
    tokenizer.model_max_length = tokenizer_kwargs.get(
        'model_max_length',
        int(1e30),
    )

    return tokenizer
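
# Illustrative sketch: `name` is any HuggingFace tokenizer identifier and
# `kwargs` are forwarded to `AutoTokenizer.from_pretrained`; the values here
# are placeholders.
#
#   tokenizer = build_tokenizer(
#       om.create({'name': 'gpt2', 'kwargs': {'model_max_length': 2048}}))
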
def build_dataloader(cfg, tokenizer, device_batch_size):
    if cfg.name == 'text':
        return build_text_dataloader(cfg, tokenizer, device_batch_size)
    else:
        raise ValueError(f'Not sure how to build dataloader with config: {cfg}')
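
# Illustrative sketch (assumed variable names): the `text` branch forwards
# the config to `build_text_dataloader`, whose remaining keys (dataset
# location, sequence length, etc.) are defined in `examples.common.text_data`.
#
#   train_loader = build_dataloader(cfg.train_loader, tokenizer,
#                                   device_train_batch_size)
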
def build_icl_evaluators(cfg, tokenizer):
    evaluators = []
    logger_keys = []

    def _validate_cfg(icl_cfg):
        assert 'dataset_uri' in icl_cfg and icl_cfg.dataset_uri is not None
        assert 'icl_task_type' in icl_cfg
        assert 'num_fewshot' in icl_cfg
        assert 'batch_size' in icl_cfg
        assert 'metric_names' in icl_cfg
        assert 'prompt_string' in icl_cfg
        assert 'example_delimiter' in icl_cfg
        assert 'continuation_delimiter' in icl_cfg
        assert 'label' in icl_cfg

    for icl_cfg in cfg.icl_tasks:
        _validate_cfg(icl_cfg)
        for num_fewshot in list(icl_cfg.num_fewshot):
            if tokenizer.pad_token_id is None:
                # Current workaround to support GPT2 tokenizer with `pad_token_id = None`
                pad_tok_id = tokenizer.eos_token_id
            else:
                pad_tok_id = tokenizer.pad_token_id
            label = f'{icl_cfg.label}/{num_fewshot}-shot'
            metric_names = list(icl_cfg.metric_names)
            dataloader = get_icl_task_dataloader(
                icl_cfg.icl_task_type,
                icl_cfg.dataset_uri,
                tokenizer,
                batch_size=icl_cfg.batch_size,
                max_seq_len=cfg.max_seq_len,
                pad_tok_id=pad_tok_id,
                num_fewshot=num_fewshot,
                prompt_string=icl_cfg.prompt_string,
                example_delimiter=icl_cfg.example_delimiter,
                continuation_delimiter=icl_cfg.continuation_delimiter,
                destination_path=f'{icl_cfg.label}-{num_fewshot}.jsonl',
            )
            logger_keys.extend([f'metrics/{label}/{m}' for m in metric_names])
            evaluators.append(
                Evaluator(label=label,
                          dataloader=dataloader,
                          metric_names=metric_names))

    return evaluators, logger_keys
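
# Illustrative sketch: each entry of `cfg.icl_tasks` describes one in-context
# learning task, and because `num_fewshot` is a list, one Evaluator is built
# per shot count. The URI, metric name, and other values below are
# placeholders, not real data or recommendations.
#
#   evaluators, logger_keys = build_icl_evaluators(
#       om.create({
#           'icl_tasks': [{
#               'label': 'lambada',
#               'dataset_uri': './data/lambada.jsonl',
#               'icl_task_type': 'language_modeling',
#               'num_fewshot': [0],
#               'batch_size': 4,
#               'metric_names': ['InContextLearningLMAccuracy'],
#               'prompt_string': '',
#               'example_delimiter': '\n',
#               'continuation_delimiter': ' ',
#           }],
#           'max_seq_len': 2048,
#       }), tokenizer)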