diff --git "a/MiniCPM-2-V/01minicpm-2-v\345\244\232\346\250\241\346\200\201\345\276\256\350\260\203\350\256\262\350\247\243.md" "b/MiniCPM-2-V/01minicpm-2-v\345\244\232\346\250\241\346\200\201\345\276\256\350\260\203\350\256\262\350\247\243.md" new file mode 100644 index 00000000..b285ab2f --- /dev/null +++ "b/MiniCPM-2-V/01minicpm-2-v\345\244\232\346\250\241\346\200\201\345\276\256\350\260\203\350\256\262\350\247\243.md" @@ -0,0 +1,233 @@ +# MiniCPM-2-V transformers 部署微调和应用 + +MiniCPM-V 2.0 是一个高效的多模态大型语言模型,具备 2.8B 参数。该模型在多个基准测试中表现出色,包括 OCRBench、TextVQA、MME 等,超越了许多参数量更大的模型。MiniCPM-V 2.0 具有以下特点: + +1. **性能卓越**:在多项基准测试中达到最先进水平,尤其在场景文字理解上表现出色,与 Gemini Pro 相当。 +2. **可靠行为**:通过多模态 RLHF 技术(多模态强化学习人类反馈),确保生成内容的可信度,匹配 GPT-4V 的防幻觉能力。 +3. **高分辨率图像处理**:支持任意长宽比的高分辨率图像输入,提升细粒度视觉信息感知能力。 +4. **高效部署**:能够在多数 GPU 卡和个人电脑上高效运行,甚至可在移动设备上运行。 +5. **双语支持**:具备强大的中英文多模态能力,支持跨语言多模态应用。 + +模型可以在 NVIDIA GPU 或 Mac 的 MPS 上进行推理,并通过 vLLM 实现高效推理。详细的安装和使用指南请参考 [GitHub 仓库](https://github.com/OpenBMB/MiniCPM-V)。 + +MiniCPM-V 2.0 完全开源,免费供学术研究使用,并在填写问卷后免费用于商业用途。有关模型的更多信息和技术细节,请访问 [技术博客](https://openbmb.vercel.app/minicpm-v-2)。 + +可以通过如下的方式推理: + + +```python +from chat import MiniCPMVChat, img2base64 +import torch +import json + +torch.manual_seed(0) + +chat_model = MiniCPMVChat('openbmb/MiniCPM-V-2') + +im_64 = img2base64('./assets/airplane.jpeg') + +# First round chat +msgs = [{"role": "user", "content": "Tell me the model of this aircraft."}] + +inputs = {"image": im_64, "question": json.dumps(msgs)} +answer = chat_model.chat(inputs) +print(answer) + +# Second round chat +# pass history context of multi-turn conversation +msgs.append({"role": "assistant", "content": answer}) +msgs.append({"role": "user", "content": "Introduce something about Airbus A380."}) + +inputs = {"image": im_64, "question": json.dumps(msgs)} +answer = chat_model.chat(inputs) +print(answer) +``` + +### 数据准备 + +请将数据准备为如下的json格式,对于多模态图像,需要设置图像路径,支持多轮对话,但是每轮只能使用一张图片。 + + +```python + [ + { + "id": "0", + "image": 'path/to/image_0.jpg', + "conversations": [ + { + 'role': 'user', + 'content': '\nHow many desserts are on the white plate?' + }, + { + 'role': 'assistant', + 'content': 'There are three desserts on the white plate.' + }, + { + 'role': 'user', + 'content': 'What type of desserts are they?' + }, + { + 'role': 'assistant', + 'content': 'The desserts are cakes with bananas and pecans on top. They share similarities with donuts, but the presence of bananas and pecans differentiates them.' + }, + { + 'role': 'user', + 'content': 'What is the setting of the image?'}, + { + 'role': 'assistant', + 'content': 'The image is set on a table top with a plate containing the three desserts.' + }, + ] + }, + ] +``` + +在训练 MiniCPM-V 2.0 模型时,可以使用 `finetune_lora.sh` 脚本。根据验证,最小的训练资源需求为3张RTX 3090显卡,同时需要使用 `cpu_low_memo` 的 `ds_config_zero2.json` 配置文件,因此需要依赖 DeepSpeed 框架。在文件开头需要进行如下设置: + + +### 设置步骤 + +1. **DeepSpeed 配置文件**:需要准备 `ds_config_zero2.json` 配置文件,设置为低内存使用模式(`cpu_low_memo`)。 +2. **多GPU设置**:在脚本开头指定使用3张RTX 3090显卡进行训练。 +3. 
**依赖安装**:确保环境中已经安装了 DeepSpeed,并正确配置路径和依赖。 + + +```python +#!/bin/bash + +export CUDA_VISIBLE_DEVICES=0,1,2 + +# 设置 HF_HOME 环境变量 设置下载路径 +export HF_HOME=/home/data/username/hf-models/ +export HF_ENDPOINT=https://hf-mirror.com + +GPUS_PER_NODE=3 +NNODES=1 +NODE_RANK=0 +MASTER_ADDR=localhost +MASTER_PORT=6001 + + +MODEL="openbmb/MiniCPM-V-2" # or openbmb/MiniCPM-V-2 +DATA="./data/train_en_train.json" # json file +EVAL_DATA="./data/train_zh_train.json" # json file +LLM_TYPE="minicpm" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm +``` + + +```python +!sh finetune_lora.sh +``` + + + /home/data/ckw/micromamba/envs/kewei-ai/lib/python3.12/site-packages/transformers/training_args.py:1474: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( + [2024-06-08 09:05:58,136] [INFO] [comm.py:637:init_distributed] cdb=None + [2024-06-08 09:05:58,136] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl + /home/data/ckw/micromamba/envs/kewei-ai/lib/python3.12/site-packages/transformers/training_args.py:1474: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( + [2024-06-08 09:05:58,153] [INFO] [comm.py:637:init_distributed] cdb=None + /home/data/ckw/micromamba/envs/kewei-ai/lib/python3.12/site-packages/transformers/training_args.py:1474: FutureWarning: `evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead + warnings.warn( + [2024-06-08 09:05:58,205] [INFO] [comm.py:637:init_distributed] cdb=None + The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. + The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. + The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored. + Loading checkpoint shards: 100%|██████████████████| 2/2 [00:08<00:00, 4.06s/it] + Loading checkpoint shards: 100%|██████████████████| 2/2 [00:08<00:00, 4.08s/it] + Loading checkpoint shards: 100%|██████████████████| 2/2 [00:08<00:00, 4.17s/it] + max_steps is given, it will override any value given in num_train_epochs + Currently using LoRA for fine-tuning the MiniCPM-V model. + max_steps is given, it will override any value given in num_train_epochs + {'Total': 3458558752, 'Trainable': 733677856} + llm_type=minicpm + Loading data... 
+    max_steps is given, it will override any value given in num_train_epochs
+      0%|          | 0/998 [00:00<?, ?it/s]
+
+可以看到,三张 RTX 3090 上的 LoRA 微调已经正常启动。下面我们换到 AutoDL 平台,完整演示一遍租用服务器、配置环境、下载模型、微调和部署的流程。
+
+## 租用服务器与环境配置
+
+在 AutoDL 平台租用一台 GPU 服务器,镜像选择 python3.12(ubuntu22.04)->cuda12.1,我们后期直接通过 mamba 包管理工具创建环境即可。
+
+![](assets/2024-06-12-10-09-28-image.png)
+
+![](assets/2024-06-12-10-12-18-image.png)
+
+接下来,我们打开刚刚租用服务器的 JupyterLab,如下图所示,然后打开其中的终端,开始环境配置、模型下载和运行演示。我们直接使用代码仓库中的 minicpm-2-v.yaml 创建环境:
+
+```bash
+cd /
+curl -Ls https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj bin/micromamba
+./bin/micromamba shell init -s bash -p /micromamba
+
+vi ~/.bashrc
+# 在 ~/.bashrc 中加上下面一行
+alias mamba=micromamba
+
+mamba env create -f minicpm-2-v.yaml -y
+mamba activate minicpm-2-v
+pip install ipykernel
+python -m ipykernel install --name=minicpm-2-v --display-name minicpm-2-v # 不添加--user属性,可供所有用户使用
+```
+
+![](assets/2024-06-12-10-31-00-image.png)
+
+刷新一下,就能看到我们的环境了。
+
+![](assets/2024-06-12-10-39-54-image.png)
+
+## 模型下载
+
+使用 modelscope 中的 snapshot_download 函数下载模型,第一个参数为模型名称,参数 cache_dir 为模型的下载路径。
+
+在 /root/autodl-tmp 路径下新建 download.py 文件并在其中输入以下内容,粘贴代码后记得保存文件。
+
+download.py 代码如下:
+
+```python
+import torch
+from modelscope import snapshot_download, AutoModel, AutoTokenizer
+from modelscope import GenerationConfig
+model_dir = snapshot_download('openbmb/MiniCPM-V-2', cache_dir='/root/autodl-tmp', revision='master')
+```
+
+保存好后,先配置 pip 镜像并安装依赖,再在终端运行 python /root/autodl-tmp/download.py 执行下载,下载模型需要一些时间。
+
+```bash
+pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
+pip install modelscope transformers
+python /root/autodl-tmp/download.py
+```
+
+## 微调环境准备-相关包编译
+
+```bash
+sudo apt-get update
+sudo apt-get install cmake build-essential pkg-config libgoogle-perftools-dev
+mamba activate minicpm-2-v
+git clone https://github.com/google/sentencepiece.git
+cd sentencepiece
+mkdir build
+cd build
+cmake ..
+make -j $(nproc)
+sudo make install
+sudo ldconfig -v
+pip install SentencePiece
+pip install tensorboardX
+```
+
+![](assets/2024-06-12-11-37-43-image.png)
+
+然后是较为麻烦的 DeepSpeed 源码编译。
+
+参考:
+
+[deepspeed使用zero3 + offload报错:AttributeError: 'DeepSpeedCPUAdam' object has no attribute 'ds_opt_adam'](https://blog.csdn.net/qq_44193969/article/details/137051032)
+
+[Installation Details - DeepSpeed](https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
+
+```bash
+pip uninstall deepspeed
+# DS_BUILD_CPU_ADAM=1 pip install deepspeed 这个方法可能有兼容性问题,推荐源码编译
+
+git clone https://github.com/microsoft/DeepSpeed
+cd DeepSpeed/
+
+DS_BUILD_CPU_ADAM=1 python setup.py build_ext -j8 bdist_wheel
+pip install dist/deepspeed-0.14.3+b6e24adb-cp312-cp312-linux_x86_64.whl
+
+pip install timm
+```
+
+## 开启微调
+
+```bash
+git clone https://github.com/Ethan-Chen-plus/self-llm.git
+cd self-llm/MiniCPM-2-V/
+git clone https://github.com/Ethan-Chen-plus/llava-en-zh-2k-mini.git
+mv llava-en-zh-2k-mini data
+cp -r ./data/img ./img
+```
+
+然后在 finetune_lora.sh 里面设置:
+
+```text
+MODEL="/root/autodl-tmp/openbmb/MiniCPM-V-2" # or openbmb/MiniCPM-V-2
+DATA="./data/sample_50_train.json" # json file
+EVAL_DATA="./data/sample_10_test.json" # json file
+LLM_TYPE="minicpm" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm
+```
+
+```bash
+sh finetune_lora.sh
+```
+
+![](assets/2024-06-12-15-04-16-image.png)
+
+![](assets/2024-06-12-15-06-17-image.png)
+
+![](assets/2024-06-12-16-00-34-image.png)
+
+## 部署代码准备
+
+在`/root/autodl-tmp`路径下新建 `chatBot.py` 文件并在其中输入以下内容,粘贴代码后记得保存文件。下面的代码有很详细的注释,大家如有不理解的地方,欢迎提出 issue。
+
+chatBot.py 代码如下:
+
+```python
+import streamlit as st # 导入 Streamlit 库,用于构建网页应用
+from PIL import Image # 导入 PIL 库中的 Image 模块,用于处理图像
+import torch # 导入 PyTorch 库,用于深度学习模型
+from 
transformers import AutoModel, AutoTokenizer # 导入 transformers 库中的 AutoModel 和 AutoTokenizer,用于加载预训练模型和分词器 +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Union, Literal, Tuple +from peft import LoraConfig, get_peft_model, TaskType + +# 模型路径 +model_path = "/root/autodl-tmp/openbmb/MiniCPM-V-2" +path_to_adapter = "/root/self-llm/MiniCPM-2-V/output/output_minicpmv2_lora" +# 用户和助手的名称 +U_NAME = "User" +A_NAME = "Assistant" + +# 设置页面配置 +st.set_page_config( + page_title="💬MiniCPM-V-2 Streamlit", # 页面标题 + page_icon=":robot:", # 页面图标 + layout="wide" # 页面布局为宽屏 +) + +@dataclass +class LoraArguments: + lora_r: int = 64 + lora_alpha: int = 64 + lora_dropout: float = 0.05 + lora_target_modules: str = r"llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj)" + lora_weight_path: str = "" + lora_bias: str = "none" + q_lora: bool = False + lora_modules_to_save: str = "" + lora_layer_replication: Optional[List[Tuple[int, int]]] = None + lora_layers_to_transform: Optional[List[int]] = None + lora_layers_pattern: Optional[str] = None + +from peft import LoraConfig, get_peft_model, TaskType + +def load_lora_config(model): + config = LoraConfig( + task_type=TaskType.CAUSAL_LM, + inference_mode=False, + r=64, + lora_alpha=64, + lora_dropout=0.05, + target_modules=r"llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj)" + ) + return get_peft_model(model, config) + +# 加载模型和分词器的函数,并缓存结果以提高性能 +@st.cache_resource +def load_model_and_tokenizer(): + print(f"load_model_and_tokenizer from {model_path}") + # 从预训练模型路径加载模型和分词器,并将模型加载到 CUDA 设备上(如果可用) + model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(device="cuda") + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + model = load_lora_config(model) + vpm_resampler_embedtokens_weight = torch.load(f"{path_to_adapter}/vpm_resampler_embedtokens.pt") + msg = model.load_state_dict(vpm_resampler_embedtokens_weight, strict=False) + return model, tokenizer + +# 初始化会话状态 +if 'model' not in st.session_state: + st.session_state.model, st.session_state.tokenizer = load_model_and_tokenizer() + st.session_state.model.eval() # 将模型设置为评估模式 + print("model and tokenizer had loaded completed!") + +# 初始化聊天记录的会话状态 +if 'chat_history' not in st.session_state: + st.session_state.chat_history = [] + +# 侧边栏设置 +sidebar_name = st.sidebar.title("💬MiniCPM-V-2 Streamlit-powered by self-llm") +# 在侧边栏创建滑块,用于设置生成文本的最大长度、重复惩罚、top_p、top_k 和温度 +max_length = st.sidebar.slider("max_length", 0, 4096, 2048, step=2) +repetition_penalty = st.sidebar.slider("repetition_penalty", 0.0, 2.0, 1.05, step=0.01) +top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01) +top_k = st.sidebar.slider("top_k", 0, 100, 100, step=1) +temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.7, step=0.01) +# 在侧边栏中创建一个标题和一个链接 +with st.sidebar: + st.markdown("## MiniCPM LLM") + "[开源大模型食用指南 self-llm](https://github.com/datawhalechina/self-llm.git)" + "[开源大模型架构教程 llms-from-scratch](https://github.com/datawhalechina/llms-from-scratch-cn.git)" + +# 清除聊天记录的按钮 +buttonClean = st.sidebar.button("Clear chat history", key="clean") +if buttonClean: + st.session_state.chat_history = [] # 清空聊天记录 + st.session_state.response = "" + if torch.cuda.is_available(): + torch.cuda.empty_cache() # 清空 CUDA 缓存 + st.rerun() # 重新运行页面 + +# 显示聊天记录 +for i, message in enumerate(st.session_state.chat_history): + if message["role"] == "user": + # 如果消息是用户的,显示用户的消息 + with st.chat_message(name="user", avatar="user"): + if message["image"] is not None: + st.image(message["image"], 
caption='User uploaded image', width=448, use_column_width=False) + continue + elif message["content"] is not None: + st.markdown(message["content"]) + else: + # 如果消息是助手的,显示助手的消息 + with st.chat_message(name="model", avatar="assistant"): + st.markdown(message["content"]) + +# 选择模式 +selected_mode = st.sidebar.selectbox("Select mode", ["Text", "Image"]) +if selected_mode == "Image": + # 图片模式 + uploaded_image = st.sidebar.file_uploader("Upload image", key=1, type=["jpg", "jpeg", "png"], accept_multiple_files=False) + if uploaded_image is not None: + st.image(uploaded_image, caption='User uploaded image', width=468, use_column_width=False) + # 将上传的图片添加到聊天记录中 + st.session_state.chat_history.append({"role": "user", "content": None, "image": uploaded_image}) + +# 用户输入框 +user_text = st.chat_input("Enter your question") +if user_text: + with st.chat_message(U_NAME, avatar="user"): + # 将用户输入的文本添加到聊天记录中 + st.session_state.chat_history.append({"role": "user", "content": user_text, "image": None}) + st.markdown(f"{U_NAME}: {user_text}") + + # 使用模型生成回复 + model = st.session_state.model + tokenizer = st.session_state.tokenizer + + with st.chat_message(A_NAME, avatar="assistant"): + # 如果前一条消息包含图片,将图片传递给模型 + if len(st.session_state.chat_history) > 1 and st.session_state.chat_history[-2]["image"] is not None: + uploaded_image = st.session_state.chat_history[-2]["image"] + imagefile = Image.open(uploaded_image).convert('RGB') + + msgs = [{"role": "user", "content": user_text}] + # 使用模型生成回复文本 + res = model.chat(image=imagefile, msgs=msgs, context=None, tokenizer=tokenizer, + sampling=True, top_p=top_p, top_k=top_k, repetition_penalty=repetition_penalty, + temperature=temperature, stream=True) + + # 收集生成的文本字符串 + generated_text = st.write_stream(res) + + st.session_state.chat_history.append({"role": "model", "content": generated_text, "image": None}) + + st.divider() # 添加分割线 +``` + +## 运行demo + +在终端中运行以下命令,启动streamlit服务 + +``` +streamlit run /root/autodl-tmp/chatBot.py --server.address 127.0.0.1 --server.port 6006 +``` + +点击自定义服务 + +![](assets/2024-06-12-16-37-45-image.png) + +点开linux + +![](assets/2024-06-12-16-38-35-image.png) + +然后win+R打开powershell + +``` +ssh -CNg -L 6006:127.0.0.1:6006 root@connect.yza1.seetacloud.com -p 39423 +``` + +输入ssh与密码,按下回车至这样即可,保持命令行界面为开启状态 + +![](assets/2024-06-12-16-39-35-image.png) + +在浏览器中打开链接 http://localhost:6006/ ,即可看到聊天界面。运行效果如下: + +![](assets/2024-06-12-16-40-17-image.png) + +我上传的图片如下 + +![](assets/2024-06-12-17-02-09-联想截图_20240528180602.png) + +![](assets/2024-06-12-17-05-31-image.png) + +可以看到效果还是很不错的。那么我们这期内容就到这里了。如果想要深入了解模型的原理,可以访问我们的仓库:[datawhalechina/llms-from-scratch-cn](https://github.com/datawhalechina/llms-from-scratch-cn/) + + diff --git a/MiniCPM-2-V/assets/2024-06-12-10-08-47-image.png b/MiniCPM-2-V/assets/2024-06-12-10-08-47-image.png new file mode 100644 index 00000000..e4debcc5 Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-10-08-47-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-10-09-28-image.png b/MiniCPM-2-V/assets/2024-06-12-10-09-28-image.png new file mode 100644 index 00000000..e4debcc5 Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-10-09-28-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-10-12-18-image.png b/MiniCPM-2-V/assets/2024-06-12-10-12-18-image.png new file mode 100644 index 00000000..a3f5653d Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-10-12-18-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-10-31-00-image.png b/MiniCPM-2-V/assets/2024-06-12-10-31-00-image.png new file 
mode 100644 index 00000000..356aa358 Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-10-31-00-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-10-39-54-image.png b/MiniCPM-2-V/assets/2024-06-12-10-39-54-image.png new file mode 100644 index 00000000..6d983b2a Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-10-39-54-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-11-37-43-image.png b/MiniCPM-2-V/assets/2024-06-12-11-37-43-image.png new file mode 100644 index 00000000..6738fddc Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-11-37-43-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-15-04-16-image.png b/MiniCPM-2-V/assets/2024-06-12-15-04-16-image.png new file mode 100644 index 00000000..ee38a845 Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-15-04-16-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-15-06-17-image.png b/MiniCPM-2-V/assets/2024-06-12-15-06-17-image.png new file mode 100644 index 00000000..ec2853dd Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-15-06-17-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-16-00-34-image.png b/MiniCPM-2-V/assets/2024-06-12-16-00-34-image.png new file mode 100644 index 00000000..ef103f3a Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-16-00-34-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-16-37-45-image.png b/MiniCPM-2-V/assets/2024-06-12-16-37-45-image.png new file mode 100644 index 00000000..09390073 Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-16-37-45-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-16-38-35-image.png b/MiniCPM-2-V/assets/2024-06-12-16-38-35-image.png new file mode 100644 index 00000000..22cfaca3 Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-16-38-35-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-16-39-35-image.png b/MiniCPM-2-V/assets/2024-06-12-16-39-35-image.png new file mode 100644 index 00000000..5953024d Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-16-39-35-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-16-40-17-image.png b/MiniCPM-2-V/assets/2024-06-12-16-40-17-image.png new file mode 100644 index 00000000..8078b129 Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-16-40-17-image.png differ diff --git a/MiniCPM-2-V/assets/2024-06-12-16-59-50-image.png b/MiniCPM-2-V/assets/2024-06-12-16-59-50-image.png new file mode 100644 index 00000000..377b17cd Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-16-59-50-image.png differ diff --git "a/MiniCPM-2-V/assets/2024-06-12-17-02-09-\350\201\224\346\203\263\346\210\252\345\233\276_20240528180602.png" "b/MiniCPM-2-V/assets/2024-06-12-17-02-09-\350\201\224\346\203\263\346\210\252\345\233\276_20240528180602.png" new file mode 100644 index 00000000..8847f5b9 Binary files /dev/null and "b/MiniCPM-2-V/assets/2024-06-12-17-02-09-\350\201\224\346\203\263\346\210\252\345\233\276_20240528180602.png" differ diff --git a/MiniCPM-2-V/assets/2024-06-12-17-05-31-image.png b/MiniCPM-2-V/assets/2024-06-12-17-05-31-image.png new file mode 100644 index 00000000..32d7ad33 Binary files /dev/null and b/MiniCPM-2-V/assets/2024-06-12-17-05-31-image.png differ diff --git a/MiniCPM-2-V/data b/MiniCPM-2-V/data new file mode 160000 index 00000000..f2494713 --- /dev/null +++ b/MiniCPM-2-V/data @@ -0,0 +1 @@ +Subproject commit f2494713fb3facf52cd2136a5b4c42f619d366d0 diff --git a/MiniCPM-2-V/dataset.py b/MiniCPM-2-V/dataset.py new file mode 100644 index 00000000..5821e132 --- 
/dev/null +++ b/MiniCPM-2-V/dataset.py @@ -0,0 +1,458 @@ +import copy +import json +import logging +import math +import os +from dataclasses import dataclass, field +from typing import Dict, List, Optional + +import numpy as np +import torch +from PIL import Image +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import Dataset +from transformers import AutoProcessor, AutoTokenizer + +llama3_chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}" + +class SupervisedDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__( + self, + raw_data, + transform, + tokenizer, + slice_config, + llm_type="minicpm", + patch_size=14, + query_nums=64, + batch_vision=False, + ): + super(SupervisedDataset, self).__init__() + self.raw_data = raw_data + self.tokenizer = tokenizer + self.transform = transform + self.slice_config = slice_config + self.llm_type = llm_type + self.patch_size = patch_size + self.query_nums=query_nums + self.batch_vision = batch_vision + + def __len__(self): + return len(self.raw_data) + + def __getitem__(self, i) -> Dict[str, torch.Tensor]: + image = Image.open(self.raw_data[i]["images"][0]).convert("RGB") + ret = preprocess( + image, + self.raw_data[i]["conversations"], + self.tokenizer, + self.transform, + query_nums=self.query_nums, + slice_config=self.slice_config, + llm_type=self.llm_type, + patch_size=self.patch_size, + batch_vision=self.batch_vision, + ) + ret = dict( + input_ids=ret["input_ids"], + position_ids=ret["position_ids"], + labels=ret["target"], + attention_mask=torch.ones_like(ret["input_ids"], dtype=torch.bool), + pixel_values=ret["pixel_values"], + tgt_sizes=ret["tgt_sizes"], + image_bound=ret["image_bound"], + ) + + return ret + +def data_collator(examples, padding_value=0, max_length=2048): + def trim_and_pad(seq, batch_first, padding_value): + return pad_sequence([s[:max_length] for s in seq], batch_first=True, padding_value=padding_value) + + input_ids = trim_and_pad( + [example["input_ids"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + position_ids = trim_and_pad( + [example["position_ids"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + targets = trim_and_pad( + [example["labels"] for example in examples], + batch_first=True, + padding_value=-100, + ) + attention_mask = trim_and_pad( + [example["attention_mask"] for example in examples], + batch_first=True, + padding_value=padding_value, + ) + pixel_values = [example["pixel_values"] for example in examples] + image_bound = [example["image_bound"] for example in examples] + tgt_sizes = [example["tgt_sizes"] for example in examples] + return { + "input_ids": input_ids, + "position_ids": position_ids, + "labels": targets, + "attention_mask": attention_mask, + "image_bound": image_bound, + "tgt_sizes": tgt_sizes, + "pixel_values": pixel_values, + } + + +def conversation_to_ids(conversation, tokenizer, llm_type=None): + """ + for single image multi-turn conversation + conversation: [{'role': 'user', 'content': 'Describe this image'}, + {'role': 'assistant', 'content': 'This is a cat.'}] + """ + if llm_type == "llama3": + input_ids, context, raw_msg = conversation_to_ids_llama3( + conversation, tokenizer + ) + else: + input_ids, context, raw_msg 
= conversation_to_ids_minicpm( + conversation, tokenizer + ) + + ids = torch.from_numpy(np.hstack(input_ids, dtype=np.int32)) + context = torch.from_numpy(np.hstack(context, dtype=np.int8)) + + # build target + target = torch.full_like(ids, -100, dtype=torch.int32) + for i in range(1, len(ids)): + if context[i] == 0: + target[i - 1] = ids[i] + if context[i] == 1 and context[i - 1] == 0: + if hasattr(tokenizer, "eot_id"): + target[i - 1] = tokenizer.eot_id + else: + target[i - 1] = tokenizer.eos_id + + # build image bound + image_start_tokens = torch.where(ids == tokenizer.im_start_id)[0] + image_start_tokens += 1 + image_end_tokens = torch.where(ids == tokenizer.im_end_id)[0] + if len(image_start_tokens) != len(image_end_tokens): + print("image start token != image end tokens") + + if len(image_start_tokens) > 0: + image_bound = torch.hstack( + [image_start_tokens.unsqueeze(-1), image_end_tokens.unsqueeze(-1)] + ) + else: + image_bound = [] + + position_ids = torch.arange(ids.size(0)).long() + return { + "input_ids": ids, + "target": target, + "image_bound": image_bound, + "raw_msg": raw_msg, + "position_ids": position_ids + } + + +def conversation_to_ids_minicpm(conversation, tokenizer): + raw_msg = "" + input_ids = [] + context = [] + for idx, msg in enumerate(conversation): + role = msg["role"] + message = msg["content"] + assert role in ["user", "assistant"] + if role == "user": + prefix = "<用户>" + else: + prefix = "" + # append eos + if idx == len(conversation) - 1: + message = message + tokenizer.eos_token + prefix_ids = tokenizer.encode(prefix)[1:] # remove bos + message_ids = tokenizer.encode(message)[1:] + + input_ids.append(prefix_ids) + input_ids.append(message_ids) + + context.append(np.ones((len(prefix_ids),), dtype=np.int8)) + if role == "assistant": + context.append(np.zeros((len(message_ids),), dtype=np.int8)) + else: + context.append(np.ones((len(message_ids),), dtype=np.int8)) + + raw_msg += prefix + message + + return input_ids, context, raw_msg + + +def conversation_to_ids_llama3(conversation, tokenizer): + raw_msg = "" + input_ids = [] + context = [] + raw_msg = tokenizer.apply_chat_template( + conversation, tokenize=False, add_generation_prompt=False, chat_template=llama3_chat_template, + ) + input_ids = tokenizer.apply_chat_template( + conversation, tokenize=True, add_generation_prompt=False, chat_template=llama3_chat_template, + ) + input_ids = np.array(input_ids) + + start_header_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("<|start_header_id|>") + )[0] + assistant_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("assistant") + )[0] + end_header_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("<|end_header_id|>") + )[0] + eot_idxs = np.where( + input_ids == tokenizer.convert_tokens_to_ids("<|eot_id|>"))[0] + + context = np.ones_like(input_ids, dtype=np.int8) + + for assistant_idx in assistant_idxs: + if assistant_idx in set((start_header_idxs + end_header_idxs) / 2): + st = assistant_idx + 3 # assistant<|end_header_id|>\n\n + for eot_idx in eot_idxs: + if eot_idx > st: + context[st: eot_idx + 1] = 0 + break + + input_ids = np.hstack(input_ids) + context = np.hstack(context) + + return input_ids, context, raw_msg + + +def preprocess( + image, + conversation, + tokenizer, + transform, + query_nums=64, + slice_config=None, + llm_type=None, + patch_size=14, + batch_vision=False, +): + """ + single image preprocess, the image will be placed at the top of the conversation + """ + conversation = copy.deepcopy(conversation) 
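+    # NOTE: the conversation is deep-copied above because the image placeholder is
+    # spliced into the first user message further below; the checks that follow
+    # assume at least one user/assistant round with the user speaking first.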
+ assert len(conversation) > 1, "conversation length must large than 2" + assert conversation[0]["role"] == "user", "the first role must be user" + + if slice_config is not None: + assert isinstance(slice_config, Dict) + assert "patch_size" in slice_config + assert "max_slice_nums" in slice_config + assert "scale_resolution" in slice_config + default_image_placeholder = ( + tokenizer.im_start + tokenizer.unk_token * query_nums + tokenizer.im_end + ) + if slice_config: + images = [] + source_image, patches, best_grid = slice_image( + image, + slice_config["max_slice_nums"], + slice_config["scale_resolution"], + slice_config["patch_size"], + ) + images.append(source_image) + image_placeholder = default_image_placeholder + if len(patches) > 0: + for i in range(len(patches)): + for j in range(len(patches[0])): + images.append(patches[i][j]) + + image_placeholder += get_grid_placeholder( + tokenizer, best_grid, query_nums) + images = [transform(i) for i in images] + else: + images = [transform(image)] + image_placeholder = default_image_placeholder + if "" in conversation[0]["content"]: + conversation[0]["content"] = conversation[0]["content"].replace( + "", image_placeholder + ) + else: + conversation[0]["content"] = ( + image_placeholder + "\n" + conversation[0]["content"] + ) + + input_dict = conversation_to_ids(conversation, tokenizer, llm_type) + + if batch_vision: + tgt_sizes = [] + reshape_images = [] + for image in images: + H, W = image.shape[1:] + reshape_image = reshape_by_patch(image, patch_size) + reshape_images.append(reshape_image) + tgt_sizes.append([H // patch_size, W // patch_size]) + if tgt_sizes: + tgt_sizes = torch.Tensor(tgt_sizes).type(torch.int32) + + input_dict["pixel_values"] = reshape_images + input_dict["tgt_sizes"] = tgt_sizes + + else: + input_dict["pixel_values"] = images + input_dict["tgt_sizes"] = [] + + return input_dict + + +def slice_image( + image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False +): + original_size = image.size + original_width, original_height = original_size + log_ratio = math.log(original_width / original_height) + ratio = original_width * original_height / \ + (scale_resolution * scale_resolution) + multiple = min(math.ceil(ratio), max_slice_nums) + + source_image = None + best_grid = None + patches = [] + + if multiple <= 1 or never_split: + # dont need to slice, upsample + best_size = find_best_resize( + original_size, scale_resolution, patch_size, allow_upscale=True + ) + source_image = image.resize(best_size, Image.Resampling.BICUBIC) + else: + candidate_split_grids_nums = [] + for i in [multiple - 1, multiple, multiple + 1]: + if i == 1 or i > max_slice_nums: + continue + candidate_split_grids_nums.append(i) + + # source image, down-sampling and ensure divided by patch_size + best_resize = find_best_resize( + original_size, scale_resolution, patch_size) + source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + candidate_grids = [] + + # find best grid + for split_grids_nums in candidate_split_grids_nums: + m = 1 + while m <= split_grids_nums: + if split_grids_nums % m == 0: + candidate_grids.append([m, split_grids_nums // m]) + m += 1 + + best_grid = [1, 1] + min_error = float("inf") + for grid in candidate_grids: + error = abs(log_ratio - math.log(grid[0] / grid[1])) + if error < min_error: + best_grid = grid + min_error = error + + refine_size = get_refine_size( + original_size, best_grid, scale_resolution, patch_size, allow_upscale=True + ) + + refine_image = image.resize(refine_size, 
Image.Resampling.BICUBIC) + patches = split_to_patches(refine_image, best_grid) + + return source_image, patches, best_grid + + +def ensure_divide(length, patch_size): + return max(round(length / patch_size) * patch_size, patch_size) + + +def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False): + width, height = original_size + if (width * height > scale_resolution * scale_resolution) or allow_upscale: + r = width / height + height = int(scale_resolution / math.sqrt(r)) + width = int(height * r) + best_width = ensure_divide(width, patch_size) + best_height = ensure_divide(height, patch_size) + return (best_width, best_height) + + +def get_refine_size( + original_size, grid, scale_resolution, patch_size, allow_upscale=False +): + width, height = original_size + grid_x, grid_y = grid + + refine_width = ensure_divide(width, grid_x) + refine_height = ensure_divide(height, grid_y) + + grid_width = refine_width / grid_x + grid_height = refine_height / grid_y + + best_grid_size = find_best_resize( + (grid_width, grid_height), + scale_resolution, + patch_size, + allow_upscale=allow_upscale, + ) + + refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y) + + return refine_size + + +def split_to_patches(image, grid): + patches = [] + width, height = image.size + grid_x = int(width / grid[0]) + grid_y = int(height / grid[1]) + + for i in range(0, height, grid_y): + images = [] + for j in range(0, width, grid_x): + box = (j, i, j + grid_x, i + grid_y) + patch = image.crop(box) + images.append(patch) + patches.append(images) + + return patches + + +def get_grid_placeholder(tokenizer, grid, query_num): + image_placeholder = ( + tokenizer.im_start + tokenizer.unk_token * query_num + tokenizer.im_end + ) + + cols = grid[0] + rows = grid[1] + slices = [] + for i in range(rows): + lines = [] + for j in range(cols): + lines.append(image_placeholder) + slices.append("".join(lines)) + slice_placeholder = tokenizer.slice_start + \ + "\n".join(slices) + tokenizer.slice_end + return slice_placeholder + + +def reshape_by_patch(image_tensor, patch_size): + """ + :param image_tensor: shape [3, H, W] + :param patch_size: + :return: [3, patch_size, HW/patch_size] + """ + patches = torch.nn.functional.unfold( + image_tensor, (patch_size, patch_size), stride=(patch_size, patch_size) + ) + + patches = patches.reshape(image_tensor.size(0), patch_size, patch_size, -1) + patches = patches.permute(0, 1, 3, 2).reshape( + image_tensor.size(0), patch_size, -1) + return patches diff --git a/MiniCPM-2-V/ds_config_zero2.json b/MiniCPM-2-V/ds_config_zero2.json new file mode 100644 index 00000000..b2f8d675 --- /dev/null +++ b/MiniCPM-2-V/ds_config_zero2.json @@ -0,0 +1,54 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + + "bf16": { + "enabled": "auto" + }, + + "optimizer": { + "type": "Adam", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true + }, + + "gradient_accumulation_steps": "auto", + 
"gradient_clipping": "auto", + "steps_per_print": 100, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/MiniCPM-2-V/ds_config_zero3.json b/MiniCPM-2-V/ds_config_zero3.json new file mode 100644 index 00000000..5c3cd9c9 --- /dev/null +++ b/MiniCPM-2-V/ds_config_zero3.json @@ -0,0 +1,61 @@ + +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "none", + "pin_memory": true + }, + "offload_param": { + "device": "none", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + }, + + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 100, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} + diff --git a/MiniCPM-2-V/finetune.py b/MiniCPM-2-V/finetune.py new file mode 100644 index 00000000..e518d315 --- /dev/null +++ b/MiniCPM-2-V/finetune.py @@ -0,0 +1,331 @@ +import glob +import json +import logging +import os +from dataclasses import dataclass, field +from functools import partial +from typing import Dict, List, Optional, Union, Literal, Tuple +from types import MethodType +import torch +import transformers +from accelerate.utils import DistributedType +from deepspeed import zero +from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus + +from transformers import AutoModel, AutoTokenizer +from transformers.integrations import deepspeed +from transformers import AutoModel, AutoTokenizer + +from vlm.modeling_minicpmv import MiniCPMV +from vlm.modeling_minicpmv import LlamaTokenizerWrapper + +from dataset import SupervisedDataset, data_collator +from trainer import CPMTrainer + +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training + +@dataclass +class ModelArguments: + model_name_or_path: Optional[str] = field(default="openbmb/MiniCPM-V-2") + + +@dataclass +class DataArguments: + data_path: str = field( + default=None, metadata={"help": "Path to the training data."} + ) + eval_data_path: str = field( + default=None, metadata={"help": "Path to the evaluation data."} + ) + + +@dataclass +class TrainingArguments(transformers.TrainingArguments): + cache_dir: Optional[str] = field(default=None) + optim: str = field(default="adamw_torch") + model_max_length: int = field( + default=2048, + metadata={ + "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 
+ }, + ) + tune_vision: Optional[bool] = field(default=True) + tune_llm: Optional[bool] = field(default=True) + llm_type: str = field(default="minicpm") + use_lora: Optional[bool] = field(default=False) + max_slice_nums: Optional[int] = field(default=9) + + +@dataclass +class LoraArguments: + lora_r: int = 64 + lora_alpha: int = 64 + lora_dropout: float = 0.05 + lora_target_modules: str = r"llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj|v_proj)" + lora_weight_path: str = "" + lora_bias: str = "none" + q_lora: bool = False + lora_modules_to_save: str = "" + lora_layer_replication: Optional[List[Tuple[int, int]]] = None + lora_layers_to_transform: Optional[List[int]] = None + lora_layers_pattern: Optional[str] = None + +def maybe_zero_3(param): + if hasattr(param, "ds_id"): + assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE + with zero.GatheredParameters([param]): + param = param.data.detach().cpu().clone() + else: + param = param.detach().cpu().clone() + return param + + +# Borrowed from peft.utils.get_peft_model_state_dict +def get_peft_state_maybe_zero_3(named_params, bias): + if bias == "none": + to_return = {k: t for k, t in named_params if "lora_" in k} + elif bias == "all": + to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} + elif bias == "lora_only": + to_return = {} + maybe_lora_bias = {} + lora_bias_names = set() + for k, t in named_params: + if "lora_" in k: + to_return[k] = t + bias_name = k.split("lora_")[0] + "bias" + lora_bias_names.add(bias_name) + elif "bias" in k: + maybe_lora_bias[k] = t + for k, t in maybe_lora_bias: + if bias_name in lora_bias_names: + to_return[bias_name] = t + else: + raise NotImplementedError + to_return = {k: maybe_zero_3(v) for k, v in to_return.items()} + return to_return + + +local_rank = None +def rank0_print(*args): + if local_rank == 0: + print(*args) + + +def safe_save_model_for_hf_trainer(trainer, output_dir: str, bias="none"): + """Collects the state dict and dump to disk.""" + # check if zero3 mode enabled + if deepspeed.is_deepspeed_zero3_enabled(): + state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict() + else: + if trainer.args.use_lora: + state_dict = get_peft_state_maybe_zero_3( + trainer.model.named_parameters(), bias + ) + else: + state_dict = trainer.model.state_dict() + if trainer.args.should_save and trainer.args.local_rank == 0: + trainer._save(output_dir, state_dict=state_dict) + + +def make_supervised_data_module( + tokenizer: transformers.PreTrainedTokenizer, + data_args, + transform, + data_collator=None, + llm_type="minicpm", + slice_config=None, + patch_size=14, + query_nums=64, + batch_vision=False, + max_length=2048, +) -> Dict: + """Make dataset and collator for supervised fine-tuning.""" + dataset_cls = SupervisedDataset + + rank0_print("Loading data...") + + train_json = json.load(open(data_args.data_path, "r")) + train_dataset = dataset_cls( + train_json, + transform, + tokenizer, + slice_config=slice_config, + llm_type=llm_type, + patch_size=patch_size, + query_nums=query_nums, + batch_vision=batch_vision, + ) + + if data_args.eval_data_path: + eval_json = json.load(open(data_args.eval_data_path, "r")) + eval_dataset = dataset_cls( + eval_json, + transform, + tokenizer, + slice_config=slice_config, + llm_type=llm_type, + patch_size=patch_size, + query_nums=query_nums, + batch_vision=batch_vision, + ) + else: + eval_dataset = None + + return dict( + train_dataset=train_dataset, + eval_dataset=eval_dataset, + data_collator= partial(data_collator, 
max_length=max_length), + ) + + +def get_parameter_number(model): + trainable_params, all_param = 0, 0 + for param in model.parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return {'Total': all_param, 'Trainable': trainable_params} + + +local_rank = 0 + + +def train(): + global local_rank + parser = transformers.HfArgumentParser( + (ModelArguments, DataArguments, TrainingArguments, LoraArguments) + ) + + ( + model_args, + data_args, + training_args, + lora_args, + ) = parser.parse_args_into_dataclasses() + + if getattr(training_args, "deepspeed", None) : + training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED + + compute_dtype = ( + torch.float16 + if training_args.fp16 + else (torch.bfloat16 if training_args.bf16 else torch.float32) + ) + + local_rank = training_args.local_rank + world_size = int(os.environ.get("WORLD_SIZE", 1)) + ddp = world_size != 1 + device_map = None + if lora_args.q_lora: + device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else None + if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled(): + logging.warning( + "FSDP or ZeRO3 are not incompatible with QLoRA." + ) + + model = MiniCPMV.from_pretrained( + model_args.model_name_or_path, + trust_remote_code=True, + torch_dtype=compute_dtype, + device_map=device_map, + ) + + tokenizer = LlamaTokenizerWrapper.from_pretrained( + model_args.model_name_or_path, trust_remote_code=True + ) + + if not training_args.tune_vision: + model.vpm.requires_grad_(False) + if not training_args.tune_llm: + model.llm.requires_grad_(False) + + if training_args.use_lora: + if training_args.use_lora and training_args.tune_llm: + raise ValueError("The model cannot simultaneously adjust LLM parameters and apply LoRA.") + + rank0_print("Currently using LoRA for fine-tuning the MiniCPM-V model.") + for name, param in model.llm.named_parameters(): + param.requires_grad = False + lora_config = LoraConfig( + r=lora_args.lora_r, + lora_alpha=lora_args.lora_alpha, + target_modules=lora_args.lora_target_modules, + lora_dropout=lora_args.lora_dropout, + bias=lora_args.lora_bias, + layers_to_transform=lora_args.lora_layers_to_transform, + task_type="CAUSAL_LM", + ) + if not hasattr(model, 'get_input_embeddings'): + def get_input_embeddings(self): + return self.llm.get_input_embeddings() + model.get_input_embeddings = MethodType(get_input_embeddings, model) + if lora_args.q_lora: + model = prepare_model_for_kbit_training( + model, use_gradient_checkpointing=training_args.gradient_checkpointing + ) + model = get_peft_model(model, lora_config) + model.base_model.resampler.requires_grad_(True) + model.base_model.llm.model.embed_tokens.weight.requires_grad_(True) + if training_args.tune_vision: + model.base_model.vpm.requires_grad_(True) + if training_args.gradient_checkpointing: + model.enable_input_require_grads() + + rank0_print(get_parameter_number(model)) + + llm_type = training_args.llm_type + + rank0_print(f'llm_type={llm_type}') + + + # Load data + if hasattr(model.config, "slice_config"): + model.config.slice_config.max_slice_nums = training_args.max_slice_nums + slice_config = model.config.slice_config.to_dict() + else: + model.config.max_slice_nums = training_args.max_slice_nums + slice_config = model.config.to_dict() + + if hasattr(model.config, "batch_vision_input"): + 
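+        # batch_vision_input tells the data pipeline to pre-reshape images into patch
+        # sequences with tgt_sizes (see dataset.preprocess); configs without this field
+        # fall back to False below.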
batch_vision = model.config.batch_vision_input + else: + batch_vision = False + + data_module = make_supervised_data_module( + tokenizer=tokenizer, + data_args=data_args, + transform=model.transform, + data_collator=data_collator, + slice_config=slice_config, + llm_type=llm_type, + patch_size=model.config.patch_size, + query_nums=model.config.query_num, + batch_vision=batch_vision, + max_length=training_args.model_max_length, + ) + + trainer = CPMTrainer( + model=model, + tokenizer=tokenizer, + args=training_args, + **data_module, + ) + + trainer.train() + trainer.save_state() + + safe_save_model_for_hf_trainer( + trainer=trainer, + output_dir=training_args.output_dir, + bias=lora_args.lora_bias) + + +if __name__ == "__main__": + train() diff --git a/MiniCPM-2-V/finetune_lora.sh b/MiniCPM-2-V/finetune_lora.sh new file mode 100644 index 00000000..21cef6bc --- /dev/null +++ b/MiniCPM-2-V/finetune_lora.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +export CUDA_VISIBLE_DEVICES=0,1,2 + +# 设置 HF_HOME 环境变量 设置下载路径 +export HF_HOME=/home/data/username/hf-models/ +export HF_ENDPOINT=https://hf-mirror.com + +GPUS_PER_NODE=3 +NNODES=1 +NODE_RANK=0 +MASTER_ADDR=localhost +MASTER_PORT=6001 + + +MODEL="/root/autodl-tmp/openbmb/MiniCPM-V-2" # or openbmb/MiniCPM-V-2 +DATA="./data/sample_50_train.json" # json file +EVAL_DATA="./data/sample_10_test.json" # json file +LLM_TYPE="minicpm" # if use openbmb/MiniCPM-V-2, please set LLM_TYPE=minicpm + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" +torchrun $DISTRIBUTED_ARGS finetune.py \ + --model_name_or_path $MODEL \ + --llm_type $LLM_TYPE \ + --data_path $DATA \ + --eval_data_path $EVAL_DATA \ + --remove_unused_columns false \ + --label_names "labels" \ + --prediction_loss_only false \ + --bf16 false \ + --bf16_full_eval false \ + --fp16 true \ + --fp16_full_eval true \ + --do_train \ + --do_eval \ + --tune_vision true \ + --tune_llm false \ + --use_lora true \ + --lora_target_modules "llm\..*layers\.\d+\.self_attn\.(q_proj|k_proj)" \ + --model_max_length 2048 \ + --max_slice_nums 9 \ + --max_steps 998 \ + --eval_steps 1000 \ + --output_dir output/output_minicpmv2_lora \ + --logging_dir output/output_minicpmv2_lora \ + --logging_strategy "steps" \ + --per_device_train_batch_size 2 \ + --per_device_eval_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "steps" \ + --save_strategy "steps" \ + --save_steps 1000 \ + --save_total_limit 10 \ + --learning_rate 1e-6 \ + --weight_decay 0.1 \ + --adam_beta2 0.95 \ + --warmup_ratio 0.01 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --gradient_checkpointing true \ + --deepspeed ds_config_zero2.json \ + --report_to "tensorboard" # wandb diff --git a/MiniCPM-2-V/minicpm-2-v.yaml b/MiniCPM-2-V/minicpm-2-v.yaml new file mode 100644 index 00000000..4777c814 --- /dev/null +++ b/MiniCPM-2-V/minicpm-2-v.yaml @@ -0,0 +1,297 @@ +name: minicpm-2-v +channels: +- conda-forge +- nvidia/label/cuda-11.8.0 +- nvidia +- pytorch +dependencies: +- _libgcc_mutex=0.1=conda_forge +- _openmp_mutex=4.5=2_kmp_llvm +- accelerate=0.31.0=pyhd8ed1ab_0 +- aiohttp=3.9.5=py312h98912ed_0 +- aiosignal=1.3.1=pyhd8ed1ab_0 +- alsa-lib=1.2.11=hd590300_1 +- annotated-types=0.7.0=pyhd8ed1ab_0 +- aom=3.9.0=hac33072_0 +- attr=2.5.1=h166bdaf_1 +- attrs=23.2.0=pyh71513ae_0 +- aws-c-auth=0.7.22=h96bc93b_2 +- aws-c-cal=0.6.14=h88a6e22_1 +- aws-c-common=0.9.19=h4ab18f5_0 +- aws-c-compression=0.2.18=h83b837d_6 +- 
aws-c-event-stream=0.4.2=ha47c788_12 +- aws-c-http=0.8.1=h29d6fba_17 +- aws-c-io=0.14.8=h21d4f22_5 +- aws-c-mqtt=0.10.4=h759edc4_4 +- aws-c-s3=0.5.9=h594631b_3 +- aws-c-sdkutils=0.1.16=h83b837d_2 +- aws-checksums=0.1.18=h83b837d_6 +- aws-crt-cpp=0.26.9=he3a8b3b_0 +- aws-sdk-cpp=1.11.329=hba8bd5f_3 +- brotli-python=1.1.0=py312h30efb56_1 +- bzip2=1.0.8=hd590300_5 +- c-ares=1.28.1=hd590300_0 +- ca-certificates=2024.6.2=hbcca054_0 +- cairo=1.18.0=h3faef2a_0 +- certifi=2024.2.2=pyhd8ed1ab_0 +- charset-normalizer=3.3.2=pyhd8ed1ab_0 +- colorama=0.4.6=pyhd8ed1ab_0 +- cuda-nvcc=11.8.89=0 +- cuda-version=11.8=h70ddcb2_3 +- cudatoolkit=11.8.0=h4ba93d1_13 +- cudnn=8.9.7.29=hbc23b4c_3 +- datasets=2.19.2=pyhd8ed1ab_0 +- dav1d=1.2.1=hd590300_0 +- dbus=1.13.6=h5008d03_3 +- deepspeed=0.14.2=cpu_py312hd642c6c_1 +- dill=0.3.8=pyhd8ed1ab_0 +- double-conversion=3.3.0=h59595ed_0 +- expat=2.6.2=h59595ed_0 +- ffmpeg=6.1.1=gpl_he44c6f3_112 +- filelock=3.14.0=pyhd8ed1ab_0 +- font-ttf-dejavu-sans-mono=2.37=hab24e00_0 +- font-ttf-inconsolata=3.000=h77eed37_0 +- font-ttf-source-code-pro=2.038=h77eed37_0 +- font-ttf-ubuntu=0.83=h77eed37_2 +- fontconfig=2.14.2=h14ed4e7_0 +- fonts-conda-ecosystem=1=0 +- fonts-conda-forge=1=0 +- freeglut=3.2.2=hac7e632_2 +- freetype=2.12.1=h267a509_2 +- fribidi=1.0.10=h36c2ea0_0 +- frozenlist=1.4.1=py312h98912ed_0 +- fsspec=2024.3.1=pyhca7485f_0 +- gettext=0.22.5=h59595ed_2 +- gettext-tools=0.22.5=h59595ed_2 +- gflags=2.2.2=he1b5a44_1004 +- glib=2.80.2=hf974151_0 +- glib-tools=2.80.2=hb6ce0ca_0 +- glog=0.7.0=hed5481d_0 +- gmp=6.3.0=h59595ed_1 +- gmpy2=2.1.5=py312h1d5cde6_1 +- gnutls=3.7.9=hb077bed_0 +- graphite2=1.3.13=h59595ed_1003 +- gst-plugins-base=1.24.4=h9ad1361_0 +- gstreamer=1.24.4=haf2f30d_0 +- harfbuzz=8.5.0=hfac3d4d_0 +- hdf5=1.14.3=nompi_hdf9ad27_105 +- hjson-py=3.1.0=pyhd8ed1ab_0 +- huggingface_hub=0.23.2=pyhd8ed1ab_0 +- icu=73.2=h59595ed_0 +- idna=3.7=pyhd8ed1ab_0 +- imath=3.1.11=hfc55251_0 +- jasper=4.2.4=h536e39c_0 +- jinja2=3.1.4=pyhd8ed1ab_0 +- keyutils=1.6.1=h166bdaf_0 +- krb5=1.21.2=h659d440_0 +- lame=3.100=h166bdaf_1003 +- lcms2=2.16=hb7c19ff_0 +- ld_impl_linux-64=2.40=hf3520f5_2 +- lerc=4.0.0=h27087fc_0 +- libabseil=20240116.2=cxx17_h59595ed_0 +- libaec=1.1.3=h59595ed_0 +- libaio=0.3.113=h166bdaf_0 +- libarrow=16.1.0=hcb6531f_6_cpu +- libarrow-acero=16.1.0=hac33072_6_cpu +- libarrow-dataset=16.1.0=hac33072_6_cpu +- libarrow-substrait=16.1.0=h7e0c224_6_cpu +- libasprintf=0.22.5=h661eb56_2 +- libasprintf-devel=0.22.5=h661eb56_2 +- libass=0.17.1=h8fe9dca_1 +- libblas=3.9.0=22_linux64_openblas +- libbrotlicommon=1.1.0=hd590300_1 +- libbrotlidec=1.1.0=hd590300_1 +- libbrotlienc=1.1.0=hd590300_1 +- libcap=2.69=h0f662aa_0 +- libcblas=3.9.0=22_linux64_openblas +- libclang-cpp18.1=18.1.6=default_h127d8a8_0 +- libclang13=18.1.6=default_h5d6823c_0 +- libcrc32c=1.1.2=h9c3ff4c_0 +- libcups=2.3.3=h4637d8d_4 +- libcurl=8.8.0=hca28451_0 +- libdeflate=1.20=hd590300_0 +- libdrm=2.4.120=hd590300_0 +- libedit=3.1.20191231=he28a2e2_2 +- libev=4.33=hd590300_2 +- libevent=2.1.12=hf998b51_1 +- libexpat=2.6.2=h59595ed_0 +- libffi=3.4.2=h7f98852_5 +- libflac=1.4.3=h59595ed_0 +- libgcc-ng=13.2.0=h77fa898_7 +- libgcrypt=1.10.3=hd590300_0 +- libgettextpo=0.22.5=h59595ed_2 +- libgettextpo-devel=0.22.5=h59595ed_2 +- libgfortran-ng=13.2.0=h69a702a_7 +- libgfortran5=13.2.0=hca663fb_7 +- libglib=2.80.2=hf974151_0 +- libglu=9.0.0=hac7e632_1003 +- libgoogle-cloud=2.24.0=h2736e30_0 +- libgoogle-cloud-storage=2.24.0=h3d9a0c8_0 +- libgpg-error=1.49=h4f305b6_0 +- libgrpc=1.62.2=h15f2491_0 +- 
libhwloc=2.10.0=default_h5622ce7_1001 +- libiconv=1.17=hd590300_2 +- libidn2=2.3.7=hd590300_0 +- libjpeg-turbo=3.0.0=hd590300_1 +- liblapack=3.9.0=22_linux64_openblas +- liblapacke=3.9.0=22_linux64_openblas +- libllvm18=18.1.6=hb77312f_0 +- libmagma=2.7.2=h09b5827_2 +- libmagma_sparse=2.7.2=h09b5827_3 +- libnghttp2=1.58.0=h47da74e_1 +- libnsl=2.0.1=hd590300_0 +- libogg=1.3.4=h7f98852_1 +- libopenblas=0.3.27=pthreads_h413a1c8_0 +- libopencv=4.9.0=qt6_py312h711d6c5_615 +- libopenvino=2024.1.0=h2da1b83_7 +- libopenvino-auto-batch-plugin=2024.1.0=hb045406_7 +- libopenvino-auto-plugin=2024.1.0=hb045406_7 +- libopenvino-hetero-plugin=2024.1.0=h5c03a75_7 +- libopenvino-intel-cpu-plugin=2024.1.0=h2da1b83_7 +- libopenvino-intel-gpu-plugin=2024.1.0=h2da1b83_7 +- libopenvino-intel-npu-plugin=2024.1.0=he02047a_7 +- libopenvino-ir-frontend=2024.1.0=h5c03a75_7 +- libopenvino-onnx-frontend=2024.1.0=h07e8aee_7 +- libopenvino-paddle-frontend=2024.1.0=h07e8aee_7 +- libopenvino-pytorch-frontend=2024.1.0=he02047a_7 +- libopenvino-tensorflow-frontend=2024.1.0=h39126c6_7 +- libopenvino-tensorflow-lite-frontend=2024.1.0=he02047a_7 +- libopus=1.3.1=h7f98852_1 +- libparquet=16.1.0=h6a7eafb_6_cpu +- libpciaccess=0.18=hd590300_0 +- libpng=1.6.43=h2797004_0 +- libpq=16.3=ha72fbe1_0 +- libprotobuf=4.25.3=h08a7969_0 +- libre2-11=2023.09.01=h5a48ba9_2 +- libsndfile=1.2.2=hc60ed4a_1 +- libsqlite=3.45.3=h2797004_0 +- libssh2=1.11.0=h0841786_0 +- libstdcxx-ng=13.2.0=hc0a3c3a_7 +- libsystemd0=255=h3516f8a_1 +- libtasn1=4.19.0=h166bdaf_0 +- libthrift=0.19.0=hb90f79a_1 +- libtiff=4.6.0=h1dd3fc0_3 +- libtorch=2.3.0=cuda118_h8db9d67_301 +- libunistring=0.9.10=h7f98852_0 +- libutf8proc=2.8.0=h166bdaf_0 +- libuuid=2.38.1=h0b41bf4_0 +- libuv=1.48.0=hd590300_0 +- libva=2.21.0=hd590300_0 +- libvorbis=1.3.7=h9c3ff4c_0 +- libvpx=1.14.0=h59595ed_0 +- libwebp-base=1.4.0=hd590300_0 +- libxcb=1.15=h0b41bf4_0 +- libxcrypt=4.4.36=hd590300_1 +- libxkbcommon=1.7.0=h662e7e4_0 +- libxml2=2.12.7=hc051c1a_1 +- libzlib=1.3.1=h4ab18f5_1 +- llvm-openmp=18.1.6=ha31de31_0 +- lz4-c=1.9.4=hcb278e6_0 +- markupsafe=2.1.5=py312h98912ed_0 +- mkl=2023.2.0=h84fe81f_50496 +- mpc=1.3.1=hfe3b2da_0 +- mpfr=4.2.1=h9458935_1 +- mpg123=1.32.6=h59595ed_0 +- mpmath=1.3.0=pyhd8ed1ab_0 +- multidict=6.0.5=py312h98912ed_0 +- multiprocess=0.70.16=py312h98912ed_0 +- mysql-common=8.3.0=hf1915f5_4 +- mysql-libs=8.3.0=hca2cd23_4 +- nccl=2.21.5.1=h6103f9b_0 +- ncurses=6.5=h59595ed_0 +- nettle=3.9.1=h7ab15ed_0 +- networkx=3.3=pyhd8ed1ab_1 +- ninja=1.12.1=h297d8ca_0 +- numpy=1.26.4=py312heda63a1_0 +- ocl-icd=2.3.2=hd590300_1 +- opencv=4.9.0=qt6_py312hac6a15e_615 +- openexr=3.2.2=haf962dd_1 +- openh264=2.4.1=h59595ed_0 +- openjpeg=2.5.2=h488ebb8_0 +- openssl=3.3.1=h4ab18f5_0 +- orc=2.0.1=h17fec99_1 +- p11-kit=0.24.1=hc5aa10d_0 +- packaging=24.0=pyhd8ed1ab_0 +- pandas=2.2.2=py312h1d6d2e6_1 +- pcre2=10.43=hcad00b1_0 +- peft=0.11.1=pyhd8ed1ab_0 +- pillow=10.3.0=py312hdcec9eb_0 +- pip=24.0=pyhd8ed1ab_0 +- pixman=0.43.2=h59595ed_0 +- psutil=5.9.8=py312h98912ed_0 +- pthread-stubs=0.4=h36c2ea0_1001 +- pugixml=1.14=h59595ed_0 +- pulseaudio-client=17.0=hb77b528_0 +- py-cpuinfo=9.0.0=pyhd8ed1ab_0 +- py-opencv=4.9.0=qt6_py312hb24711e_615 +- pyarrow=16.1.0=py312h8da182e_1 +- pyarrow-core=16.1.0=py312h5429d62_1_cpu +- pyarrow-hotfix=0.6=pyhd8ed1ab_0 +- pydantic=2.7.3=pyhd8ed1ab_0 +- pydantic-core=2.18.4=py312h4413252_0 +- pynvml=11.5.0=pyhd8ed1ab_0 +- pysocks=1.7.1=pyha2e5f31_6 +- python=3.12.3=hab00c5b_0_cpython +- python-dateutil=2.9.0=pyhd8ed1ab_0 +- python-tzdata=2024.1=pyhd8ed1ab_0 +- 
python-xxhash=3.4.1=py312h98912ed_0 +- python_abi=3.12=4_cp312 +- pytorch=2.3.0=cuda118_py312h3690e1b_301 +- pytz=2024.1=pyhd8ed1ab_0 +- pyyaml=6.0.1=py312h98912ed_1 +- qt6-main=6.7.1=h2471661_2 +- re2=2023.09.01=h7f4b329_2 +- readline=8.2=h8228510_1 +- regex=2024.5.15=py312h9a8786e_0 +- requests=2.32.3=pyhd8ed1ab_0 +- s2n=1.4.15=he19d79f_0 +- safetensors=0.4.3=py312h4b3b743_0 +- setuptools=70.0.0=pyhd8ed1ab_0 +- six=1.16.0=pyh6c4a22f_0 +- sleef=3.5.1=h9b69904_2 +- snappy=1.2.0=hdb0a2a9_1 +- svt-av1=2.1.0=hac33072_0 +- sympy=1.12=pypyh9d50eac_103 +- tbb=2021.12.0=h297d8ca_1 +- tk=8.6.13=noxft_h4845f30_101 +- tokenizers=0.19.1=py312hfef1a59_0 +- torchvision=0.18.0=cuda118py312hff21f95_0 +- tqdm=4.66.4=pyhd8ed1ab_0 +- transformers=4.41.2=pyhd8ed1ab_0 +- typing-extensions=4.12.1=hd8ed1ab_0 +- typing_extensions=4.12.1=pyha770c72_0 +- tzdata=2024a=h0c530f3_0 +- urllib3=2.2.1=pyhd8ed1ab_0 +- wayland=1.23.0=h5291e77_0 +- wheel=0.43.0=pyhd8ed1ab_1 +- x264=1!164.3095=h166bdaf_2 +- x265=3.5=h924138e_3 +- xcb-util=0.4.0=hd590300_1 +- xcb-util-cursor=0.1.4=hd590300_1 +- xcb-util-image=0.4.0=h8ee46fc_1 +- xcb-util-keysyms=0.4.0=h8ee46fc_1 +- xcb-util-renderutil=0.3.9=hd590300_1 +- xcb-util-wm=0.4.1=h8ee46fc_1 +- xkeyboard-config=2.41=hd590300_0 +- xorg-fixesproto=5.0=h7f98852_1002 +- xorg-inputproto=2.3.2=h7f98852_1002 +- xorg-kbproto=1.0.7=h7f98852_1002 +- xorg-libice=1.1.1=hd590300_0 +- xorg-libsm=1.2.4=h7391055_0 +- xorg-libx11=1.8.9=h8ee46fc_0 +- xorg-libxau=1.0.11=hd590300_0 +- xorg-libxdmcp=1.1.3=h7f98852_0 +- xorg-libxext=1.3.4=h0b41bf4_2 +- xorg-libxfixes=5.0.3=h7f98852_1004 +- xorg-libxi=1.7.10=h7f98852_0 +- xorg-libxrender=0.9.11=hd590300_0 +- xorg-renderproto=0.11.1=h7f98852_1002 +- xorg-xextproto=7.3.0=h0b41bf4_1003 +- xorg-xproto=7.0.31=h7f98852_1007 +- xxhash=0.8.2=hd590300_0 +- xz=5.2.6=h166bdaf_0 +- yaml=0.2.5=h7f98852_2 +- yarl=1.9.4=py312h98912ed_0 +- zlib=1.3.1=h4ab18f5_1 +- zstd=1.5.6=ha6fb4c9_0 + diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/README.md b/MiniCPM-2-V/output/output_minicpmv2_lora/README.md new file mode 100644 index 00000000..df6e421c --- /dev/null +++ b/MiniCPM-2-V/output/output_minicpmv2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: /root/autodl-tmp/openbmb/MiniCPM-V-2 +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
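+A minimal sketch of one way to load this adapter (the base model path and adapter directory are the ones used in this repo's training run; whether `PeftModel.from_pretrained` plays well with the remote-code MiniCPM-V class is an assumption — the loading flow actually used in this repo is in `chatBot.py`):
+
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+from peft import PeftModel
+
+base = "/root/autodl-tmp/openbmb/MiniCPM-V-2"      # base model this adapter was trained from
+adapter = "output/output_minicpmv2_lora"           # directory containing adapter_config.json
+
+tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
+model = AutoModel.from_pretrained(base, trust_remote_code=True, torch_dtype=torch.float16).eval().cuda()
+model = PeftModel.from_pretrained(model, adapter)  # attach the LoRA weights
+```
+
+`chatBot.py` in this repo additionally restores the tuned vision/resampler/embedding weights from `vpm_resampler_embedtokens.pt` via `load_state_dict(..., strict=False)`.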
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/adapter_config.json b/MiniCPM-2-V/output/output_minicpmv2_lora/adapter_config.json new file mode 100644 index 00000000..a3e8145e --- /dev/null +++ b/MiniCPM-2-V/output/output_minicpmv2_lora/adapter_config.json @@ -0,0 +1,26 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/root/autodl-tmp/openbmb/MiniCPM-V-2", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 64, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": "llm\\..*layers\\.\\d+\\.self_attn\\.(q_proj|k_proj)", + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/adapter_model.safetensors b/MiniCPM-2-V/output/output_minicpmv2_lora/adapter_model.safetensors new file mode 100644 index 00000000..edcfde1c Binary files /dev/null and b/MiniCPM-2-V/output/output_minicpmv2_lora/adapter_model.safetensors differ diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175455.autodl-container-f5e543ac25-2cef338a b/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175455.autodl-container-f5e543ac25-2cef338a new file mode 100644 index 00000000..b124607c Binary files /dev/null and 
b/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175455.autodl-container-f5e543ac25-2cef338a differ diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175711.autodl-container-f5e543ac25-2cef338a b/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175711.autodl-container-f5e543ac25-2cef338a new file mode 100644 index 00000000..9274c5fb Binary files /dev/null and b/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175711.autodl-container-f5e543ac25-2cef338a differ diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175798.autodl-container-f5e543ac25-2cef338a b/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175798.autodl-container-f5e543ac25-2cef338a new file mode 100644 index 00000000..5a813e60 Binary files /dev/null and b/MiniCPM-2-V/output/output_minicpmv2_lora/events.out.tfevents.1718175798.autodl-container-f5e543ac25-2cef338a differ diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/special_tokens_map.json b/MiniCPM-2-V/output/output_minicpmv2_lora/special_tokens_map.json new file mode 100644 index 00000000..d0621528 --- /dev/null +++ b/MiniCPM-2-V/output/output_minicpmv2_lora/special_tokens_map.json @@ -0,0 +1,44 @@ +{ + "additional_special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/tokenizer.model b/MiniCPM-2-V/output/output_minicpmv2_lora/tokenizer.model new file mode 100644 index 00000000..80808857 Binary files /dev/null and b/MiniCPM-2-V/output/output_minicpmv2_lora/tokenizer.model differ diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/tokenizer_config.json b/MiniCPM-2-V/output/output_minicpmv2_lora/tokenizer_config.json new file mode 100644 index 00000000..22c0c6e8 --- /dev/null +++ b/MiniCPM-2-V/output/output_minicpmv2_lora/tokenizer_config.json @@ -0,0 +1,160 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "109": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "110": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "111": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "112": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "auto_map": { + "AutoTokenizer": [ + "modeling_minicpmv.LlamaTokenizerWrapper", + null + ] + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizerWrapper", + "truncation_side": "right", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/trainer_state.json b/MiniCPM-2-V/output/output_minicpmv2_lora/trainer_state.json new file mode 100644 index 00000000..a9e7efbb --- /dev/null +++ b/MiniCPM-2-V/output/output_minicpmv2_lora/trainer_state.json @@ -0,0 +1,7028 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 110.88888888888889, + "eval_steps": 1000, + "global_step": 998, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.1111111111111111, + "grad_norm": 4.49665641784668, + "learning_rate": 0.0, + "loss": 1.1593, + "step": 1 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 4.124442100524902, + "learning_rate": 3.010299956639811e-07, + "loss": 1.2563, + "step": 2 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 4.124442100524902, + "learning_rate": 3.010299956639811e-07, + "loss": 1.3851, + "step": 3 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 5.840461730957031, + "learning_rate": 4.771212547196623e-07, + "loss": 1.5863, + "step": 4 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 3.0390214920043945, + "learning_rate": 6.020599913279622e-07, + "loss": 1.2133, + "step": 5 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.79929518699646, + "learning_rate": 6.989700043360186e-07, + "loss": 1.2552, + "step": 6 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 5.730904579162598, + "learning_rate": 7.781512503836435e-07, + "loss": 1.2875, + "step": 7 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.5650827884674072, + "learning_rate": 8.450980400142567e-07, + "loss": 1.2407, + "step": 8 + }, + { + "epoch": 1.0, + "grad_norm": 2.2476112842559814, + "learning_rate": 9.030899869919433e-07, + "loss": 1.1513, + "step": 9 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 3.357285499572754, + "learning_rate": 9.542425094393247e-07, + "loss": 1.2379, + "step": 
10 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 4.208262920379639, + "learning_rate": 9.999999999999997e-07, + "loss": 1.2437, + "step": 11 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 3.168548822402954, + "learning_rate": 1e-06, + "loss": 1.2147, + "step": 12 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 3.168548822402954, + "learning_rate": 1e-06, + "loss": 1.324, + "step": 13 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 4.0574049949646, + "learning_rate": 1e-06, + "loss": 1.5244, + "step": 14 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 3.3592240810394287, + "learning_rate": 1e-06, + "loss": 1.3505, + "step": 15 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.0200934410095215, + "learning_rate": 1e-06, + "loss": 1.2925, + "step": 16 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 3.4108314514160156, + "learning_rate": 1e-06, + "loss": 1.1949, + "step": 17 + }, + { + "epoch": 2.0, + "grad_norm": 2.670156717300415, + "learning_rate": 1e-06, + "loss": 1.1889, + "step": 18 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 2.848493814468384, + "learning_rate": 1e-06, + "loss": 1.271, + "step": 19 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 1.862730860710144, + "learning_rate": 1e-06, + "loss": 1.2516, + "step": 20 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 2.4025464057922363, + "learning_rate": 1e-06, + "loss": 1.1905, + "step": 21 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 3.0819387435913086, + "learning_rate": 1e-06, + "loss": 1.2979, + "step": 22 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 1.6566828489303589, + "learning_rate": 1e-06, + "loss": 1.2004, + "step": 23 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 2.6607396602630615, + "learning_rate": 1e-06, + "loss": 1.2039, + "step": 24 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 1.3347333669662476, + "learning_rate": 1e-06, + "loss": 1.1513, + "step": 25 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 2.2526512145996094, + "learning_rate": 1e-06, + "loss": 1.3316, + "step": 26 + }, + { + "epoch": 3.0, + "grad_norm": 2.0493719577789307, + "learning_rate": 1e-06, + "loss": 1.2515, + "step": 27 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 2.002530097961426, + "learning_rate": 1e-06, + "loss": 1.062, + "step": 28 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 3.979753255844116, + "learning_rate": 1e-06, + "loss": 1.3528, + "step": 29 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 1.227388620376587, + "learning_rate": 1e-06, + "loss": 1.07, + "step": 30 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 2.14018177986145, + "learning_rate": 1e-06, + "loss": 1.3551, + "step": 31 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.196567416191101, + "learning_rate": 1e-06, + "loss": 1.1258, + "step": 32 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 1.9265096187591553, + "learning_rate": 1e-06, + "loss": 1.2725, + "step": 33 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 1.6760306358337402, + "learning_rate": 1e-06, + "loss": 1.222, + "step": 34 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 2.2533202171325684, + "learning_rate": 1e-06, + "loss": 1.3392, + "step": 35 + }, + { + "epoch": 4.0, + "grad_norm": 1.3224655389785767, + "learning_rate": 1e-06, + "loss": 1.0899, + "step": 36 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 1.8413286209106445, + "learning_rate": 1e-06, + "loss": 1.1676, + "step": 37 + }, + { + "epoch": 4.222222222222222, + 
"grad_norm": 1.279106855392456, + "learning_rate": 1e-06, + "loss": 1.2108, + "step": 38 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.5242613554000854, + "learning_rate": 1e-06, + "loss": 1.1748, + "step": 39 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 1.0554380416870117, + "learning_rate": 1e-06, + "loss": 1.0035, + "step": 40 + }, + { + "epoch": 4.555555555555555, + "grad_norm": 1.679937720298767, + "learning_rate": 1e-06, + "loss": 1.3456, + "step": 41 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 1.8965375423431396, + "learning_rate": 1e-06, + "loss": 1.1724, + "step": 42 + }, + { + "epoch": 4.777777777777778, + "grad_norm": 1.4479789733886719, + "learning_rate": 1e-06, + "loss": 1.2201, + "step": 43 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 1.9123609066009521, + "learning_rate": 1e-06, + "loss": 1.3352, + "step": 44 + }, + { + "epoch": 5.0, + "grad_norm": 1.4676909446716309, + "learning_rate": 1e-06, + "loss": 1.2047, + "step": 45 + }, + { + "epoch": 5.111111111111111, + "grad_norm": 2.168522834777832, + "learning_rate": 1e-06, + "loss": 1.3256, + "step": 46 + }, + { + "epoch": 5.222222222222222, + "grad_norm": 1.8151073455810547, + "learning_rate": 1e-06, + "loss": 1.2249, + "step": 47 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 1.126733422279358, + "learning_rate": 1e-06, + "loss": 1.0838, + "step": 48 + }, + { + "epoch": 5.444444444444445, + "grad_norm": 1.6664854288101196, + "learning_rate": 1e-06, + "loss": 1.2194, + "step": 49 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 1.6171505451202393, + "learning_rate": 1e-06, + "loss": 1.2697, + "step": 50 + }, + { + "epoch": 5.666666666666667, + "grad_norm": 1.2255887985229492, + "learning_rate": 1e-06, + "loss": 1.1271, + "step": 51 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 2.212496042251587, + "learning_rate": 1e-06, + "loss": 1.1988, + "step": 52 + }, + { + "epoch": 5.888888888888889, + "grad_norm": 1.0164906978607178, + "learning_rate": 1e-06, + "loss": 1.1331, + "step": 53 + }, + { + "epoch": 6.0, + "grad_norm": 2.234724998474121, + "learning_rate": 1e-06, + "loss": 1.3019, + "step": 54 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 1.503280758857727, + "learning_rate": 1e-06, + "loss": 1.2054, + "step": 55 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 2.433361291885376, + "learning_rate": 1e-06, + "loss": 1.2562, + "step": 56 + }, + { + "epoch": 6.333333333333333, + "grad_norm": 1.3219724893569946, + "learning_rate": 1e-06, + "loss": 1.1501, + "step": 57 + }, + { + "epoch": 6.444444444444445, + "grad_norm": 1.4367828369140625, + "learning_rate": 1e-06, + "loss": 1.2746, + "step": 58 + }, + { + "epoch": 6.555555555555555, + "grad_norm": 2.0494906902313232, + "learning_rate": 1e-06, + "loss": 1.1025, + "step": 59 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 2.1568455696105957, + "learning_rate": 1e-06, + "loss": 1.1992, + "step": 60 + }, + { + "epoch": 6.777777777777778, + "grad_norm": 1.7505625486373901, + "learning_rate": 1e-06, + "loss": 1.1024, + "step": 61 + }, + { + "epoch": 6.888888888888889, + "grad_norm": 1.1023458242416382, + "learning_rate": 1e-06, + "loss": 1.133, + "step": 62 + }, + { + "epoch": 7.0, + "grad_norm": 1.459398627281189, + "learning_rate": 1e-06, + "loss": 1.0812, + "step": 63 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 2.39670991897583, + "learning_rate": 1e-06, + "loss": 1.3085, + "step": 64 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 1.1903048753738403, + "learning_rate": 1e-06, + "loss": 
1.1586, + "step": 65 + }, + { + "epoch": 7.333333333333333, + "grad_norm": 1.5839647054672241, + "learning_rate": 1e-06, + "loss": 1.2127, + "step": 66 + }, + { + "epoch": 7.444444444444445, + "grad_norm": 1.6372401714324951, + "learning_rate": 1e-06, + "loss": 0.9815, + "step": 67 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 1.3241337537765503, + "learning_rate": 1e-06, + "loss": 1.223, + "step": 68 + }, + { + "epoch": 7.666666666666667, + "grad_norm": 2.198045253753662, + "learning_rate": 1e-06, + "loss": 1.1799, + "step": 69 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 2.249744176864624, + "learning_rate": 1e-06, + "loss": 1.1941, + "step": 70 + }, + { + "epoch": 7.888888888888889, + "grad_norm": 2.1148593425750732, + "learning_rate": 1e-06, + "loss": 1.2595, + "step": 71 + }, + { + "epoch": 8.0, + "grad_norm": 1.7665213346481323, + "learning_rate": 1e-06, + "loss": 1.1593, + "step": 72 + }, + { + "epoch": 8.11111111111111, + "grad_norm": 1.337896466255188, + "learning_rate": 1e-06, + "loss": 1.2676, + "step": 73 + }, + { + "epoch": 8.222222222222221, + "grad_norm": 1.5488286018371582, + "learning_rate": 1e-06, + "loss": 1.2088, + "step": 74 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 1.7703675031661987, + "learning_rate": 1e-06, + "loss": 1.1042, + "step": 75 + }, + { + "epoch": 8.444444444444445, + "grad_norm": 1.053892731666565, + "learning_rate": 1e-06, + "loss": 1.1404, + "step": 76 + }, + { + "epoch": 8.555555555555555, + "grad_norm": 1.4253357648849487, + "learning_rate": 1e-06, + "loss": 1.1009, + "step": 77 + }, + { + "epoch": 8.666666666666666, + "grad_norm": 1.2994499206542969, + "learning_rate": 1e-06, + "loss": 1.1469, + "step": 78 + }, + { + "epoch": 8.777777777777779, + "grad_norm": 1.9693901538848877, + "learning_rate": 1e-06, + "loss": 1.2267, + "step": 79 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 1.317744493484497, + "learning_rate": 1e-06, + "loss": 1.042, + "step": 80 + }, + { + "epoch": 9.0, + "grad_norm": 1.3617701530456543, + "learning_rate": 1e-06, + "loss": 1.1359, + "step": 81 + }, + { + "epoch": 9.11111111111111, + "grad_norm": 0.9582542777061462, + "learning_rate": 1e-06, + "loss": 1.057, + "step": 82 + }, + { + "epoch": 9.222222222222221, + "grad_norm": 1.245201587677002, + "learning_rate": 1e-06, + "loss": 1.1291, + "step": 83 + }, + { + "epoch": 9.333333333333334, + "grad_norm": 1.705923318862915, + "learning_rate": 1e-06, + "loss": 1.1896, + "step": 84 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 1.6293283700942993, + "learning_rate": 1e-06, + "loss": 1.0956, + "step": 85 + }, + { + "epoch": 9.555555555555555, + "grad_norm": 1.6264375448226929, + "learning_rate": 1e-06, + "loss": 1.1174, + "step": 86 + }, + { + "epoch": 9.666666666666666, + "grad_norm": 2.165580987930298, + "learning_rate": 1e-06, + "loss": 1.2059, + "step": 87 + }, + { + "epoch": 9.777777777777779, + "grad_norm": 1.9255369901657104, + "learning_rate": 1e-06, + "loss": 1.1782, + "step": 88 + }, + { + "epoch": 9.88888888888889, + "grad_norm": 1.4285647869110107, + "learning_rate": 1e-06, + "loss": 1.1703, + "step": 89 + }, + { + "epoch": 10.0, + "grad_norm": 1.2981191873550415, + "learning_rate": 1e-06, + "loss": 1.1108, + "step": 90 + }, + { + "epoch": 10.11111111111111, + "grad_norm": 1.549428105354309, + "learning_rate": 1e-06, + "loss": 1.0294, + "step": 91 + }, + { + "epoch": 10.222222222222221, + "grad_norm": 1.3490827083587646, + "learning_rate": 1e-06, + "loss": 1.1817, + "step": 92 + }, + { + "epoch": 10.333333333333334, + 
"grad_norm": 1.5004029273986816, + "learning_rate": 1e-06, + "loss": 1.09, + "step": 93 + }, + { + "epoch": 10.444444444444445, + "grad_norm": 2.146407127380371, + "learning_rate": 1e-06, + "loss": 1.0852, + "step": 94 + }, + { + "epoch": 10.555555555555555, + "grad_norm": 2.259960174560547, + "learning_rate": 1e-06, + "loss": 1.0895, + "step": 95 + }, + { + "epoch": 10.666666666666666, + "grad_norm": 1.2890896797180176, + "learning_rate": 1e-06, + "loss": 1.1661, + "step": 96 + }, + { + "epoch": 10.777777777777779, + "grad_norm": 1.8632493019104004, + "learning_rate": 1e-06, + "loss": 1.2956, + "step": 97 + }, + { + "epoch": 10.88888888888889, + "grad_norm": 1.3903627395629883, + "learning_rate": 1e-06, + "loss": 1.0794, + "step": 98 + }, + { + "epoch": 11.0, + "grad_norm": 1.7120167016983032, + "learning_rate": 1e-06, + "loss": 1.1547, + "step": 99 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 1.6115831136703491, + "learning_rate": 1e-06, + "loss": 1.1133, + "step": 100 + }, + { + "epoch": 11.222222222222221, + "grad_norm": 1.6459829807281494, + "learning_rate": 1e-06, + "loss": 1.0619, + "step": 101 + }, + { + "epoch": 11.333333333333334, + "grad_norm": 1.8485289812088013, + "learning_rate": 1e-06, + "loss": 1.2481, + "step": 102 + }, + { + "epoch": 11.444444444444445, + "grad_norm": 1.070677638053894, + "learning_rate": 1e-06, + "loss": 1.0388, + "step": 103 + }, + { + "epoch": 11.555555555555555, + "grad_norm": 1.6385332345962524, + "learning_rate": 1e-06, + "loss": 1.263, + "step": 104 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 2.4001266956329346, + "learning_rate": 1e-06, + "loss": 1.1974, + "step": 105 + }, + { + "epoch": 11.777777777777779, + "grad_norm": 0.9357035756111145, + "learning_rate": 1e-06, + "loss": 1.0523, + "step": 106 + }, + { + "epoch": 11.88888888888889, + "grad_norm": 1.173198938369751, + "learning_rate": 1e-06, + "loss": 1.0577, + "step": 107 + }, + { + "epoch": 12.0, + "grad_norm": 1.9721860885620117, + "learning_rate": 1e-06, + "loss": 0.9721, + "step": 108 + }, + { + "epoch": 12.11111111111111, + "grad_norm": 1.6971633434295654, + "learning_rate": 1e-06, + "loss": 1.2205, + "step": 109 + }, + { + "epoch": 12.222222222222221, + "grad_norm": 1.244423508644104, + "learning_rate": 1e-06, + "loss": 1.1337, + "step": 110 + }, + { + "epoch": 12.333333333333334, + "grad_norm": 1.6942721605300903, + "learning_rate": 1e-06, + "loss": 1.1074, + "step": 111 + }, + { + "epoch": 12.444444444444445, + "grad_norm": 2.0190207958221436, + "learning_rate": 1e-06, + "loss": 1.1925, + "step": 112 + }, + { + "epoch": 12.555555555555555, + "grad_norm": 1.2092722654342651, + "learning_rate": 1e-06, + "loss": 0.997, + "step": 113 + }, + { + "epoch": 12.666666666666666, + "grad_norm": 1.3208969831466675, + "learning_rate": 1e-06, + "loss": 1.0889, + "step": 114 + }, + { + "epoch": 12.777777777777779, + "grad_norm": 1.9457271099090576, + "learning_rate": 1e-06, + "loss": 1.0335, + "step": 115 + }, + { + "epoch": 12.88888888888889, + "grad_norm": 1.5970720052719116, + "learning_rate": 1e-06, + "loss": 1.0577, + "step": 116 + }, + { + "epoch": 13.0, + "grad_norm": 2.3128230571746826, + "learning_rate": 1e-06, + "loss": 1.2225, + "step": 117 + }, + { + "epoch": 13.11111111111111, + "grad_norm": 2.4210803508758545, + "learning_rate": 1e-06, + "loss": 1.0405, + "step": 118 + }, + { + "epoch": 13.222222222222221, + "grad_norm": 1.447824478149414, + "learning_rate": 1e-06, + "loss": 1.1486, + "step": 119 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 
1.9287456274032593, + "learning_rate": 1e-06, + "loss": 1.0766, + "step": 120 + }, + { + "epoch": 13.444444444444445, + "grad_norm": 2.595796585083008, + "learning_rate": 1e-06, + "loss": 0.9948, + "step": 121 + }, + { + "epoch": 13.555555555555555, + "grad_norm": 1.565015196800232, + "learning_rate": 1e-06, + "loss": 1.1265, + "step": 122 + }, + { + "epoch": 13.666666666666666, + "grad_norm": 1.7117595672607422, + "learning_rate": 1e-06, + "loss": 1.1516, + "step": 123 + }, + { + "epoch": 13.777777777777779, + "grad_norm": 2.029736042022705, + "learning_rate": 1e-06, + "loss": 1.1422, + "step": 124 + }, + { + "epoch": 13.88888888888889, + "grad_norm": 3.743150472640991, + "learning_rate": 1e-06, + "loss": 1.0826, + "step": 125 + }, + { + "epoch": 14.0, + "grad_norm": 1.5541292428970337, + "learning_rate": 1e-06, + "loss": 1.0494, + "step": 126 + }, + { + "epoch": 14.11111111111111, + "grad_norm": 3.291475296020508, + "learning_rate": 1e-06, + "loss": 1.1142, + "step": 127 + }, + { + "epoch": 14.222222222222221, + "grad_norm": 1.556293249130249, + "learning_rate": 1e-06, + "loss": 1.0599, + "step": 128 + }, + { + "epoch": 14.333333333333334, + "grad_norm": 1.989487648010254, + "learning_rate": 1e-06, + "loss": 1.0385, + "step": 129 + }, + { + "epoch": 14.444444444444445, + "grad_norm": 1.6959519386291504, + "learning_rate": 1e-06, + "loss": 1.0802, + "step": 130 + }, + { + "epoch": 14.555555555555555, + "grad_norm": 1.3137555122375488, + "learning_rate": 1e-06, + "loss": 1.1265, + "step": 131 + }, + { + "epoch": 14.666666666666666, + "grad_norm": 1.102333903312683, + "learning_rate": 1e-06, + "loss": 0.9876, + "step": 132 + }, + { + "epoch": 14.777777777777779, + "grad_norm": 2.564713716506958, + "learning_rate": 1e-06, + "loss": 1.1468, + "step": 133 + }, + { + "epoch": 14.88888888888889, + "grad_norm": 2.0909900665283203, + "learning_rate": 1e-06, + "loss": 1.1308, + "step": 134 + }, + { + "epoch": 15.0, + "grad_norm": 1.6823238134384155, + "learning_rate": 1e-06, + "loss": 1.1025, + "step": 135 + }, + { + "epoch": 15.11111111111111, + "grad_norm": 3.4225268363952637, + "learning_rate": 1e-06, + "loss": 1.1165, + "step": 136 + }, + { + "epoch": 15.222222222222221, + "grad_norm": 1.4099456071853638, + "learning_rate": 1e-06, + "loss": 1.0397, + "step": 137 + }, + { + "epoch": 15.333333333333334, + "grad_norm": 1.374805212020874, + "learning_rate": 1e-06, + "loss": 1.0326, + "step": 138 + }, + { + "epoch": 15.444444444444445, + "grad_norm": 1.4985007047653198, + "learning_rate": 1e-06, + "loss": 1.1647, + "step": 139 + }, + { + "epoch": 15.555555555555555, + "grad_norm": 1.6761070489883423, + "learning_rate": 1e-06, + "loss": 1.13, + "step": 140 + }, + { + "epoch": 15.666666666666666, + "grad_norm": 1.9672166109085083, + "learning_rate": 1e-06, + "loss": 1.0478, + "step": 141 + }, + { + "epoch": 15.777777777777779, + "grad_norm": 2.725348472595215, + "learning_rate": 1e-06, + "loss": 1.0654, + "step": 142 + }, + { + "epoch": 15.88888888888889, + "grad_norm": 4.20762300491333, + "learning_rate": 1e-06, + "loss": 1.0313, + "step": 143 + }, + { + "epoch": 16.0, + "grad_norm": 2.5818567276000977, + "learning_rate": 1e-06, + "loss": 1.0399, + "step": 144 + }, + { + "epoch": 16.11111111111111, + "grad_norm": 2.149756669998169, + "learning_rate": 1e-06, + "loss": 1.0669, + "step": 145 + }, + { + "epoch": 16.22222222222222, + "grad_norm": 1.993472933769226, + "learning_rate": 1e-06, + "loss": 1.1696, + "step": 146 + }, + { + "epoch": 16.333333333333332, + "grad_norm": 2.8747777938842773, + 
"learning_rate": 1e-06, + "loss": 1.0597, + "step": 147 + }, + { + "epoch": 16.444444444444443, + "grad_norm": 1.075242519378662, + "learning_rate": 1e-06, + "loss": 1.0278, + "step": 148 + }, + { + "epoch": 16.555555555555557, + "grad_norm": 2.5512351989746094, + "learning_rate": 1e-06, + "loss": 1.1481, + "step": 149 + }, + { + "epoch": 16.666666666666668, + "grad_norm": 2.368272066116333, + "learning_rate": 1e-06, + "loss": 1.0817, + "step": 150 + }, + { + "epoch": 16.77777777777778, + "grad_norm": 2.4868478775024414, + "learning_rate": 1e-06, + "loss": 0.9652, + "step": 151 + }, + { + "epoch": 16.88888888888889, + "grad_norm": 1.8313658237457275, + "learning_rate": 1e-06, + "loss": 1.0901, + "step": 152 + }, + { + "epoch": 17.0, + "grad_norm": 2.8137712478637695, + "learning_rate": 1e-06, + "loss": 1.086, + "step": 153 + }, + { + "epoch": 17.11111111111111, + "grad_norm": 2.241750955581665, + "learning_rate": 1e-06, + "loss": 1.1571, + "step": 154 + }, + { + "epoch": 17.22222222222222, + "grad_norm": 1.2296514511108398, + "learning_rate": 1e-06, + "loss": 0.8644, + "step": 155 + }, + { + "epoch": 17.333333333333332, + "grad_norm": 1.9674986600875854, + "learning_rate": 1e-06, + "loss": 1.0547, + "step": 156 + }, + { + "epoch": 17.444444444444443, + "grad_norm": 2.6460955142974854, + "learning_rate": 1e-06, + "loss": 1.1618, + "step": 157 + }, + { + "epoch": 17.555555555555557, + "grad_norm": 1.937049388885498, + "learning_rate": 1e-06, + "loss": 1.0374, + "step": 158 + }, + { + "epoch": 17.666666666666668, + "grad_norm": 7.286559581756592, + "learning_rate": 1e-06, + "loss": 1.0401, + "step": 159 + }, + { + "epoch": 17.77777777777778, + "grad_norm": 2.944786548614502, + "learning_rate": 1e-06, + "loss": 1.0044, + "step": 160 + }, + { + "epoch": 17.88888888888889, + "grad_norm": 1.6332862377166748, + "learning_rate": 1e-06, + "loss": 1.1501, + "step": 161 + }, + { + "epoch": 18.0, + "grad_norm": 3.1889591217041016, + "learning_rate": 1e-06, + "loss": 1.1097, + "step": 162 + }, + { + "epoch": 18.11111111111111, + "grad_norm": 1.8770782947540283, + "learning_rate": 1e-06, + "loss": 1.1764, + "step": 163 + }, + { + "epoch": 18.22222222222222, + "grad_norm": 2.4062719345092773, + "learning_rate": 1e-06, + "loss": 1.0132, + "step": 164 + }, + { + "epoch": 18.333333333333332, + "grad_norm": 2.525468111038208, + "learning_rate": 1e-06, + "loss": 1.0937, + "step": 165 + }, + { + "epoch": 18.444444444444443, + "grad_norm": 2.1997547149658203, + "learning_rate": 1e-06, + "loss": 1.0606, + "step": 166 + }, + { + "epoch": 18.555555555555557, + "grad_norm": 1.4818522930145264, + "learning_rate": 1e-06, + "loss": 0.9401, + "step": 167 + }, + { + "epoch": 18.666666666666668, + "grad_norm": 2.9282243251800537, + "learning_rate": 1e-06, + "loss": 1.0372, + "step": 168 + }, + { + "epoch": 18.77777777777778, + "grad_norm": 2.4035823345184326, + "learning_rate": 1e-06, + "loss": 1.1514, + "step": 169 + }, + { + "epoch": 18.88888888888889, + "grad_norm": 2.7679686546325684, + "learning_rate": 1e-06, + "loss": 1.0395, + "step": 170 + }, + { + "epoch": 19.0, + "grad_norm": 4.128814220428467, + "learning_rate": 1e-06, + "loss": 0.9839, + "step": 171 + }, + { + "epoch": 19.11111111111111, + "grad_norm": 1.5194616317749023, + "learning_rate": 1e-06, + "loss": 1.1265, + "step": 172 + }, + { + "epoch": 19.22222222222222, + "grad_norm": 1.9642343521118164, + "learning_rate": 1e-06, + "loss": 0.8665, + "step": 173 + }, + { + "epoch": 19.333333333333332, + "grad_norm": 1.9796785116195679, + "learning_rate": 1e-06, + 
"loss": 1.0353, + "step": 174 + }, + { + "epoch": 19.444444444444443, + "grad_norm": 2.676199436187744, + "learning_rate": 1e-06, + "loss": 1.03, + "step": 175 + }, + { + "epoch": 19.555555555555557, + "grad_norm": 3.2983553409576416, + "learning_rate": 1e-06, + "loss": 1.066, + "step": 176 + }, + { + "epoch": 19.666666666666668, + "grad_norm": 2.8115146160125732, + "learning_rate": 1e-06, + "loss": 1.0217, + "step": 177 + }, + { + "epoch": 19.77777777777778, + "grad_norm": 2.9449892044067383, + "learning_rate": 1e-06, + "loss": 1.0992, + "step": 178 + }, + { + "epoch": 19.88888888888889, + "grad_norm": 3.069680690765381, + "learning_rate": 1e-06, + "loss": 1.0751, + "step": 179 + }, + { + "epoch": 20.0, + "grad_norm": 3.17688250541687, + "learning_rate": 1e-06, + "loss": 1.0319, + "step": 180 + }, + { + "epoch": 20.11111111111111, + "grad_norm": 1.3647427558898926, + "learning_rate": 1e-06, + "loss": 1.0608, + "step": 181 + }, + { + "epoch": 20.22222222222222, + "grad_norm": 1.5300778150558472, + "learning_rate": 1e-06, + "loss": 1.0362, + "step": 182 + }, + { + "epoch": 20.333333333333332, + "grad_norm": 4.857977867126465, + "learning_rate": 1e-06, + "loss": 0.931, + "step": 183 + }, + { + "epoch": 20.444444444444443, + "grad_norm": 1.6704386472702026, + "learning_rate": 1e-06, + "loss": 0.9521, + "step": 184 + }, + { + "epoch": 20.555555555555557, + "grad_norm": 4.490090847015381, + "learning_rate": 1e-06, + "loss": 1.0983, + "step": 185 + }, + { + "epoch": 20.666666666666668, + "grad_norm": 2.8315751552581787, + "learning_rate": 1e-06, + "loss": 1.0251, + "step": 186 + }, + { + "epoch": 20.77777777777778, + "grad_norm": 4.339689254760742, + "learning_rate": 1e-06, + "loss": 1.0759, + "step": 187 + }, + { + "epoch": 20.88888888888889, + "grad_norm": 1.6421418190002441, + "learning_rate": 1e-06, + "loss": 1.0163, + "step": 188 + }, + { + "epoch": 21.0, + "grad_norm": 2.5932278633117676, + "learning_rate": 1e-06, + "loss": 1.0651, + "step": 189 + }, + { + "epoch": 21.11111111111111, + "grad_norm": 1.675912857055664, + "learning_rate": 1e-06, + "loss": 1.0187, + "step": 190 + }, + { + "epoch": 21.22222222222222, + "grad_norm": 3.4378342628479004, + "learning_rate": 1e-06, + "loss": 1.0957, + "step": 191 + }, + { + "epoch": 21.333333333333332, + "grad_norm": 2.4542741775512695, + "learning_rate": 1e-06, + "loss": 1.0401, + "step": 192 + }, + { + "epoch": 21.444444444444443, + "grad_norm": 3.2500007152557373, + "learning_rate": 1e-06, + "loss": 0.9598, + "step": 193 + }, + { + "epoch": 21.555555555555557, + "grad_norm": 2.447542428970337, + "learning_rate": 1e-06, + "loss": 1.0375, + "step": 194 + }, + { + "epoch": 21.666666666666668, + "grad_norm": 2.2373616695404053, + "learning_rate": 1e-06, + "loss": 1.0986, + "step": 195 + }, + { + "epoch": 21.77777777777778, + "grad_norm": 2.548806667327881, + "learning_rate": 1e-06, + "loss": 0.9612, + "step": 196 + }, + { + "epoch": 21.88888888888889, + "grad_norm": 1.7553954124450684, + "learning_rate": 1e-06, + "loss": 0.9784, + "step": 197 + }, + { + "epoch": 22.0, + "grad_norm": 2.256045341491699, + "learning_rate": 1e-06, + "loss": 0.9583, + "step": 198 + }, + { + "epoch": 22.11111111111111, + "grad_norm": 1.7303504943847656, + "learning_rate": 1e-06, + "loss": 0.9592, + "step": 199 + }, + { + "epoch": 22.22222222222222, + "grad_norm": 2.455047130584717, + "learning_rate": 1e-06, + "loss": 0.8387, + "step": 200 + }, + { + "epoch": 22.333333333333332, + "grad_norm": 1.6653703451156616, + "learning_rate": 1e-06, + "loss": 1.0378, + "step": 201 + 
}, + { + "epoch": 22.444444444444443, + "grad_norm": 3.275569438934326, + "learning_rate": 1e-06, + "loss": 1.0977, + "step": 202 + }, + { + "epoch": 22.555555555555557, + "grad_norm": 2.7961230278015137, + "learning_rate": 1e-06, + "loss": 0.9377, + "step": 203 + }, + { + "epoch": 22.666666666666668, + "grad_norm": 1.7350988388061523, + "learning_rate": 1e-06, + "loss": 0.907, + "step": 204 + }, + { + "epoch": 22.77777777777778, + "grad_norm": 1.364461064338684, + "learning_rate": 1e-06, + "loss": 1.0579, + "step": 205 + }, + { + "epoch": 22.88888888888889, + "grad_norm": 4.087749004364014, + "learning_rate": 1e-06, + "loss": 1.0957, + "step": 206 + }, + { + "epoch": 23.0, + "grad_norm": 2.7112507820129395, + "learning_rate": 1e-06, + "loss": 0.9578, + "step": 207 + }, + { + "epoch": 23.11111111111111, + "grad_norm": 1.152789831161499, + "learning_rate": 1e-06, + "loss": 0.9043, + "step": 208 + }, + { + "epoch": 23.22222222222222, + "grad_norm": 3.4849369525909424, + "learning_rate": 1e-06, + "loss": 1.1194, + "step": 209 + }, + { + "epoch": 23.333333333333332, + "grad_norm": 2.0623903274536133, + "learning_rate": 1e-06, + "loss": 0.9221, + "step": 210 + }, + { + "epoch": 23.444444444444443, + "grad_norm": 3.6974055767059326, + "learning_rate": 1e-06, + "loss": 0.9605, + "step": 211 + }, + { + "epoch": 23.555555555555557, + "grad_norm": 3.1083216667175293, + "learning_rate": 1e-06, + "loss": 1.1506, + "step": 212 + }, + { + "epoch": 23.666666666666668, + "grad_norm": 3.6169991493225098, + "learning_rate": 1e-06, + "loss": 0.9246, + "step": 213 + }, + { + "epoch": 23.77777777777778, + "grad_norm": 2.414052963256836, + "learning_rate": 1e-06, + "loss": 0.9999, + "step": 214 + }, + { + "epoch": 23.88888888888889, + "grad_norm": 2.573862314224243, + "learning_rate": 1e-06, + "loss": 0.9834, + "step": 215 + }, + { + "epoch": 24.0, + "grad_norm": 3.6944236755371094, + "learning_rate": 1e-06, + "loss": 0.8926, + "step": 216 + }, + { + "epoch": 24.11111111111111, + "grad_norm": 1.8823630809783936, + "learning_rate": 1e-06, + "loss": 0.851, + "step": 217 + }, + { + "epoch": 24.22222222222222, + "grad_norm": 5.6351399421691895, + "learning_rate": 1e-06, + "loss": 1.0933, + "step": 218 + }, + { + "epoch": 24.333333333333332, + "grad_norm": 5.663857936859131, + "learning_rate": 1e-06, + "loss": 1.0851, + "step": 219 + }, + { + "epoch": 24.444444444444443, + "grad_norm": 1.8298949003219604, + "learning_rate": 1e-06, + "loss": 0.8738, + "step": 220 + }, + { + "epoch": 24.555555555555557, + "grad_norm": 3.446504592895508, + "learning_rate": 1e-06, + "loss": 0.942, + "step": 221 + }, + { + "epoch": 24.666666666666668, + "grad_norm": 3.277195453643799, + "learning_rate": 1e-06, + "loss": 0.9986, + "step": 222 + }, + { + "epoch": 24.77777777777778, + "grad_norm": 4.210409164428711, + "learning_rate": 1e-06, + "loss": 0.8249, + "step": 223 + }, + { + "epoch": 24.88888888888889, + "grad_norm": 2.433013439178467, + "learning_rate": 1e-06, + "loss": 1.0863, + "step": 224 + }, + { + "epoch": 25.0, + "grad_norm": 2.323624610900879, + "learning_rate": 1e-06, + "loss": 0.9202, + "step": 225 + }, + { + "epoch": 25.11111111111111, + "grad_norm": 2.843452215194702, + "learning_rate": 1e-06, + "loss": 1.0604, + "step": 226 + }, + { + "epoch": 25.22222222222222, + "grad_norm": 2.136242151260376, + "learning_rate": 1e-06, + "loss": 0.8826, + "step": 227 + }, + { + "epoch": 25.333333333333332, + "grad_norm": 2.374464511871338, + "learning_rate": 1e-06, + "loss": 0.9877, + "step": 228 + }, + { + "epoch": 
25.444444444444443, + "grad_norm": 2.374464511871338, + "learning_rate": 1e-06, + "loss": 1.0213, + "step": 229 + }, + { + "epoch": 25.555555555555557, + "grad_norm": 3.0793685913085938, + "learning_rate": 1e-06, + "loss": 1.0105, + "step": 230 + }, + { + "epoch": 25.666666666666668, + "grad_norm": 3.4420270919799805, + "learning_rate": 1e-06, + "loss": 0.7543, + "step": 231 + }, + { + "epoch": 25.77777777777778, + "grad_norm": 2.5529704093933105, + "learning_rate": 1e-06, + "loss": 0.9346, + "step": 232 + }, + { + "epoch": 25.88888888888889, + "grad_norm": 10.825578689575195, + "learning_rate": 1e-06, + "loss": 1.0162, + "step": 233 + }, + { + "epoch": 26.0, + "grad_norm": 4.381101131439209, + "learning_rate": 1e-06, + "loss": 1.0626, + "step": 234 + }, + { + "epoch": 26.11111111111111, + "grad_norm": 5.618079662322998, + "learning_rate": 1e-06, + "loss": 0.919, + "step": 235 + }, + { + "epoch": 26.22222222222222, + "grad_norm": 3.3376009464263916, + "learning_rate": 1e-06, + "loss": 1.0321, + "step": 236 + }, + { + "epoch": 26.333333333333332, + "grad_norm": 2.4334495067596436, + "learning_rate": 1e-06, + "loss": 0.988, + "step": 237 + }, + { + "epoch": 26.444444444444443, + "grad_norm": 1.8198598623275757, + "learning_rate": 1e-06, + "loss": 1.0523, + "step": 238 + }, + { + "epoch": 26.555555555555557, + "grad_norm": 3.1344549655914307, + "learning_rate": 1e-06, + "loss": 0.9327, + "step": 239 + }, + { + "epoch": 26.666666666666668, + "grad_norm": 2.3528554439544678, + "learning_rate": 1e-06, + "loss": 1.0075, + "step": 240 + }, + { + "epoch": 26.77777777777778, + "grad_norm": 4.295862674713135, + "learning_rate": 1e-06, + "loss": 0.9734, + "step": 241 + }, + { + "epoch": 26.88888888888889, + "grad_norm": 3.496724843978882, + "learning_rate": 1e-06, + "loss": 0.8748, + "step": 242 + }, + { + "epoch": 27.0, + "grad_norm": 6.702414035797119, + "learning_rate": 1e-06, + "loss": 0.7779, + "step": 243 + }, + { + "epoch": 27.11111111111111, + "grad_norm": 2.3519527912139893, + "learning_rate": 1e-06, + "loss": 0.8735, + "step": 244 + }, + { + "epoch": 27.22222222222222, + "grad_norm": 3.316526412963867, + "learning_rate": 1e-06, + "loss": 0.8648, + "step": 245 + }, + { + "epoch": 27.333333333333332, + "grad_norm": 3.897092819213867, + "learning_rate": 1e-06, + "loss": 1.0248, + "step": 246 + }, + { + "epoch": 27.444444444444443, + "grad_norm": 1.909849762916565, + "learning_rate": 1e-06, + "loss": 0.9826, + "step": 247 + }, + { + "epoch": 27.555555555555557, + "grad_norm": 3.588510274887085, + "learning_rate": 1e-06, + "loss": 0.9968, + "step": 248 + }, + { + "epoch": 27.666666666666668, + "grad_norm": 3.782503128051758, + "learning_rate": 1e-06, + "loss": 0.8814, + "step": 249 + }, + { + "epoch": 27.77777777777778, + "grad_norm": 2.2527377605438232, + "learning_rate": 1e-06, + "loss": 0.9562, + "step": 250 + }, + { + "epoch": 27.88888888888889, + "grad_norm": 2.978278636932373, + "learning_rate": 1e-06, + "loss": 0.9675, + "step": 251 + }, + { + "epoch": 28.0, + "grad_norm": 4.310064792633057, + "learning_rate": 1e-06, + "loss": 0.901, + "step": 252 + }, + { + "epoch": 28.11111111111111, + "grad_norm": 4.16270637512207, + "learning_rate": 1e-06, + "loss": 0.9563, + "step": 253 + }, + { + "epoch": 28.22222222222222, + "grad_norm": 6.249475479125977, + "learning_rate": 1e-06, + "loss": 0.9267, + "step": 254 + }, + { + "epoch": 28.333333333333332, + "grad_norm": 3.0315773487091064, + "learning_rate": 1e-06, + "loss": 0.8543, + "step": 255 + }, + { + "epoch": 28.444444444444443, + "grad_norm": 
1.769006609916687, + "learning_rate": 1e-06, + "loss": 0.9372, + "step": 256 + }, + { + "epoch": 28.555555555555557, + "grad_norm": 2.3356363773345947, + "learning_rate": 1e-06, + "loss": 0.953, + "step": 257 + }, + { + "epoch": 28.666666666666668, + "grad_norm": 4.292929649353027, + "learning_rate": 1e-06, + "loss": 0.9995, + "step": 258 + }, + { + "epoch": 28.77777777777778, + "grad_norm": 3.0659830570220947, + "learning_rate": 1e-06, + "loss": 0.9725, + "step": 259 + }, + { + "epoch": 28.88888888888889, + "grad_norm": 5.2314772605896, + "learning_rate": 1e-06, + "loss": 0.8234, + "step": 260 + }, + { + "epoch": 29.0, + "grad_norm": 3.8846657276153564, + "learning_rate": 1e-06, + "loss": 0.9401, + "step": 261 + }, + { + "epoch": 29.11111111111111, + "grad_norm": 4.647997856140137, + "learning_rate": 1e-06, + "loss": 0.9284, + "step": 262 + }, + { + "epoch": 29.22222222222222, + "grad_norm": 2.3509411811828613, + "learning_rate": 1e-06, + "loss": 1.0657, + "step": 263 + }, + { + "epoch": 29.333333333333332, + "grad_norm": 1.5279780626296997, + "learning_rate": 1e-06, + "loss": 0.8736, + "step": 264 + }, + { + "epoch": 29.444444444444443, + "grad_norm": 7.212976932525635, + "learning_rate": 1e-06, + "loss": 0.8539, + "step": 265 + }, + { + "epoch": 29.555555555555557, + "grad_norm": 3.6765549182891846, + "learning_rate": 1e-06, + "loss": 0.7891, + "step": 266 + }, + { + "epoch": 29.666666666666668, + "grad_norm": 5.6958489418029785, + "learning_rate": 1e-06, + "loss": 0.8773, + "step": 267 + }, + { + "epoch": 29.77777777777778, + "grad_norm": 5.398871421813965, + "learning_rate": 1e-06, + "loss": 1.0076, + "step": 268 + }, + { + "epoch": 29.88888888888889, + "grad_norm": 3.9249660968780518, + "learning_rate": 1e-06, + "loss": 0.8878, + "step": 269 + }, + { + "epoch": 30.0, + "grad_norm": 5.3084893226623535, + "learning_rate": 1e-06, + "loss": 0.9419, + "step": 270 + }, + { + "epoch": 30.11111111111111, + "grad_norm": 2.6671621799468994, + "learning_rate": 1e-06, + "loss": 0.8644, + "step": 271 + }, + { + "epoch": 30.22222222222222, + "grad_norm": 3.628174066543579, + "learning_rate": 1e-06, + "loss": 0.9431, + "step": 272 + }, + { + "epoch": 30.333333333333332, + "grad_norm": 2.30537748336792, + "learning_rate": 1e-06, + "loss": 0.9341, + "step": 273 + }, + { + "epoch": 30.444444444444443, + "grad_norm": 3.2564539909362793, + "learning_rate": 1e-06, + "loss": 0.9427, + "step": 274 + }, + { + "epoch": 30.555555555555557, + "grad_norm": 2.057532787322998, + "learning_rate": 1e-06, + "loss": 0.9639, + "step": 275 + }, + { + "epoch": 30.666666666666668, + "grad_norm": 7.121829986572266, + "learning_rate": 1e-06, + "loss": 0.7816, + "step": 276 + }, + { + "epoch": 30.77777777777778, + "grad_norm": 9.727752685546875, + "learning_rate": 1e-06, + "loss": 0.9659, + "step": 277 + }, + { + "epoch": 30.88888888888889, + "grad_norm": 2.522280693054199, + "learning_rate": 1e-06, + "loss": 0.8694, + "step": 278 + }, + { + "epoch": 31.0, + "grad_norm": 5.1409101486206055, + "learning_rate": 1e-06, + "loss": 0.837, + "step": 279 + }, + { + "epoch": 31.11111111111111, + "grad_norm": 2.0206334590911865, + "learning_rate": 1e-06, + "loss": 0.9721, + "step": 280 + }, + { + "epoch": 31.22222222222222, + "grad_norm": 4.694882869720459, + "learning_rate": 1e-06, + "loss": 1.0138, + "step": 281 + }, + { + "epoch": 31.333333333333332, + "grad_norm": 4.221930980682373, + "learning_rate": 1e-06, + "loss": 0.7925, + "step": 282 + }, + { + "epoch": 31.444444444444443, + "grad_norm": 2.9225778579711914, + 
"learning_rate": 1e-06, + "loss": 0.8813, + "step": 283 + }, + { + "epoch": 31.555555555555557, + "grad_norm": 3.095292806625366, + "learning_rate": 1e-06, + "loss": 0.8084, + "step": 284 + }, + { + "epoch": 31.666666666666668, + "grad_norm": 4.435314655303955, + "learning_rate": 1e-06, + "loss": 0.9132, + "step": 285 + }, + { + "epoch": 31.77777777777778, + "grad_norm": 3.6875507831573486, + "learning_rate": 1e-06, + "loss": 0.9679, + "step": 286 + }, + { + "epoch": 31.88888888888889, + "grad_norm": 4.080234527587891, + "learning_rate": 1e-06, + "loss": 0.8634, + "step": 287 + }, + { + "epoch": 32.0, + "grad_norm": 4.801446914672852, + "learning_rate": 1e-06, + "loss": 0.9539, + "step": 288 + }, + { + "epoch": 32.111111111111114, + "grad_norm": 3.996509313583374, + "learning_rate": 1e-06, + "loss": 0.7943, + "step": 289 + }, + { + "epoch": 32.22222222222222, + "grad_norm": 4.389383792877197, + "learning_rate": 1e-06, + "loss": 0.843, + "step": 290 + }, + { + "epoch": 32.333333333333336, + "grad_norm": 2.8417623043060303, + "learning_rate": 1e-06, + "loss": 0.9002, + "step": 291 + }, + { + "epoch": 32.44444444444444, + "grad_norm": 10.290750503540039, + "learning_rate": 1e-06, + "loss": 0.9362, + "step": 292 + }, + { + "epoch": 32.55555555555556, + "grad_norm": 3.2090003490448, + "learning_rate": 1e-06, + "loss": 0.9604, + "step": 293 + }, + { + "epoch": 32.666666666666664, + "grad_norm": 7.777849197387695, + "learning_rate": 1e-06, + "loss": 0.8798, + "step": 294 + }, + { + "epoch": 32.77777777777778, + "grad_norm": 5.8683929443359375, + "learning_rate": 1e-06, + "loss": 0.9205, + "step": 295 + }, + { + "epoch": 32.888888888888886, + "grad_norm": 5.062497138977051, + "learning_rate": 1e-06, + "loss": 0.8351, + "step": 296 + }, + { + "epoch": 33.0, + "grad_norm": 8.156352996826172, + "learning_rate": 1e-06, + "loss": 0.7595, + "step": 297 + }, + { + "epoch": 33.111111111111114, + "grad_norm": 3.4869048595428467, + "learning_rate": 1e-06, + "loss": 1.0815, + "step": 298 + }, + { + "epoch": 33.22222222222222, + "grad_norm": 6.1906280517578125, + "learning_rate": 1e-06, + "loss": 0.7957, + "step": 299 + }, + { + "epoch": 33.333333333333336, + "grad_norm": 4.029522895812988, + "learning_rate": 1e-06, + "loss": 0.8982, + "step": 300 + }, + { + "epoch": 33.44444444444444, + "grad_norm": 2.7688729763031006, + "learning_rate": 1e-06, + "loss": 0.8566, + "step": 301 + }, + { + "epoch": 33.55555555555556, + "grad_norm": 6.378791809082031, + "learning_rate": 1e-06, + "loss": 0.7817, + "step": 302 + }, + { + "epoch": 33.666666666666664, + "grad_norm": 3.277953863143921, + "learning_rate": 1e-06, + "loss": 0.943, + "step": 303 + }, + { + "epoch": 33.77777777777778, + "grad_norm": 4.151890277862549, + "learning_rate": 1e-06, + "loss": 0.8942, + "step": 304 + }, + { + "epoch": 33.888888888888886, + "grad_norm": 6.878890037536621, + "learning_rate": 1e-06, + "loss": 0.8607, + "step": 305 + }, + { + "epoch": 34.0, + "grad_norm": 3.927706480026245, + "learning_rate": 1e-06, + "loss": 0.9639, + "step": 306 + }, + { + "epoch": 34.111111111111114, + "grad_norm": 2.0341031551361084, + "learning_rate": 1e-06, + "loss": 0.8334, + "step": 307 + }, + { + "epoch": 34.22222222222222, + "grad_norm": 5.6038923263549805, + "learning_rate": 1e-06, + "loss": 0.8488, + "step": 308 + }, + { + "epoch": 34.333333333333336, + "grad_norm": 4.395098686218262, + "learning_rate": 1e-06, + "loss": 0.9672, + "step": 309 + }, + { + "epoch": 34.44444444444444, + "grad_norm": 3.8672053813934326, + "learning_rate": 1e-06, + "loss": 
0.9002, + "step": 310 + }, + { + "epoch": 34.55555555555556, + "grad_norm": 8.810863494873047, + "learning_rate": 1e-06, + "loss": 0.7316, + "step": 311 + }, + { + "epoch": 34.666666666666664, + "grad_norm": 8.220240592956543, + "learning_rate": 1e-06, + "loss": 0.9079, + "step": 312 + }, + { + "epoch": 34.77777777777778, + "grad_norm": 4.171933174133301, + "learning_rate": 1e-06, + "loss": 0.851, + "step": 313 + }, + { + "epoch": 34.888888888888886, + "grad_norm": 3.8519346714019775, + "learning_rate": 1e-06, + "loss": 0.8641, + "step": 314 + }, + { + "epoch": 35.0, + "grad_norm": 3.0927493572235107, + "learning_rate": 1e-06, + "loss": 0.7855, + "step": 315 + }, + { + "epoch": 35.111111111111114, + "grad_norm": 3.8645637035369873, + "learning_rate": 1e-06, + "loss": 0.9452, + "step": 316 + }, + { + "epoch": 35.22222222222222, + "grad_norm": 3.6525685787200928, + "learning_rate": 1e-06, + "loss": 0.8575, + "step": 317 + }, + { + "epoch": 35.333333333333336, + "grad_norm": 4.366754055023193, + "learning_rate": 1e-06, + "loss": 0.8506, + "step": 318 + }, + { + "epoch": 35.44444444444444, + "grad_norm": 3.3835811614990234, + "learning_rate": 1e-06, + "loss": 0.7879, + "step": 319 + }, + { + "epoch": 35.55555555555556, + "grad_norm": 5.200087070465088, + "learning_rate": 1e-06, + "loss": 0.9133, + "step": 320 + }, + { + "epoch": 35.666666666666664, + "grad_norm": 4.246176719665527, + "learning_rate": 1e-06, + "loss": 0.9067, + "step": 321 + }, + { + "epoch": 35.77777777777778, + "grad_norm": 4.595211982727051, + "learning_rate": 1e-06, + "loss": 0.7934, + "step": 322 + }, + { + "epoch": 35.888888888888886, + "grad_norm": 10.199066162109375, + "learning_rate": 1e-06, + "loss": 0.7892, + "step": 323 + }, + { + "epoch": 36.0, + "grad_norm": 5.402686595916748, + "learning_rate": 1e-06, + "loss": 0.9573, + "step": 324 + }, + { + "epoch": 36.111111111111114, + "grad_norm": 4.45036506652832, + "learning_rate": 1e-06, + "loss": 0.8776, + "step": 325 + }, + { + "epoch": 36.22222222222222, + "grad_norm": 4.292491436004639, + "learning_rate": 1e-06, + "loss": 0.7338, + "step": 326 + }, + { + "epoch": 36.333333333333336, + "grad_norm": 8.217951774597168, + "learning_rate": 1e-06, + "loss": 0.8087, + "step": 327 + }, + { + "epoch": 36.44444444444444, + "grad_norm": 3.9634268283843994, + "learning_rate": 1e-06, + "loss": 0.9723, + "step": 328 + }, + { + "epoch": 36.55555555555556, + "grad_norm": 7.8139753341674805, + "learning_rate": 1e-06, + "loss": 0.831, + "step": 329 + }, + { + "epoch": 36.666666666666664, + "grad_norm": 10.394512176513672, + "learning_rate": 1e-06, + "loss": 0.9424, + "step": 330 + }, + { + "epoch": 36.77777777777778, + "grad_norm": 7.358921527862549, + "learning_rate": 1e-06, + "loss": 0.8161, + "step": 331 + }, + { + "epoch": 36.888888888888886, + "grad_norm": 3.9476616382598877, + "learning_rate": 1e-06, + "loss": 0.8154, + "step": 332 + }, + { + "epoch": 37.0, + "grad_norm": 6.964223861694336, + "learning_rate": 1e-06, + "loss": 0.831, + "step": 333 + }, + { + "epoch": 37.111111111111114, + "grad_norm": 4.15282678604126, + "learning_rate": 1e-06, + "loss": 0.8328, + "step": 334 + }, + { + "epoch": 37.22222222222222, + "grad_norm": 7.021949768066406, + "learning_rate": 1e-06, + "loss": 0.9247, + "step": 335 + }, + { + "epoch": 37.333333333333336, + "grad_norm": 3.2167723178863525, + "learning_rate": 1e-06, + "loss": 0.865, + "step": 336 + }, + { + "epoch": 37.44444444444444, + "grad_norm": 7.6068434715271, + "learning_rate": 1e-06, + "loss": 0.8634, + "step": 337 + }, + { + "epoch": 
37.55555555555556,
      "grad_norm": 5.761429309844971,
      "learning_rate": 1e-06,
      "loss": 0.8278,
      "step": 338
    },

    ... (log entries for steps 339-908 omitted: the learning rate stays fixed at 1e-06 throughout, while the training loss drifts down from roughly 0.7-0.9 around epoch 38 to roughly 0.1-0.45 around epoch 100, with occasional grad_norm spikes) ...

    {
      "epoch": 101.0,
      "grad_norm": 25.145294189453125,
      "learning_rate": 1e-06,
      "loss": 0.3043,
      "step": 909
    },
    {
      "epoch": 101.11111111111111,
      "grad_norm": 12.301963806152344,
      "learning_rate": 1e-06,
      "loss": 0.1456,
      "step": 910
    },
    {
      "epoch": 101.22222222222223,
"grad_norm": 7.447351932525635, + "learning_rate": 1e-06, + "loss": 0.0786, + "step": 911 + }, + { + "epoch": 101.33333333333333, + "grad_norm": 11.069771766662598, + "learning_rate": 1e-06, + "loss": 0.3954, + "step": 912 + }, + { + "epoch": 101.44444444444444, + "grad_norm": 32.85992431640625, + "learning_rate": 1e-06, + "loss": 0.3386, + "step": 913 + }, + { + "epoch": 101.55555555555556, + "grad_norm": 17.169145584106445, + "learning_rate": 1e-06, + "loss": 0.4006, + "step": 914 + }, + { + "epoch": 101.66666666666667, + "grad_norm": 11.61205005645752, + "learning_rate": 1e-06, + "loss": 0.238, + "step": 915 + }, + { + "epoch": 101.77777777777777, + "grad_norm": 22.558744430541992, + "learning_rate": 1e-06, + "loss": 0.257, + "step": 916 + }, + { + "epoch": 101.88888888888889, + "grad_norm": 21.26172637939453, + "learning_rate": 1e-06, + "loss": 0.1269, + "step": 917 + }, + { + "epoch": 102.0, + "grad_norm": 28.676082611083984, + "learning_rate": 1e-06, + "loss": 0.0849, + "step": 918 + }, + { + "epoch": 102.11111111111111, + "grad_norm": 10.261528015136719, + "learning_rate": 1e-06, + "loss": 0.3604, + "step": 919 + }, + { + "epoch": 102.22222222222223, + "grad_norm": 12.20177173614502, + "learning_rate": 1e-06, + "loss": 0.0829, + "step": 920 + }, + { + "epoch": 102.33333333333333, + "grad_norm": 10.046985626220703, + "learning_rate": 1e-06, + "loss": 0.3196, + "step": 921 + }, + { + "epoch": 102.44444444444444, + "grad_norm": 41.6318473815918, + "learning_rate": 1e-06, + "loss": 0.145, + "step": 922 + }, + { + "epoch": 102.55555555555556, + "grad_norm": 27.263620376586914, + "learning_rate": 1e-06, + "loss": 0.0957, + "step": 923 + }, + { + "epoch": 102.66666666666667, + "grad_norm": 12.716344833374023, + "learning_rate": 1e-06, + "loss": 0.3408, + "step": 924 + }, + { + "epoch": 102.77777777777777, + "grad_norm": 12.407052040100098, + "learning_rate": 1e-06, + "loss": 0.209, + "step": 925 + }, + { + "epoch": 102.88888888888889, + "grad_norm": 18.274276733398438, + "learning_rate": 1e-06, + "loss": 0.313, + "step": 926 + }, + { + "epoch": 103.0, + "grad_norm": 14.834152221679688, + "learning_rate": 1e-06, + "loss": 0.4978, + "step": 927 + }, + { + "epoch": 103.11111111111111, + "grad_norm": 10.276154518127441, + "learning_rate": 1e-06, + "loss": 0.2751, + "step": 928 + }, + { + "epoch": 103.22222222222223, + "grad_norm": 8.359628677368164, + "learning_rate": 1e-06, + "loss": 0.0497, + "step": 929 + }, + { + "epoch": 103.33333333333333, + "grad_norm": 19.589160919189453, + "learning_rate": 1e-06, + "loss": 0.279, + "step": 930 + }, + { + "epoch": 103.44444444444444, + "grad_norm": 8.969367980957031, + "learning_rate": 1e-06, + "loss": 0.2477, + "step": 931 + }, + { + "epoch": 103.55555555555556, + "grad_norm": 11.00221061706543, + "learning_rate": 1e-06, + "loss": 0.2791, + "step": 932 + }, + { + "epoch": 103.66666666666667, + "grad_norm": 39.352291107177734, + "learning_rate": 1e-06, + "loss": 0.3288, + "step": 933 + }, + { + "epoch": 103.77777777777777, + "grad_norm": 14.972265243530273, + "learning_rate": 1e-06, + "loss": 0.0409, + "step": 934 + }, + { + "epoch": 103.88888888888889, + "grad_norm": 27.021085739135742, + "learning_rate": 1e-06, + "loss": 0.365, + "step": 935 + }, + { + "epoch": 104.0, + "grad_norm": 18.456432342529297, + "learning_rate": 1e-06, + "loss": 0.4129, + "step": 936 + }, + { + "epoch": 104.11111111111111, + "grad_norm": 11.49702262878418, + "learning_rate": 1e-06, + "loss": 0.3308, + "step": 937 + }, + { + "epoch": 104.22222222222223, + "grad_norm": 
14.58060359954834, + "learning_rate": 1e-06, + "loss": 0.2145, + "step": 938 + }, + { + "epoch": 104.33333333333333, + "grad_norm": 11.75944709777832, + "learning_rate": 1e-06, + "loss": 0.2745, + "step": 939 + }, + { + "epoch": 104.44444444444444, + "grad_norm": 22.57122230529785, + "learning_rate": 1e-06, + "loss": 0.2452, + "step": 940 + }, + { + "epoch": 104.55555555555556, + "grad_norm": 25.197404861450195, + "learning_rate": 1e-06, + "loss": 0.3641, + "step": 941 + }, + { + "epoch": 104.66666666666667, + "grad_norm": 12.267632484436035, + "learning_rate": 1e-06, + "loss": 0.1309, + "step": 942 + }, + { + "epoch": 104.77777777777777, + "grad_norm": 14.208633422851562, + "learning_rate": 1e-06, + "loss": 0.2382, + "step": 943 + }, + { + "epoch": 104.88888888888889, + "grad_norm": 5.930640697479248, + "learning_rate": 1e-06, + "loss": 0.0524, + "step": 944 + }, + { + "epoch": 105.0, + "grad_norm": 9.436129570007324, + "learning_rate": 1e-06, + "loss": 0.2492, + "step": 945 + }, + { + "epoch": 105.11111111111111, + "grad_norm": 7.6014838218688965, + "learning_rate": 1e-06, + "loss": 0.2235, + "step": 946 + }, + { + "epoch": 105.22222222222223, + "grad_norm": 12.748156547546387, + "learning_rate": 1e-06, + "loss": 0.1283, + "step": 947 + }, + { + "epoch": 105.33333333333333, + "grad_norm": 12.893329620361328, + "learning_rate": 1e-06, + "loss": 0.3226, + "step": 948 + }, + { + "epoch": 105.44444444444444, + "grad_norm": 21.034143447875977, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 949 + }, + { + "epoch": 105.55555555555556, + "grad_norm": 15.541600227355957, + "learning_rate": 1e-06, + "loss": 0.2206, + "step": 950 + }, + { + "epoch": 105.66666666666667, + "grad_norm": 19.584583282470703, + "learning_rate": 1e-06, + "loss": 0.54, + "step": 951 + }, + { + "epoch": 105.77777777777777, + "grad_norm": 9.456072807312012, + "learning_rate": 1e-06, + "loss": 0.1264, + "step": 952 + }, + { + "epoch": 105.88888888888889, + "grad_norm": 23.224267959594727, + "learning_rate": 1e-06, + "loss": 0.2532, + "step": 953 + }, + { + "epoch": 106.0, + "grad_norm": 30.81361961364746, + "learning_rate": 1e-06, + "loss": 0.1572, + "step": 954 + }, + { + "epoch": 106.11111111111111, + "grad_norm": 11.119683265686035, + "learning_rate": 1e-06, + "loss": 0.4473, + "step": 955 + }, + { + "epoch": 106.22222222222223, + "grad_norm": 12.288918495178223, + "learning_rate": 1e-06, + "loss": 0.1827, + "step": 956 + }, + { + "epoch": 106.33333333333333, + "grad_norm": 7.512098789215088, + "learning_rate": 1e-06, + "loss": 0.3499, + "step": 957 + }, + { + "epoch": 106.44444444444444, + "grad_norm": 16.369300842285156, + "learning_rate": 1e-06, + "loss": 0.1207, + "step": 958 + }, + { + "epoch": 106.55555555555556, + "grad_norm": 12.884987831115723, + "learning_rate": 1e-06, + "loss": 0.3294, + "step": 959 + }, + { + "epoch": 106.66666666666667, + "grad_norm": 6.706182956695557, + "learning_rate": 1e-06, + "loss": 0.0668, + "step": 960 + }, + { + "epoch": 106.77777777777777, + "grad_norm": 22.22920036315918, + "learning_rate": 1e-06, + "loss": 0.1696, + "step": 961 + }, + { + "epoch": 106.88888888888889, + "grad_norm": 14.130997657775879, + "learning_rate": 1e-06, + "loss": 0.1496, + "step": 962 + }, + { + "epoch": 107.0, + "grad_norm": 11.65090274810791, + "learning_rate": 1e-06, + "loss": 0.3851, + "step": 963 + }, + { + "epoch": 107.11111111111111, + "grad_norm": 6.85329008102417, + "learning_rate": 1e-06, + "loss": 0.0715, + "step": 964 + }, + { + "epoch": 107.22222222222223, + "grad_norm": 4.908586502075195, 
+ "learning_rate": 1e-06, + "loss": 0.3803, + "step": 965 + }, + { + "epoch": 107.33333333333333, + "grad_norm": 9.561410903930664, + "learning_rate": 1e-06, + "loss": 0.3484, + "step": 966 + }, + { + "epoch": 107.44444444444444, + "grad_norm": 25.01987075805664, + "learning_rate": 1e-06, + "loss": 0.1648, + "step": 967 + }, + { + "epoch": 107.55555555555556, + "grad_norm": 27.558963775634766, + "learning_rate": 1e-06, + "loss": 0.1962, + "step": 968 + }, + { + "epoch": 107.66666666666667, + "grad_norm": 24.73050880432129, + "learning_rate": 1e-06, + "loss": 0.1454, + "step": 969 + }, + { + "epoch": 107.77777777777777, + "grad_norm": 17.136043548583984, + "learning_rate": 1e-06, + "loss": 0.0965, + "step": 970 + }, + { + "epoch": 107.88888888888889, + "grad_norm": 11.313124656677246, + "learning_rate": 1e-06, + "loss": 0.261, + "step": 971 + }, + { + "epoch": 108.0, + "grad_norm": 16.987947463989258, + "learning_rate": 1e-06, + "loss": 0.1997, + "step": 972 + }, + { + "epoch": 108.11111111111111, + "grad_norm": 12.19642448425293, + "learning_rate": 1e-06, + "loss": 0.2363, + "step": 973 + }, + { + "epoch": 108.22222222222223, + "grad_norm": 14.916790962219238, + "learning_rate": 1e-06, + "loss": 0.219, + "step": 974 + }, + { + "epoch": 108.33333333333333, + "grad_norm": 20.122058868408203, + "learning_rate": 1e-06, + "loss": 0.3293, + "step": 975 + }, + { + "epoch": 108.44444444444444, + "grad_norm": 8.060060501098633, + "learning_rate": 1e-06, + "loss": 0.2389, + "step": 976 + }, + { + "epoch": 108.55555555555556, + "grad_norm": 5.7434234619140625, + "learning_rate": 1e-06, + "loss": 0.1949, + "step": 977 + }, + { + "epoch": 108.66666666666667, + "grad_norm": 13.84915828704834, + "learning_rate": 1e-06, + "loss": 0.116, + "step": 978 + }, + { + "epoch": 108.77777777777777, + "grad_norm": 19.048036575317383, + "learning_rate": 1e-06, + "loss": 0.1587, + "step": 979 + }, + { + "epoch": 108.88888888888889, + "grad_norm": 23.46514892578125, + "learning_rate": 1e-06, + "loss": 0.2714, + "step": 980 + }, + { + "epoch": 109.0, + "grad_norm": 17.62268829345703, + "learning_rate": 1e-06, + "loss": 0.2097, + "step": 981 + }, + { + "epoch": 109.11111111111111, + "grad_norm": 14.01596450805664, + "learning_rate": 1e-06, + "loss": 0.191, + "step": 982 + }, + { + "epoch": 109.22222222222223, + "grad_norm": 11.19281005859375, + "learning_rate": 1e-06, + "loss": 0.4491, + "step": 983 + }, + { + "epoch": 109.33333333333333, + "grad_norm": 11.7758150100708, + "learning_rate": 1e-06, + "loss": 0.1858, + "step": 984 + }, + { + "epoch": 109.44444444444444, + "grad_norm": 11.7758150100708, + "learning_rate": 1e-06, + "loss": 0.0841, + "step": 985 + }, + { + "epoch": 109.55555555555556, + "grad_norm": 16.89055824279785, + "learning_rate": 1e-06, + "loss": 0.2884, + "step": 986 + }, + { + "epoch": 109.66666666666667, + "grad_norm": 12.270724296569824, + "learning_rate": 1e-06, + "loss": 0.1485, + "step": 987 + }, + { + "epoch": 109.77777777777777, + "grad_norm": 11.03988265991211, + "learning_rate": 1e-06, + "loss": 0.1951, + "step": 988 + }, + { + "epoch": 109.88888888888889, + "grad_norm": 26.165496826171875, + "learning_rate": 1e-06, + "loss": 0.1627, + "step": 989 + }, + { + "epoch": 110.0, + "grad_norm": 20.040264129638672, + "learning_rate": 1e-06, + "loss": 0.1006, + "step": 990 + }, + { + "epoch": 110.11111111111111, + "grad_norm": 8.513829231262207, + "learning_rate": 1e-06, + "loss": 0.2366, + "step": 991 + }, + { + "epoch": 110.22222222222223, + "grad_norm": 8.618755340576172, + "learning_rate": 
1e-06, + "loss": 0.2203, + "step": 992 + }, + { + "epoch": 110.33333333333333, + "grad_norm": 10.539334297180176, + "learning_rate": 1e-06, + "loss": 0.1976, + "step": 993 + }, + { + "epoch": 110.44444444444444, + "grad_norm": 21.284353256225586, + "learning_rate": 1e-06, + "loss": 0.2486, + "step": 994 + }, + { + "epoch": 110.55555555555556, + "grad_norm": 9.325624465942383, + "learning_rate": 1e-06, + "loss": 0.0889, + "step": 995 + }, + { + "epoch": 110.66666666666667, + "grad_norm": 23.49601173400879, + "learning_rate": 1e-06, + "loss": 0.3622, + "step": 996 + }, + { + "epoch": 110.77777777777777, + "grad_norm": 10.95486068725586, + "learning_rate": 1e-06, + "loss": 0.1565, + "step": 997 + }, + { + "epoch": 110.88888888888889, + "grad_norm": 12.906221389770508, + "learning_rate": 1e-06, + "loss": 0.064, + "step": 998 + }, + { + "epoch": 110.88888888888889, + "step": 998, + "total_flos": 5.636802804396851e+16, + "train_loss": 0.6520614204200391, + "train_runtime": 2665.3891, + "train_samples_per_second": 2.247, + "train_steps_per_second": 0.374 + } + ], + "logging_steps": 1.0, + "max_steps": 998, + "num_input_tokens_seen": 0, + "num_train_epochs": 111, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.636802804396851e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/training_args.bin b/MiniCPM-2-V/output/output_minicpmv2_lora/training_args.bin new file mode 100644 index 00000000..b0420903 Binary files /dev/null and b/MiniCPM-2-V/output/output_minicpmv2_lora/training_args.bin differ diff --git a/MiniCPM-2-V/output/output_minicpmv2_lora/vpm_resampler_embedtokens.pt b/MiniCPM-2-V/output/output_minicpmv2_lora/vpm_resampler_embedtokens.pt new file mode 100644 index 00000000..65418810 Binary files /dev/null and b/MiniCPM-2-V/output/output_minicpmv2_lora/vpm_resampler_embedtokens.pt differ diff --git a/MiniCPM-2-V/requirement/DeepSpeed b/MiniCPM-2-V/requirement/DeepSpeed new file mode 160000 index 00000000..b6e24adb --- /dev/null +++ b/MiniCPM-2-V/requirement/DeepSpeed @@ -0,0 +1 @@ +Subproject commit b6e24adb43257628592aaaa772c328efac30f797 diff --git a/MiniCPM-2-V/requirement/sentencepiece b/MiniCPM-2-V/requirement/sentencepiece new file mode 160000 index 00000000..6225e08e --- /dev/null +++ b/MiniCPM-2-V/requirement/sentencepiece @@ -0,0 +1 @@ +Subproject commit 6225e08edb2577757163b3f5dbba4c0b670ef445 diff --git a/MiniCPM-2-V/trainer.py b/MiniCPM-2-V/trainer.py new file mode 100644 index 00000000..bea2eff5 --- /dev/null +++ b/MiniCPM-2-V/trainer.py @@ -0,0 +1,271 @@ +import torch +import torch.nn as nn +import deepspeed +from transformers import Trainer +from transformers.trainer_pt_utils import nested_detach +from transformers.utils import is_sagemaker_mp_enabled +from transformers.trainer import * +import deepspeed +from transformers.integrations import is_deepspeed_zero3_enabled + +class CPMTrainer(Trainer): + def compute_loss(self, model, inputs, return_outputs=False): + if "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + self.model.resampler.pos_embed = self.model.resampler.pos_embed.to(self.model.device) + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.GatheredParameters(self.model.resampler.attn.parameters(), modifier_rank=0): + if not 
self.args.use_lora: + outputs = self.model(data = inputs, use_cache=False) + else: + with self.model._enable_peft_forward_hooks(**inputs): + outputs = self.model.base_model(data = inputs, use_cache=False) + else: + if not self.args.use_lora: + outputs = self.model(data = inputs, use_cache=False) + else: + with self.model._enable_peft_forward_hooks(**inputs): + outputs = self.model.base_model(data = inputs, use_cache=False) + + if labels is not None: + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + logits = outputs.logits.view(-1, + self.model.config.vocab_size).contiguous() + labels = labels.view(-1).long().contiguous() + # Enable model parallelism + labels = labels.to(logits.device) + loss = loss_fct(logits, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model: nn.Module, + inputs: Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[List[str]] = None, + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + + Return: + Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + logits and labels (each being optional). + """ + has_labels = ( + False + if len(self.label_names) == 0 + else all(inputs.get(k) is not None for k in self.label_names) + ) + # For CLIP-like models capable of returning loss values. + # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` + # is `True` in `model.forward`. + return_loss = inputs.get("return_loss", None) + if return_loss is None: + return_loss = self.can_return_loss + loss_without_labels = ( + True if len(self.label_names) == 0 and return_loss else False + ) + + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr( + self.model.config, "keys_to_ignore_at_inference", [] + ) + else: + ignore_keys = [] + + # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. 
+ if has_labels or loss_without_labels: + labels = nested_detach(tuple(inputs.get(name) + for name in self.label_names)) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + with torch.no_grad(): + if is_sagemaker_mp_enabled(): + raw_outputs = smp_forward_only(model, inputs) + if has_labels or loss_without_labels: + if isinstance(raw_outputs, dict): + loss_mb = raw_outputs["loss"] + logits_mb = tuple( + v + for k, v in raw_outputs.items() + if k not in ignore_keys + ["loss"] + ) + else: + loss_mb = raw_outputs[0] + logits_mb = raw_outputs[1:] + + loss = loss_mb.reduce_mean().detach().cpu() + logits = smp_nested_concat(logits_mb) + else: + loss = None + if isinstance(raw_outputs, dict): + logits_mb = tuple( + v for k, v in raw_outputs.items() if k not in ignore_keys + ) + else: + logits_mb = raw_outputs + logits = smp_nested_concat(logits_mb) + else: + if has_labels or loss_without_labels: + with self.compute_loss_context_manager(): + loss, outputs = self.compute_loss( + model, inputs, return_outputs=True + ) + loss = loss.mean().detach() + + if isinstance(outputs, dict): + logits = tuple( + v + for k, v in outputs.items() + if k not in ignore_keys + ["loss"] + ) + else: + logits = outputs[1:] + else: + loss = None + with self.compute_loss_context_manager(): + outputs = model(**inputs) + if isinstance(outputs, dict): + logits = tuple( + v for k, v in outputs.items() if k not in ignore_keys + ) + else: + logits = outputs + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index - 1] + + if prediction_loss_only: + return (loss, None, None) + + logits = nested_detach(logits) + if len(logits) == 1: + logits = logits[0] + + return (loss, logits, labels) + + def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + """ + Perform a training step on a batch of inputs. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to train. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `torch.Tensor`: The tensor with training loss on this batch. + """ + model.train() + inputs = self._prepare_inputs(inputs) + + if is_sagemaker_mp_enabled(): + loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps) + return loss_mb.reduce_mean().detach().to(self.args.device) + + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs) + + del inputs + torch.cuda.empty_cache() + + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + if is_deepspeed_zero3_enabled(): + with deepspeed.zero.GatheredParameters(self.model.resampler.attn.parameters(), modifier_rank=0): + self.accelerator.backward(loss) + else: + self.accelerator.backward(loss) + + return loss.detach() / self.args.gradient_accumulation_steps + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + # If we are executing this function, we are the process zero, so we don't check for that. 
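+        # Descriptive note (added comment, not in the upstream file): this method largely follows
+        # transformers.Trainer._save, with one addition for LoRA runs. When self.args.use_lora is
+        # set, the weights whose names contain 'vpm', 'resampler' or 'embed_tokens' are also
+        # collected and written to <output_dir>/vpm_resampler_embedtokens.pt, next to the PEFT
+        # adapter saved by save_pretrained, so the vision tower / resampler / embedding state is
+        # kept alongside the adapter.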
+ output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model checkpoint to {output_dir}") + + supported_classes = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not isinstance(self.model, supported_classes): + if state_dict is None: + state_dict = self.model.state_dict() + + if isinstance(unwrap_model(self.model), supported_classes): + unwrap_model(self.model).save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + else: + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") + if self.args.save_safetensors: + safetensors.torch.save_file( + state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"} + ) + else: + torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + else: + if self.args.use_lora: + from collections import OrderedDict + state_dict_vision = OrderedDict() + for key, values in state_dict.items(): + if 'vpm' in key or 'resampler' in key or 'embed_tokens' in key: + state_dict_vision[key] = values + self.model.save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + torch.save(state_dict_vision, f"{output_dir}/vpm_resampler_embedtokens.pt", ) + else: + self.model.save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + + if self.tokenizer is not None: + self.tokenizer.save_pretrained(output_dir) + + # Good practice: save your training arguments together with the trained model + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) diff --git a/MiniCPM-2-V/vlm/configuration_minicpm.py b/MiniCPM-2-V/vlm/configuration_minicpm.py new file mode 100644 index 00000000..9fbee0f0 --- /dev/null +++ b/MiniCPM-2-V/vlm/configuration_minicpm.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" MiniCPM model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +MINICPM_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +class MiniCPMConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`MiniCPMModel`]. It is used to instantiate an MiniCPM + model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the + defaults will yield a similar configuration to that of the MiniCPM-7B. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the MiniCPM model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`MiniCPMModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. MiniCPM 1 supports up to 2048 tokens, + MiniCPM 2 up to 4096, CodeMiniCPM up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://hf-mirror.com/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. 
The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/LocalMiniCPM/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + ```python + >>> from transformers import MiniCPMModel, MiniCPMConfig + >>> # Initializing a MiniCPM minicpm-7b style configuration + >>> configuration = MiniCPMConfig() + >>> # Initializing a model from the minicpm-7b style configuration + >>> model = MiniCPMModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "minicpm" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + scale_emb=1, + dim_model_base=1, + scale_depth=1, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.scale_emb = scale_emb + self.dim_model_base = dim_model_base + self.scale_depth = scale_depth + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. 
+ """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if ( + rope_scaling_factor is None + or not isinstance(rope_scaling_factor, float) + or rope_scaling_factor <= 1.0 + ): + raise ValueError( + f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}" + ) + + +class MiniCPMVConfig(MiniCPMConfig): + model_type = "minicpmv" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vision_encoder="vit_so400m_patch14_siglip_384.webli", + query_num=64, + image_size=448, + drop_vision_last_layer=True, + slice_mode=True, + patch_size=14, + max_slice_nums=9, + scale_resolution=448, + im_start_token_id=101, + im_end_token_id=102, + slice_start_token_id=111, + slice_end_token_id=112, + unk_token_id=0, + **kwargs, + ): + self.vision_encoder = vision_encoder + self.query_num = query_num + self.image_size = image_size + self.drop_vision_last_layer = drop_vision_last_layer + self.slice_mode = slice_mode + self.patch_size = patch_size + self.max_slice_nums = max_slice_nums + self.scale_resolution = scale_resolution + self.im_start_token_id = im_start_token_id + self.im_end_token_id = im_end_token_id + self.slice_start_token_id = slice_start_token_id + self.slice_end_token_id = slice_end_token_id + self.unk_token_id = unk_token_id + super().__init__(**kwargs) diff --git a/MiniCPM-2-V/vlm/modeling_minicpm.py b/MiniCPM-2-V/vlm/modeling_minicpm.py new file mode 100644 index 00000000..002df2a6 --- /dev/null +++ b/MiniCPM-2-V/vlm/modeling_minicpm.py @@ -0,0 +1,1697 @@ +# coding=utf-8 +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch MiniCPM model.""" +import math +import re +import warnings +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.cache_utils import Cache, DynamicCache +from transformers.modeling_attn_mask_utils import ( + AttentionMaskConverter, + _prepare_4d_attention_mask, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast, +) +from transformers.modeling_utils import PreTrainedModel +from transformers.pytorch_utils import ( + ALL_LAYERNORM_LAYERS, + is_torch_greater_or_equal_than_1_13, +) +from transformers.utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from transformers.utils.import_utils import is_torch_fx_available + +from .configuration_minicpm import MiniCPMConfig + +try: + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa +except: + pass + +# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph. +# It means that the function will not be traced through and simply appear as a node in the graph. +if is_torch_fx_available(): + if not is_torch_greater_or_equal_than_1_13: + import torch.fx + + _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask) + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "MiniCPMConfig" + + +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad( + torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0) + ) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + warnings.warn( + "Calling `transformers.models.minicpm.modeling_minicpm._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask" + ) + return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) + + +def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, +): + warnings.warn( + "Calling `transformers.models.minicpm.modeling_minicpm._make_causal_mask` is deprecated and will be removed in v4.37. 
Use `transformers.models.minicpm.modeling_minicpm.AttentionMaskConverter._make_causal_mask" + ) + return AttentionMaskConverter._make_causal_mask( + input_ids_shape=input_ids_shape, + dtype=dtype, + device=device, + past_key_values_length=past_key_values_length, + ) + + +# @torch.jit.script # type: ignore +def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float): + old_dtype = hidden.dtype + variance = hidden.to(torch.float32).pow(2).mean(dim=-1, keepdim=True) + hidden = (hidden * torch.rsqrt(variance + eps)).to(old_dtype) + return hidden * weight + + +class MiniCPMRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + MiniCPMRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + return rms_layernorm(hidden_states, self.weight, self.variance_epsilon) + + +ALL_LAYERNORM_LAYERS.append(MiniCPMRMSNorm) + + +class MiniCPMRotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. + self._set_cos_sin_cache( + # seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + seq_len=max_position_embeddings, + device=self.inv_freq.device, + dtype=torch.float32, + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding): + """MiniCPMRotaryEmbedding extended with linear scaling. 
Credits to the Reddit user /u/kaiokendev""" + + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + ): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + t = t / self.scaling_factor + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +class MiniCPMDynamicNTKScalingRotaryEmbedding(MiniCPMRotaryEmbedding): + """MiniCPMRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla""" + + def __init__( + self, + dim, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + ): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) + - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / ( + base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + t = torch.arange( + self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype + ) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + + self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. 
+ Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + # cos = cos[position_ids].unsqueeze(unsqueeze_dim) + # sin = sin[position_ids].unsqueeze(unsqueeze_dim) + # q_embed = (q * cos) + (rotate_half(q) * sin) + # k_embed = (k * cos) + (rotate_half(k) * sin) + orig_dtype = k.dtype + cos = cos[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(unsqueeze_dim) # [bs, 1, seq_len, dim] + q_fp32 = q.to(dtype=torch.float32, device=q.device) + k_fp32 = k.to(dtype=torch.float32, device=k.device) + q_embed = (q_fp32 * cos) + (rotate_half(q_fp32) * sin) + k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin) + return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype) + + +class MiniCPMMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + if self.config.pretraining_tp > 1: + slice = self.intermediate_size // self.config.pretraining_tp + gate_proj_slices = self.gate_proj.weight.split(slice, dim=0) + up_proj_slices = self.up_proj.weight.split(slice, dim=0) + down_proj_slices = self.down_proj.weight.split(slice, dim=1) + + gate_proj = torch.cat( + [ + F.linear(x, gate_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ], + dim=-1, + ) + up_proj = torch.cat( + [ + F.linear(x, up_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ], + dim=-1, + ) + + intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2) + down_proj = [ + F.linear(intermediate_states[i], down_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ] + down_proj = sum(down_proj) + else: + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + return down_proj + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand( + batch, num_key_value_heads, n_rep, slen, head_dim + ) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class MiniCPMAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: MiniCPMConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will " + "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}" + f" and `num_heads`: {self.num_heads})." + ) + + self.q_proj = nn.Linear( + self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=config.attention_bias, + ) + self.v_proj = nn.Linear( + self.hidden_size, + self.num_key_value_heads * self.head_dim, + bias=config.attention_bias, + ) + self.o_proj = nn.Linear( + self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias + ) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = MiniCPMRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling["type"] + scaling_factor = self.config.rope_scaling["factor"] + if scaling_type == "linear": + self.rotary_emb = MiniCPMLinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + elif scaling_type == "dynamic": + self.rotary_emb = MiniCPMDynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + scaling_factor=scaling_factor, + base=self.rope_theta, + ) + else: + raise ValueError(f"Unknown RoPE scaling type {scaling_type}") + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return ( + tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + .transpose(1, 2) + .contiguous() + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = ( + self.num_key_value_heads * self.head_dim + ) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [ + F.linear(hidden_states, query_slices[i]) + for i in range(self.config.pretraining_tp) + ] + query_states = torch.cat(query_states, dim=-1) + + key_states = [ + F.linear(hidden_states, key_slices[i]) + for i in range(self.config.pretraining_tp) + ] + key_states = torch.cat(key_states, dim=-1) + + value_states = [ + F.linear(hidden_states, value_slices[i]) + for i in range(self.config.pretraining_tp) + ] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul( + query_states, key_states.transpose(2, 3) + ) / math.sqrt(self.head_dim) + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax( + attn_weights, dim=-1, dtype=torch.float32 + ).to(query_states.dtype) + attn_weights = nn.functional.dropout( + attn_weights, p=self.attention_dropout, training=self.training + ) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 
2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split( + self.hidden_size // self.config.pretraining_tp, dim=2 + ) + o_proj_slices = self.o_proj.weight.split( + self.hidden_size // self.config.pretraining_tp, dim=1 + ) + attn_output = sum( + [ + F.linear(attn_output[i], o_proj_slices[i]) + for i in range(self.config.pretraining_tp) + ] + ) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class MiniCPMFlashAttention2(MiniCPMAttention): + """ + MiniCPM flash attention module. This module inherits from `MiniCPMAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # MiniCPMFlashAttention2 attention does not support output_attentions + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. 
Please make sure use `attention_mask` instead.`" + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop("padding_mask") + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states.to(torch.float32), seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (MiniCPMRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + # Handle the case where the model is quantized + if hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." 
+ ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=dropout_rate, + ) + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, + query_states, + key_states, + value_states, + attention_mask, + query_length, + dropout=0.0, + softmax_scale=None, + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in MiniCPMFlashAttention2 __init__. 
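+            # With flash_attn<2.1 the causal mask is top-left aligned, so causal masking is
+            # only kept for q_len > 1; the single-token decoding step (q_len == 1) would
+            # otherwise receive a wrongly aligned mask.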
+ causal = self.is_causal and query_length != 1 + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + ( + query_states, + key_states, + value_states, + indices_q, + cu_seq_lens, + max_seq_lens, + ) = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input( + attn_output_unpad, indices_q, batch_size, query_length + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + return attn_output + + def _upad_input( + self, query_layer, key_layer, value_layer, attention_mask, query_length + ): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), + indices_k, + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), + indices_k, + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( + query_layer, attention_mask + ) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +class MiniCPMSdpaAttention(MiniCPMAttention): + """ + MiniCPM attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `MiniCPMAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from MiniCPMAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MiniCPMModel is using MiniCPMSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. 
Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view( + bsz, q_len, self.num_heads, self.head_dim + ).transpose(1, 2) + key_states = key_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + value_states = value_states.view( + bsz, q_len, self.num_key_value_heads, self.head_dim + ).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb( + query_states, key_states, cos, sin, position_ids + ) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update( + key_states, value_states, self.layer_idx, cache_kwargs + ) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and attention_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1. 
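+            # When an explicit 4D `attention_mask` is supplied it already encodes causality,
+            # so `is_causal` is only enabled for the mask-free, multi-token (q_len > 1) case.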
+ is_causal=self.is_causal and attention_mask is None and q_len > 1, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +MINICPM_ATTENTION_CLASSES = { + "eager": MiniCPMAttention, + "flash_attention_2": MiniCPMFlashAttention2, + "sdpa": MiniCPMSdpaAttention, +} + + +class MiniCPMDecoderLayer(nn.Module): + def __init__(self, config: MiniCPMConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = MINICPM_ATTENTION_CLASSES[config._attn_implementation]( + config=config, layer_idx=layer_idx + ) + + self.mlp = MiniCPMMLP(config) + self.input_layernorm = MiniCPMRMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = MiniCPMRMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + self.scale_depth = config.scale_depth + self.num_hidden_layers = config.num_hidden_layers + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[ + torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]] + ]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + + hidden_states = residual + hidden_states * ( + self.scale_depth / math.sqrt(self.num_hidden_layers) + ) + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states * ( + self.scale_depth / math.sqrt(self.num_hidden_layers) + ) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +MINICPM_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + Parameters: + config ([`MiniCPMConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.", + MINICPM_START_DOCSTRING, +) +class MiniCPMPreTrainedModel(PreTrainedModel): + config_class = MiniCPMConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["MiniCPMDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +MINICPM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. 
This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + Two formats are allowed: + - a [`~cache_utils.Cache`] instance; + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare MiniCPM Model outputting raw hidden-states without any specific head on top.", + MINICPM_START_DOCSTRING, +) +class MiniCPMModel(MiniCPMPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`MiniCPMDecoderLayer`] + Args: + config: MiniCPMConfig + """ + + def __init__(self, config: MiniCPMConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding( + config.vocab_size, config.hidden_size, self.padding_idx + ) + self.layers = nn.ModuleList( + [ + MiniCPMDecoderLayer(config, layer_idx) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self._use_sdpa = config._attn_implementation == "sdpa" + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + self.norm = MiniCPMRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + past_key_values_length = 0 + if use_cache: + use_legacy_cache = not isinstance(past_key_values, Cache) + if use_legacy_cache: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_key_values_length = past_key_values.get_usable_length(seq_length) + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.config.scale_emb + + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = ( + attention_mask + if (attention_mask is not None and 0 in attention_mask) + else None + ) + elif self._use_sdpa and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, + ) + + # embed positions + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = None + if use_cache: + next_cache = ( + next_decoder_cache.to_legacy_cache() + if use_legacy_cache + else next_decoder_cache + ) + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] + if v is not None + ) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +class MiniCPMForCausalLM(MiniCPMPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = MiniCPMModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def 
get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC + ) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + Returns: + Example: + ```python + >>> from transformers import AutoTokenizer, MiniCPMForCausalLM + >>> model = MiniCPMForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.pretraining_tp > 1: + lm_head_slices = self.lm_head.weight.split( + self.vocab_size // self.config.pretraining_tp, dim=0 + ) + logits = [ + F.linear(hidden_states, lm_head_slices[i]) + for i in range(self.config.pretraining_tp) + ] + logits = torch.cat(logits, dim=-1) + else: + logits = self.lm_head( + hidden_states / (self.config.hidden_size / self.config.dim_model_base) + ) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as + # input) + if ( + attention_mask is not None + and attention_mask.shape[1] > input_ids.shape[1] + ): + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. 
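+            # Note: `get_max_length()` is only finite for size-bounded cache classes; the
+            # default DynamicCache reports None, so this crop is usually skipped.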
+ if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx.to(past_state.device)) + for past_state in layer_past + ), + ) + return reordered_past + + @torch.inference_mode() + def chat( + self, + tokenizer, + query: str, + history: List[Dict] = None, + role: str = "user", + max_length: int = 4096, + num_beams=1, + do_sample=True, + top_p=0.8, + temperature=0.3, + logits_processor=None, + **kwargs, + ): + if history is None: + history = [] + if logits_processor: + gen_kwargs = { + "max_length": max_length, + "num_beams": num_beams, + "do_sample": do_sample, + "top_p": top_p, + "temperature": temperature, + "logits_processor": logits_processor, + **kwargs, + } + else: + gen_kwargs = { + "max_length": max_length, + "num_beams": num_beams, + "do_sample": do_sample, + "top_p": top_p, + "temperature": temperature, + "logits_processor": logits_processor, + **kwargs, + } + + history.append({"role": role, "content": query}) + history_str = tokenizer.apply_chat_template( + history, tokenize=False, add_generation_prompt=False + ) + inputs = tokenizer(history_str, return_tensors="pt").to(self.device) + outputs = self.generate(**inputs, **gen_kwargs) + outputs = outputs.tolist()[0][len(inputs["input_ids"][0]) : -1] + response = tokenizer.decode(outputs) + pattern = re.compile(r".*?(?=|<用户>)", re.DOTALL) + matches = pattern.findall(response) + if len(matches) > 0: + response = matches[0] + history.append({"role": "assistant", "content": response}) + return response, history + + +@add_start_docstrings( + """ + The MiniCPM Model transformer with a sequence classification head on top (linear layer). + [`MiniCPMForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + MINICPM_START_DOCSTRING, +) +class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = MiniCPMModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + @add_start_docstrings_to_model_forward(MINICPM_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError( + "Cannot handle batch sizes > 1 if no padding token is defined." 
+ ) + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = ( + torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1 + ).to(logits.device) + else: + sequence_lengths = -1 + + pooled_logits = logits[ + torch.arange(batch_size, device=logits.device), sequence_lengths + ] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and ( + labels.dtype == torch.long or labels.dtype == torch.int + ): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct( + pooled_logits.view(-1, self.num_labels), labels.view(-1) + ) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/MiniCPM-2-V/vlm/modeling_minicpmv.py b/MiniCPM-2-V/vlm/modeling_minicpmv.py new file mode 100644 index 00000000..7f1b2c65 --- /dev/null +++ b/MiniCPM-2-V/vlm/modeling_minicpmv.py @@ -0,0 +1,590 @@ +import math +from typing import List, Optional +import json +import timm +import torch +import torchvision +from PIL import Image +from timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from torchvision import transforms +from transformers import LlamaTokenizer + +from .configuration_minicpm import MiniCPMVConfig +from .modeling_minicpm import MiniCPMForCausalLM, MiniCPMPreTrainedModel +from .resampler import Resampler + + +class MiniCPMVPreTrainedModel(MiniCPMPreTrainedModel): + config_class = MiniCPMVConfig + + +class MiniCPMV(MiniCPMVPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.llm = MiniCPMForCausalLM(config) + self.vpm = self.init_vision_module() + self.vision_dim = self.vpm.embed_dim + self.embed_dim = self.llm.config.hidden_size + self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) + self.transform = self.init_transform() + + def init_vision_module(self): + model = timm.create_model( + self.config.vision_encoder, + pretrained=False, + num_classes=0, + dynamic_img_size=True, + dynamic_img_pad=True + ) + + if isinstance(model, timm.models.VisionTransformer): + if model.attn_pool is not None: + model.attn_pool = torch.nn.Identity() + + if self.config.drop_vision_last_layer: + model.blocks = model.blocks[:-1] + + return model + + def init_resampler(self, embed_dim, vision_dim): + return Resampler( + grid_size=int(math.sqrt(self.config.query_num)), + embed_dim=embed_dim, + num_heads=embed_dim // 128, + kv_dim=vision_dim, + adaptive=True + ) + + def init_transform(self): + return transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=IMAGENET_INCEPTION_MEAN, 
std=IMAGENET_INCEPTION_STD + ), + ] + ) + + def get_input_embeddings(self): + return self.llm.get_input_embeddings() + + def set_input_embeddings(self, value): + self.llm.embed_tokens = value + + def get_vision_embedding(self, pixel_values): + res = [] + dtype = self.vpm.pos_embed.data.dtype + for pixel_value in pixel_values: + H, W = pixel_value.shape[-2:] + tgt_size = ( + math.ceil(H / self.vpm.patch_embed.patch_size[0]), math.ceil(W / self.vpm.patch_embed.patch_size[0])) + vision_embedding = self.vpm.forward_features(pixel_value.unsqueeze(0).type(dtype)) + if hasattr(self.vpm, 'num_prefix_tokens') and self.vpm.num_prefix_tokens > 0: + vision_embedding = vision_embedding[:, self.vpm.num_prefix_tokens:] + res.append(self.resampler(vision_embedding, tgt_size)) + return torch.vstack(res) + + def get_vllm_embedding(self, data): + if "vision_hidden_states" not in data: + pixel_values_list = data["pixel_values"] + vision_hidden_states = [] + for pixel_values in pixel_values_list: + if len(pixel_values) > 0: + vision_hidden_states.append(self.get_vision_embedding(pixel_values)) + elif self.training: + dtype = self.vpm.pos_embed.data.dtype + device = self.vpm.pos_embed.data.device + dummy_image = torch.zeros( + (1, 3, 224, 224), device=device, dtype=dtype + ) + vision_hidden_states.append(self.get_vision_embedding(dummy_image)) + else: + vision_hidden_states.append([]) + + else: + vision_hidden_states = data["vision_hidden_states"] + + vllm_embedding = ( + self.llm.model.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb + ) + vision_hidden_states = [ + i.type(vllm_embedding.dtype) if isinstance(i, torch.Tensor) else i + for i in vision_hidden_states + ] + + bs = len(data["input_ids"]) + for i in range(bs): + cur_vs_hs = vision_hidden_states[i] + if len(cur_vs_hs) > 0: + cur_vllm_emb = vllm_embedding[i] + cur_image_bound = data["image_bound"][i] + if len(cur_image_bound) > 0: + image_indices = torch.stack( + [ + torch.arange(r[0], r[1], dtype=torch.long) + for r in cur_image_bound + ] + ).to(vllm_embedding.device) + + cur_vllm_emb.scatter_( + 0, + image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]), + cur_vs_hs.view(-1, cur_vs_hs.shape[-1]), + ) + elif self.training: + cur_vllm_emb += cur_vs_hs[0].mean() * 0 + + return vllm_embedding, vision_hidden_states + + def forward(self, data, **kwargs): + vllm_embedding, vision_hidden_states = self.get_vllm_embedding(data) + position_ids = data["position_ids"] + if position_ids.dtype != torch.int64: + position_ids = position_ids.long() + + return self.llm( + input_ids=None, + position_ids=position_ids, + inputs_embeds=vllm_embedding, + **kwargs + ) + + def _convert_to_tensors( + self, tokenizer, input_str, max_inp_length: Optional[int] = None + ): + if tokenizer.add_bos_token: + input_ids = tokenizer.encode(input_str) + else: + input_ids = [tokenizer.bos_id] + tokenizer.encode(input_str) + if max_inp_length is not None: + input_ids = input_ids[:max_inp_length] + input_ids = torch.tensor(input_ids, dtype=torch.int32) + + image_start_tokens = torch.where(input_ids == tokenizer.im_start_id)[0] + # 跳过 im_start + image_start_tokens += 1 + image_end_tokens = torch.where(input_ids == tokenizer.im_end_id)[0] + valid_image_nums = max(len(image_start_tokens), len(image_end_tokens)) + image_bound = torch.hstack( + [ + image_start_tokens[:valid_image_nums].unsqueeze(-1), + image_end_tokens[:valid_image_nums].unsqueeze(-1), + ] + ) + + model_input = {} + model_input["input_ids"] = input_ids.unsqueeze(0).to(self.device) + model_input["image_bound"] = 
image_bound + + return model_input + + def _process_list( + self, tokenizer, data_list: List[str], max_inp_length: Optional[int] = None + ): + pad_keys = ["input_ids"] + input_tensors = [] + for data in data_list: + input_tensors.append( + self._convert_to_tensors(tokenizer, data, max_inp_length) + ) + padded = {} + for key in pad_keys: + padded[key] = pad(input_tensors, key, padding_side="left").to(self.device) + padded["image_bound"] = [i["image_bound"] for i in input_tensors] + return padded + + def _decode(self, inputs_embeds, tokenizer, **kwargs): + output = self.llm.generate( + inputs_embeds=inputs_embeds, + pad_token_id=0, + eos_token_id=tokenizer.eos_token_id, + **kwargs + ) + return self._decode_text(output, tokenizer) + + def _decode_text(self, result_ids, tokenizer): + result_text = [] + for result in result_ids: + result = result[result != 0] + if result[0] == tokenizer.bos_id: + result = result[1:] + if result[-1] == tokenizer.eos_id: + result = result[:-1] + result_text.append(tokenizer.decode(result).strip()) + return result_text + + def slice_image(self, image): + return slice_image( + image, + self.config.max_slice_nums, + self.config.scale_resolution, + self.config.patch_size, + ) + + def get_slice_image_placeholder(self, image, tokenizer): + image_placeholder = ( + tokenizer.im_start + + tokenizer.unk_token * self.config.query_num + + tokenizer.im_end + ) + + slice_images = [] + + source_image, patches, best_grid = slice_image( + image, + self.config.max_slice_nums, + self.config.scale_resolution, + self.config.patch_size, + ) + + slice_images.append(source_image) + final_placeholder = image_placeholder + + if len(patches) > 0: + for i in range(len(patches)): + for j in range(len(patches[0])): + slice_images.append(patches[i][j]) + + final_placeholder += get_grid_placeholder( + tokenizer, best_grid, self.config.query_num + ) + + return slice_images, final_placeholder + + def generate( + self, + data_list=None, + img_list=None, + tokenizer=None, + max_inp_length: Optional[int] = None, + vision_hidden_states=None, + return_vision_hidden_states=False, + **kwargs + ): + + assert data_list is not None + bs = len(data_list) + if img_list == None: + img_list = [[] for i in range(bs)] + assert bs == len(img_list) + + model_inputs = self._process_list(tokenizer, data_list, max_inp_length) + + if vision_hidden_states is None: + pixel_values = [] + for i in range(bs): + img_inps = [] + for img in img_list[i]: + img_inps.append(self.transform(img).to(self.device)) + if img_inps: + pixel_values.append(img_inps) + else: + pixel_values.append([]) + model_inputs["pixel_values"] = pixel_values + else: + model_inputs["vision_hidden_states"] = vision_hidden_states + + with torch.inference_mode(): + ( + model_inputs["inputs_embeds"], + vision_hidden_states, + ) = self.get_vllm_embedding(model_inputs) + + result = self._decode(model_inputs["inputs_embeds"], tokenizer, **kwargs) + + if return_vision_hidden_states: + return result, vision_hidden_states + + return result + + def chat( + self, + image, + msgs, + context, + tokenizer, + vision_hidden_states=None, + max_new_tokens=1024, + sampling=True, + max_inp_length=2048, + **kwargs + ): + if isinstance(msgs, str): + msgs = json.loads(msgs) + # msgs to prompt + prompt = "" + for i, msg in enumerate(msgs): + role = msg["role"] + content = msg["content"] + assert role in ["user", "assistant"] + if i == 0: + assert role == "user", "The role of first msg should be user" + if self.config.slice_mode: + images, final_placeholder = 
self.get_slice_image_placeholder( + image, tokenizer + ) + content = final_placeholder + "\n" + content + else: + images = [image] + content = ( + tokenizer.im_start + + tokenizer.unk_token * self.config.query_num + + tokenizer.im_end + + "\n" + + content + ) + prompt += "<用户>" if role == "user" else "" + prompt += content + prompt += "" + final_input = prompt + + if sampling: + generation_config = { + "top_p": 0.8, + "top_k": 100, + "temperature": 0.7, + "do_sample": True, + "repetition_penalty": 1.05 + } + else: + generation_config = { + "num_beams": 3, + "repetition_penalty": 1.2, + } + + generation_config.update( + (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys() + ) + + with torch.inference_mode(): + res, vision_hidden_states = self.generate( + data_list=[final_input], + max_inp_length=max_inp_length, + img_list=[images], + tokenizer=tokenizer, + max_new_tokens=max_new_tokens, + vision_hidden_states=vision_hidden_states, + return_vision_hidden_states=True, + **generation_config + ) + answer = res[0] + context = msgs.copy() + context.append({"role": "assistant", "content": answer}) + + return answer, context, generation_config + + +class LlamaTokenizerWrapper(LlamaTokenizer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.im_start = "" + self.im_end = "" + self.ref_start = "" + self.ref_end = "" + self.box_start = "" + self.box_end = "" + self.quad_start = "" + self.quad_end = "" + self.point_start = "" + self.point_end = "" + self.slice_start = "" + self.slice_end = "" + + @property + def eos_id(self): + return self.sp_model.eos_id() + + @property + def bos_id(self): + return self.sp_model.bos_id() + + @property + def unk_id(self): + return self.sp_model.unk_id() + + @property + def im_start_id(self): + return self._convert_token_to_id(self.im_start) + + @property + def im_end_id(self): + return self._convert_token_to_id(self.im_end) + + +def pad(orig_items, key, max_length=None, padding_value=0, padding_side="left"): + items = [] + if isinstance(orig_items[0][key], list): + assert isinstance(orig_items[0][key][0], torch.Tensor) + for it in orig_items: + for tr in it[key]: + items.append({key: tr}) + else: + assert isinstance(orig_items[0][key], torch.Tensor) + items = orig_items + + batch_size = len(items) + shape = items[0][key].shape + dim = len(shape) + assert dim <= 3 + if max_length is None: + max_length = 0 + max_length = max(max_length, max(item[key].shape[-1] for item in items)) + min_length = min(item[key].shape[-1] for item in items) + dtype = items[0][key].dtype + + if dim == 1: + return torch.cat([item[key] for item in items], dim=0) + elif dim == 2: + if max_length == min_length: + return torch.cat([item[key] for item in items], dim=0) + tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value + else: + tensor = ( + torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + + padding_value + ) + + for i, item in enumerate(items): + if dim == 2: + if padding_side == "left": + tensor[i, -len(item[key][0]) :] = item[key][0].clone() + else: + tensor[i, : len(item[key][0])] = item[key][0].clone() + elif dim == 3: + if padding_side == "left": + tensor[i, -len(item[key][0]) :, :] = item[key][0].clone() + else: + tensor[i, : len(item[key][0]), :] = item[key][0].clone() + + return tensor + + +def slice_image( + image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False +): + original_size = image.size + original_width, original_height = original_size + log_ratio = math.log(original_width / 
original_height) + ratio = original_width * original_height / (scale_resolution * scale_resolution) + multiple = min(math.ceil(ratio), max_slice_nums) + + source_image = None + best_grid = None + patches = [] + + if multiple <= 1 or never_split: + # dont need to slice, upsample + best_size = find_best_resize( + original_size, scale_resolution, patch_size, allow_upscale=True + ) + source_image = image.resize(best_size, Image.Resampling.BICUBIC) + else: + candidate_split_grids_nums = [] + for i in [multiple - 1, multiple, multiple + 1]: + if i == 1 or i > max_slice_nums: + continue + candidate_split_grids_nums.append(i) + + # source image, down-sampling and ensure divided by patch_size + best_resize = find_best_resize(original_size, scale_resolution, patch_size) + source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC) + candidate_grids = [] + + # find best grid + for split_grids_nums in candidate_split_grids_nums: + m = 1 + while m <= split_grids_nums: + if split_grids_nums % m == 0: + candidate_grids.append([m, split_grids_nums // m]) + m += 1 + + best_grid = [1, 1] + min_error = float("inf") + for grid in candidate_grids: + error = abs(log_ratio - math.log(grid[0] / grid[1])) + if error < min_error: + best_grid = grid + min_error = error + + refine_size = get_refine_size( + original_size, best_grid, scale_resolution, patch_size, allow_upscale=True + ) + + refine_image = image.resize(refine_size, Image.Resampling.BICUBIC) + patches = split_to_patches(refine_image, best_grid) + + return source_image, patches, best_grid + + +def ensure_divide(length, patch_size): + return max(round(length / patch_size) * patch_size, patch_size) + + +def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False): + width, height = original_size + if (width * height > scale_resolution * scale_resolution) or allow_upscale: + r = width / height + height = int(scale_resolution / math.sqrt(r)) + width = int(height * r) + best_width = ensure_divide(width, patch_size) + best_height = ensure_divide(height, patch_size) + return (best_width, best_height) + + +def get_refine_size( + original_size, grid, scale_resolution, patch_size, allow_upscale=False +): + width, height = original_size + grid_x, grid_y = grid + + refine_width = ensure_divide(width, grid_x) + refine_height = ensure_divide(height, grid_y) + + grid_width = refine_width / grid_x + grid_height = refine_height / grid_y + + best_grid_size = find_best_resize( + (grid_width, grid_height), + scale_resolution, + patch_size, + allow_upscale=allow_upscale, + ) + + refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y) + + return refine_size + + +def split_to_patches(image, grid): + patches = [] + width, height = image.size + grid_x = int(width / grid[0]) + grid_y = int(height / grid[1]) + + for i in range(0, height, grid_y): + images = [] + for j in range(0, width, grid_x): + box = (j, i, j + grid_x, i + grid_y) + patch = image.crop(box) + images.append(patch) + patches.append(images) + + return patches + + +def get_grid_placeholder(tokenizer, grid, query_num): + image_placeholder = ( + tokenizer.im_start + tokenizer.unk_token * query_num + tokenizer.im_end + ) + + cols = grid[0] + rows = grid[1] + slices = [] + for i in range(rows): + lines = [] + for j in range(cols): + lines.append(image_placeholder) + slices.append("".join(lines)) + slice_placeholder = tokenizer.slice_start + "\n".join(slices) + tokenizer.slice_end + return slice_placeholder diff --git a/MiniCPM-2-V/vlm/resampler.py 
b/MiniCPM-2-V/vlm/resampler.py new file mode 100644 index 00000000..5888ecdf --- /dev/null +++ b/MiniCPM-2-V/vlm/resampler.py @@ -0,0 +1,170 @@ +# Copyright (c) Alibaba Cloud. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from collections import OrderedDict +import math +import requests +from io import BytesIO +from functools import partial +from PIL import Image +from typing import Callable, Optional, Sequence, Tuple, List, Union +import numpy as np + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.init import trunc_normal_ +from torchvision import transforms +from torchvision.transforms import InterpolationMode + +def get_abs_pos(abs_pos, tgt_size): + # abs_pos: L, C + # tgt_size: (H, W) + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + # tgt_size = int(math.sqrt(tgt_size)) + dtype = abs_pos.dtype + + return F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size[0], tgt_size[1]), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype) + + +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2. + omega = 1. 
/ 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class Resampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + + def __init__( + self, + grid_size, + embed_dim, + num_heads, + kv_dim=None, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + adaptive=False + ): + super().__init__() + self.num_queries = grid_size ** 2 + self.embed_dim = embed_dim + self.num_heads = num_heads + self.adaptive = adaptive + + self.pos_embed = nn.Parameter( + torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float() + ).requires_grad_(False) + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=.02) + + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False) + else: + self.kv_proj = nn.Identity() + + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + + self.ln_post = norm_layer(embed_dim) + self.proj = nn.Parameter((embed_dim ** -0.5) * torch.randn(embed_dim, embed_dim)) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x, tgt_size=None, attn_mask=None): + if self.adaptive: + pos_embed = torch.Tensor(get_2d_sincos_pos_embed(self.embed_dim, tgt_size)).float().to(device=x.device, dtype=x.dtype) + else: + pos_embed = get_abs_pos(self.pos_embed, tgt_size) + + x = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn( + self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask)[0] + x = out.permute(1, 0, 2) + + x = self.ln_post(x) + x = x @ self.proj + return x + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1)
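+
+# Usage note: `Resampler` maps a variable-length sequence of ViT patch features (of width
+# `kv_dim`) onto a fixed set of `grid_size**2` learned queries through a single cross-attention
+# layer with 2D sin-cos position embeddings. MiniCPMV.get_vision_embedding() calls it with
+# `tgt_size` set to the per-image patch-grid (height, width), and the resulting `query_num`
+# token embeddings are scattered into the LLM input sequence at the positions recorded in
+# `image_bound` (see MiniCPMV.get_vllm_embedding in modeling_minicpmv.py).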