I have problems adapting VQAScore to LongVA: when I append "{answer}<|im_end|>\n" (or just the raw answer) to the end of the prompt, the resulting logits and labels come out wrong. Could you help me get VQAScore working in LongVA?
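For reference, VQAScore needs the model to score only the answer tokens: the input is question + answer, and the labels mask the question prefix with IGNORE_INDEX so the loss covers just the answer span. A minimal sketch of that layout with a plain text-only tokenizer (the checkpoint name is illustrative, not LongVA's; LongVA's tokenizer_image_token behaves the same way along the token dimension):

import torch
from transformers import AutoTokenizer

IGNORE_INDEX = -100  # same sentinel LongVA/LLaVA use for masked label positions

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")  # illustrative choice
question = "Does the caption match the video? Please answer yes or no.\n"
answer = "yes"

q_ids = tokenizer(question, return_tensors="pt").input_ids            # shape (1, Lq)
qa_ids = tokenizer(question + answer, return_tensors="pt").input_ids  # shape (1, Lq + La)

labels = qa_ids.clone()
labels[:, :q_ids.shape[1]] = IGNORE_INDEX  # mask the question; keep only the answer span

Note the mask is applied along dim 1: input_ids and labels are (1, seq_len), so slicing dim 0 would silently mask nothing.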
from longva.longva.model.builder import load_pretrained_model
from longva.longva.mm_utils import tokenizer_image_token, process_images
from longva.longva.constants import IMAGE_TOKEN_INDEX, IGNORE_INDEX
from PIL import Image
from decord import VideoReader, cpu
import torch
import numpy as np
import copy
# from longva.longva.conversation import Conversation

max_frame_num = 16
prompt = "<|im_start|>system\nYou're an expert in data annotation.<|im_end|>\n<|im_start|>user\n<image>\n{}\n <|im_end|>\n<|im_start|>assistant\n"
question_template = "Here is the caption:[{}], help me determine if the caption matches the content of the video, please answer yes or no"
answer_template = 'yes'
prompt_question_template = "<|im_start|>system\nYou're an expert in data annotation.<|im_end|>\n<|im_start|>user\n<image>\n{}\n <|im_end|>\n<|im_start|>assistant\n"
prompt_answer_template = '{}<|im_end|>\n'
# prompt_answer_template = '{}'


def get_longva_QAscore_single_data(model, image_processor, tokenizer, videos_frames, captions, answers, device):
    gen_kwargs = {"do_sample": True, "temperature": 0.5, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
    vs = []
    for video_path in videos_frames:
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frame_num, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frames = vr.get_batch(frame_idx).asnumpy()
        vs.append(image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(model.device, dtype=torch.float16))
    modalities = ['video' for i in vs]
    # print(modalities)
    q = question_template.format(captions)
    a = answer_template.format(answers)  # answer_template has no placeholder, so this always yields 'yes'
    questions = prompt_question_template.format(q)
    answers = prompt_answer_template.format(a)
    prompts = questions + answers
    # prompts = questions
    input_ids = tokenizer_image_token(prompts, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)
    labels = copy.deepcopy(input_ids)
    # for label, qs in zip(labels, questions):
    # take the *sequence* length of the question prefix; len() on the
    # batched (1, L) tensor would return the batch size (1), not L
    tokenized_len = tokenizer_image_token(questions, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').shape[0]
    if questions[-1] == ' ':
        tokenized_len -= 1
    # input_ids / labels are (1, seq_len): mask and truncate along dim 1,
    # otherwise the slice hits the batch dimension and nothing is masked
    labels[:, :tokenized_len] = IGNORE_INDEX
    input_ids = input_ids[:, :tokenizer.model_max_length]
    labels = labels[:, :tokenizer.model_max_length]
    attention_mask = input_ids.ne(tokenizer.pad_token_id)
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    position_ids = None
    past_key_values = None
    images = vs
    modalities = ['video']
    image_sizes = None
    (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = model.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes)
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            images=images,
            image_sizes=image_sizes,
            modalities=modalities,
            dpo_forward=False,
            return_dict=True
        )
    # print(outputs.keys())
    logits = outputs.logits
    loss = outputs.loss
    return loss


if __name__ == '__main__':
    model_path = '/home/css/t2v_metrics/LongVA-7B-DPO'
    image_path = "local_demo/assets/lmms-eval.png"
    video_path = "/home/css/LongVA/local_demo/assets/dc_demo.mp4"
    device = 'cuda'
    max_frames_num = 16  # you can raise this to several thousand frames, as long as your GPU memory can handle it :)
    # gen_kwargs = {"do_sample": True, "return_dict": True, "temperature": 0.5, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "llava_qwen", device_map="cuda:0")
    videos = [video_path]
    descriptions = 'a man is dancing in a mountain'
    answers = 'yes'
    scores = get_longva_QAscore_single_data(model, image_processor, tokenizer, videos, descriptions, answers, device)
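If the end goal is the actual VQAScore, i.e. P(answer | video, question), note that the mean cross-entropy loss returned above only gives the per-token geometric mean of that probability (score = exp(-loss)). One way to get the exact answer probability is to read the gold-token log-probabilities from the shifted logits; here is a sketch, assuming logits and labels are aligned the way the forward pass above aligns them (answer_logprob is my name, not a LongVA API):

import torch
import torch.nn.functional as F

IGNORE_INDEX = -100

def answer_logprob(logits, labels):
    # the token at position t is predicted by the logits at position t - 1
    shift_logits = logits[:, :-1, :]
    shift_labels = labels[:, 1:]
    logprobs = F.log_softmax(shift_logits.float(), dim=-1)
    mask = shift_labels.ne(IGNORE_INDEX)       # True only on answer tokens
    gold = shift_labels.masked_fill(~mask, 0)  # placeholder ids so gather is safe
    token_logprobs = logprobs.gather(-1, gold.unsqueeze(-1)).squeeze(-1)
    return (token_logprobs * mask).sum(dim=-1)  # log P(answer | prompt), per example

# usage: score = answer_logprob(outputs.logits, labels).exp()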