Adapting VQAScore to Longva #9

Open
qingfengcss opened this issue Jul 2, 2024 · 0 comments
qingfengcss commented Jul 2, 2024

I am having problems adapting VQAScore to LongVA. When I append "{answer}<|im_end|>\n" or just "answer" to the end of the prompt, the logits and the labels come out wrong. Could you help me get VQAScore working with LongVA?
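
For reference, my understanding is that VQAScore is the probability the model assigns to the answer (e.g. "yes") given the video and the question, read off the logits at the answer positions. Here is a minimal sketch of what I am trying to compute (the function name and tensor shapes are my own assumptions, not LongVA code):

import torch
import torch.nn.functional as F

IGNORE_INDEX = -100  # same masking convention as LongVA/LLaVA

def vqa_score_from_logits(logits, labels):
    # logits: (1, seq_len, vocab_size); labels: (1, seq_len) with IGNORE_INDEX
    # everywhere except the answer token(s)
    shift_logits = logits[:, :-1, :]   # logits at position t predict token t+1
    shift_labels = labels[:, 1:]
    log_probs = F.log_softmax(shift_logits.float(), dim=-1)
    mask = shift_labels.ne(IGNORE_INDEX)
    # gather log P of each answer token and average over the answer positions
    token_log_probs = log_probs.gather(-1, shift_labels.clamp(min=0).unsqueeze(-1)).squeeze(-1)
    answer_log_prob = (token_log_probs * mask).sum(-1) / mask.sum(-1)
    return answer_log_prob.exp()       # P(answer | video, question)

My current script is below: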

from longva.longva.model.builder import load_pretrained_model
from longva.longva.mm_utils import tokenizer_image_token, process_images
from longva.longva.constants import IMAGE_TOKEN_INDEX, IGNORE_INDEX
from PIL import Image
from decord import VideoReader, cpu
import torch
import numpy as np
import copy
#from longva.longva.conversation import Conversation


max_frame_num = 16
prompt = "<|im_start|>system\nYou're an expert in data annotation.<|im_end|>\n<|im_start|>user\n<image>\n{}\n <|im_end|>\n<|im_start|>assistant\n"
question_template = "Here is the caption:[{}], help me determine if the caption matches the content of the video, please answer yes or no"
answer_template = '{}'  # filled with the expected answer, e.g. 'yes'

prompt_question_template = "<|im_start|>system\nYou're an expert in data annotation.<|im_end|>\n<|im_start|>user\n<image>\n{}\n <|im_end|>\n<|im_start|>assistant\n"
prompt_answer_template = '{}<|im_end|>\n'
#prompt_answer_template = '{}'
def get_longva_QAscore_single_data(model, image_processor, tokenizer, videos_frames, captions, answers, device):
    gen_kwargs = {"do_sample": True, "temperature": 0.5, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
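    # note: gen_kwargs is only used by model.generate(); this function does a plain forward pass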
    vs = []
    for video_path in videos_frames:
        vr = VideoReader(video_path, ctx=cpu(0))
        total_frame_num = len(vr)
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frame_num, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        frames = vr.get_batch(frame_idx).asnumpy()
        vs.append(image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(model.device, dtype=torch.float16))
    modalities= ['video' for i in vs]
    # print(modalities)
    q = question_template.format(captions)
    a = answer_template.format(answers) 
    questions = prompt_question_template.format(q)
    answers = prompt_answer_template.format(a)
    prompts = questions + answers
    #prompts = questions
    input_ids = tokenizer_image_token(prompts, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0)  # (1, seq_len)
    labels = copy.deepcopy(input_ids)
    # mask everything up to the end of the question so the loss covers only the answer tokens
    tokenized_len = tokenizer_image_token(questions, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').shape[0]
    if questions[-1] == ' ':
        tokenized_len -= 1
    labels[:, :tokenized_len] = IGNORE_INDEX
    input_ids = input_ids[:, :tokenizer.model_max_length]
    labels = labels[:, :tokenizer.model_max_length]
    attention_mask = input_ids.ne(tokenizer.pad_token_id)
    
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
    position_ids = None
    past_key_values = None
    images = vs
    modalities = ['video']
    image_sizes= None
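    # My understanding of the LLaVA-style code: prepare_inputs_labels_for_multimodal swaps the
    # <image> placeholder for the encoded video features, builds inputs_embeds, and pads labels
    # with IGNORE_INDEX at the inserted positions, so the sequence length changes here.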
    (input_ids, position_ids, attention_mask, past_key_values, inputs_embeds, labels) = model.prepare_inputs_labels_for_multimodal(input_ids, position_ids, attention_mask, past_key_values, labels, images, modalities, image_sizes)
    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            images=images,
            image_sizes=image_sizes,
            modalities=modalities,
            dpo_forward=False,
            return_dict=True,
        )
    # print(outputs.keys())
    logits = outputs.logits
    loss = outputs.loss
    return loss
if __name__ == '__main__':
    model_path = '/home/css/t2v_metrics/LongVA-7B-DPO'
    image_path = "local_demo/assets/lmms-eval.png"
    video_path = "/home/css/LongVA/local_demo/assets/dc_demo.mp4"
    device = 'cuda'
    max_frames_num = 16 # you can change this to several thousand as long as your GPU memory can handle it :)
    # gen_kwargs = {"do_sample": True, "return_dict":True, "temperature": 0.5, "top_p": None, "num_beams": 1, "use_cache": True, "max_new_tokens": 1024}
    tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, "llava_qwen", device_map="cuda:0")
    videos = [video_path]
    descriptions = 'a man is dancing in a mountain'
    answers = 'yes'
    scores = get_longva_QAscore_single_data(model, image_processor, tokenizer, videos, descriptions, answers, device)
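    # My assumption: with the question tokens masked out, the returned loss is
    # -log P(answer | video, question), so exp(-loss) gives a VQAScore-style probability
    # for the single-token answer "yes".
    vqa_score = torch.exp(-scores)
    print("VQAScore:", vqa_score.item())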