
# How to generate text from vLLM just like a local model

If the model is not running yet, first see the vLLM Reference for how to create an inference endpoint.
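
As a quick sanity check that the endpoint is up, you can POST a tiny prompt to it first. This is a minimal sketch; it assumes the server exposes the `/generate` route on `localhost:8000`, as in the examples below, and the printed output shape is illustrative:

```python
import requests

# Smoke test: send a tiny prompt to the running /generate endpoint.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "ping", "max_tokens": 1},
    timeout=10,
)
resp.raise_for_status()
print(resp.json())  # e.g. {'text': ['ping ...']}
```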

```python
from typing import Any, List, Mapping, Optional

from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM

from transformers import GenerationConfig
```

Load the generation config (Hugging Face) and flatten it into a plain dict of defaults to send to the endpoint:

```python
generation_config = GenerationConfig(
    return_full_text=True,
    task='text-generation',
    # stopping_criteria=stopping_criteria,
    temperature=0.05,
    top_p=0.6,
    top_k=5,
    max_new_tokens=2000,
    # repetition_penalty=1.1,
)

# Persist the config and reload it (round-trip through disk).
generation_config.save_pretrained("llama2_safe")
generation_config = GenerationConfig.from_pretrained("llama2_safe")

# Flatten the config into a plain dict of defaults, mapping
# Hugging Face's max_length onto vLLM's max_tokens.
default_config = generation_config.to_dict()
default_config['max_tokens'] = default_config['max_length']
```

vLLM raises an error on unknown sampling arguments, so we use a `SamplingParams` instance to filter the config down to keys it accepts.

```python
import json

import requests
from vllm import SamplingParams

# An empty SamplingParams instance tells us which keys vLLM accepts.
sp = SamplingParams()

def request_get(prompt, generation_args={}, **kwargs):
    url = 'http://localhost:8000/generate'

    # Drop any keys that vLLM's SamplingParams does not know about.
    generation_args = {k: v for k, v in generation_args.items() if k in sp.__dict__}

    # Later dicts win: kwargs override generation_args, which override the base values.
    data = {
        "prompt": prompt,
        "use_beam_search": False,
        "n": 1,
    } | generation_args | kwargs

    data_json = json.dumps(data)

    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, data=data_json, headers=headers)

    if response.status_code == 200:
        result = response.json()
        return result
    else:
        print(f"Request failed with status code {response.status_code}")
        print(response.text)

request_get('my name is', generation_args=default_config)
# produces: {'text': ['my name is john and i am a 35 year old man from the united states. i have']}
```

Next, define a custom LLM wrapper so LangChain can call the endpoint:

```python
class CustomLLM(LLM):
    model_name: str = 'custom'
    max_len: int = 8000
    config: dict = default_config

    @property
    def _llm_type(self) -> str:
        return "custom"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        if stop is not None:
            raise ValueError("stop kwargs are not permitted.")

        # Truncate the prompt to max_len characters and return the first completion.
        return request_get(prompt[: self.max_len], generation_args=self.config, **kwargs)['text'][0]

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {"model_name": self.model_name}
```

Example use:

```python
my_llm = CustomLLM()

my_llm('I usually code in', temperature=1)
# "I usually code in Java, but I'm interested in learning Go. Here are some resources I've found to"

my_llm('I usually code in', temperature=0.001)
# "I usually code in Python, but I'm interested in learning more about Rust and its ecosystem. Here"

# Per-call kwargs override the stored config (see the dict-union order in request_get).
my_llm('I usually code in', temperature=0.001, max_tokens=100)
# "I usually code in Python, but I'm interested in learning more about Rust and its ecosystem. Here are some resources I've found helpful:\n\n1. The Rust Programming Language: This is the official book on Rust, written by the language's creators. It covers the language's syntax, standard library, and best practices.\n2. Rust by Example: This book provides a gentle introduction to Rust, with a focus on practical examples and exercis"
```

## Bonus: Callbacks and Agents

The custom LLM also plugs into LangChain chains, callbacks, and agents. For example, wrap it in an `LLMChain` with a Llama-2 prompt template:

```python
from langchain import PromptTemplate, LLMChain

pr = """<s>[INST] <<SYS>>
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

<</SYS>>
Respond to user. User: {input} [/INST]"""

llm_chain = LLMChain(llm=my_llm, prompt=PromptTemplate.from_template(pr))

response = llm_chain.run(input='How are you?')
# "<s>[INST] <<SYS>>\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n<</SYS>>\nRespond to user. User: How are you? [/INST]  Hello! I'm doing well, thank you for asking! How about you?"
```