LLaMA-Factory/src/api_demo.py

# coding=utf-8
# Implements API for fine-tuned models in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python api_demo.py --model_name_or_path path_to_model --checkpoint_dir path_to_checkpoint
# Visit http://localhost:8000/docs for the API documentation.


import time
import torch
import uvicorn
from threading import Thread
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from transformers import TextIteratorStreamer
from sse_starlette import EventSourceResponse
from typing import Any, Dict, List, Literal, Optional, Union

from utils import (
    Template,
    load_pretrained,
    prepare_infer_args,
    get_logits_processor
)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan context for the application that frees GPU memory afterwards.

    Nothing is set up before yielding. After the context is resumed, cached
    CUDA memory is released, but only when a CUDA device is available.
    """
    yield
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()


# ASGI application object; `lifespan` is registered as its lifespan handler.
app = FastAPI(lifespan=lifespan)


# Install CORS middleware with wildcard origins, methods and headers, and
# with credentials allowed.
# NOTE(review): this is a very permissive CORS policy -- presumably fine for a
# local demo; confirm or tighten it before exposing the server more widely.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    # One model entry, as returned inside the /v1/models listing.
    # Only `id` is required; every other field has a default.
    id: str
    object: str = "model"
    # Unix timestamp in whole seconds, taken when the card is instantiated.
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    # Response body of the /v1/models endpoint: a list wrapper around model cards.
    object: str = "list"
    # Model cards contained in the listing; empty unless provided.
    data: List[ModelCard] = []


class ChatMessage(BaseModel):
    # A complete chat message: who said it and what was said.
    # Used both for incoming request messages and for the non-streaming reply.
    role: Literal["user", "assistant", "system"]
    content: str


class DeltaMessage(BaseModel):
    # Partial message carried by a streaming chunk. Both fields are optional,
    # so a chunk can carry only the role, only a piece of content, or nothing.
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    # Request body accepted by POST /v1/chat/completions.
    # Model name supplied by the client; the handler copies it into the response.
    model: str
    # Conversation so far; the handler requires the last entry to be a user message.
    messages: List[ChatMessage]
    # Optional sampling overrides; when left as None the handler keeps the
    # values from the server's generating arguments.
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    # Optional generation length limits. When both are given, the handler
    # applies `max_new_tokens` last, so it takes precedence over `max_length`.
    max_length: Optional[int] = None
    max_new_tokens: Optional[int] = None
    # When true, the reply is sent as a server-sent event stream of chunks.
    stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
    # A single choice in a non-streaming completion response.
    index: int
    # The full generated message.
    message: ChatMessage
    finish_reason: Literal["stop", "length"]


class ChatCompletionResponseStreamChoice(BaseModel):
    # A single choice in a streaming chunk.
    index: int
    # Incremental part of the message carried by this chunk.
    delta: DeltaMessage
    # None on intermediate chunks; set on the chunk that ends the stream.
    # NOTE(review): declared Optional without an explicit default -- whether the
    # field may be omitted depends on the pydantic major version; confirm.
    finish_reason: Optional[Literal["stop", "length"]]


class ChatCompletionResponse(BaseModel):
    # Envelope used for both the full completion response and streaming chunks;
    # `object` tells the two apart.
    model: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    # Full choices for "chat.completion", stream choices for "chat.completion.chunk".
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    # Unix timestamp in whole seconds, taken when the response object is instantiated.
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))


@app.get("/v1/models", response_model=ModelList)
async def list_models():
    """Return the model listing, which holds a single fixed card with id "gpt-3.5-turbo"."""
    return ModelList(data=[ModelCard(id="gpt-3.5-turbo")])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    """Generate a chat completion for an OpenAI-style request.

    The last message must come from the user and is used as the query. A
    leading system message, if present, replaces the default source prefix.
    The remaining messages are folded into ``[user, assistant]`` history
    pairs; if their count is odd no history is used, and pairs that are not
    in user/assistant order are skipped.

    Relies on the module-level ``model``, ``tokenizer``, ``prompt_template``,
    ``source_prefix`` and ``generating_args`` set up by the ``__main__`` block.

    Args:
        request: Parsed request body.

    Returns:
        An ``EventSourceResponse`` streaming completion chunks when
        ``request.stream`` is truthy, otherwise a ``ChatCompletionResponse``
        holding a single choice.

    Raises:
        HTTPException: Status 400 when ``messages`` is empty or its last
            entry is not a user message.
    """
    # An empty conversation is rejected here; indexing messages[-1] on it
    # would otherwise raise IndexError and surface as a server error.
    if not request.messages or request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")
    query = request.messages[-1].content

    # The slice is a new list, so popping from it leaves the request untouched.
    prev_messages = request.messages[:-1]
    if prev_messages and prev_messages[0].role == "system":
        prefix = prev_messages.pop(0).content
    else:
        prefix = source_prefix

    history = []
    if len(prev_messages) % 2 == 0:
        # Walk the earlier turns two at a time: (user, assistant), (user, assistant), ...
        for user_msg, assistant_msg in zip(prev_messages[0::2], prev_messages[1::2]):
            if user_msg.role == "user" and assistant_msg.role == "assistant":
                history.append([user_msg.content, assistant_msg.content])

    inputs = tokenizer([prompt_template.get_prompt(query, history, prefix)], return_tensors="pt")
    inputs = inputs.to(model.device)

    gen_kwargs = generating_args.to_dict()
    gen_kwargs.update({
        "input_ids": inputs["input_ids"],
        # A falsy request value (None or 0) keeps the configured default.
        "temperature": request.temperature if request.temperature else gen_kwargs["temperature"],
        "top_p": request.top_p if request.top_p else gen_kwargs["top_p"],
        "logits_processor": get_logits_processor()
    })
    # Only one length limit is kept in gen_kwargs; max_new_tokens is handled
    # last, so it wins when the request carries both.
    if request.max_length:
        gen_kwargs.pop("max_new_tokens", None)
        gen_kwargs["max_length"] = request.max_length
    if request.max_new_tokens:
        gen_kwargs.pop("max_length", None)
        gen_kwargs["max_new_tokens"] = request.max_new_tokens

    if request.stream:
        generate = predict(gen_kwargs, request.model)
        return EventSourceResponse(generate, media_type="text/event-stream")

    # NOTE(review): generate() is called synchronously inside this coroutine,
    # so other requests wait until it returns.
    generation_output = model.generate(**gen_kwargs)
    # Drop the prompt tokens so that only the newly generated part is decoded.
    outputs = generation_output.tolist()[0][len(inputs["input_ids"][0]):]
    response = tokenizer.decode(outputs, skip_special_tokens=True)

    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )

    return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion")


async def predict(gen_kwargs: Dict[str, Any], model_id: str):
    """Stream a generation as JSON-encoded chat-completion chunks.

    Generation runs in a worker thread that feeds a ``TextIteratorStreamer``.
    The generator yields, in order: one chunk announcing the assistant role,
    one chunk per non-empty piece of text, one chunk with
    ``finish_reason="stop"``, and finally the literal string ``"[DONE]"``.

    Args:
        gen_kwargs: Keyword arguments for ``model.generate``; a ``"streamer"``
            entry is added to this dict in place.
        model_id: Model name written into every chunk.

    Yields:
        JSON strings for the chunks, then ``"[DONE]"``.
    """
    def _encode_chunk(finish_reason, **delta_fields):
        # Only the delta fields passed here are explicitly set on the
        # DeltaMessage, which is what `exclude_unset` keys off when serializing.
        choice = ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(**delta_fields),
            finish_reason=finish_reason
        )
        payload = ChatCompletionResponse(model=model_id, choices=[choice], object="chat.completion.chunk")
        return payload.json(exclude_unset=True, ensure_ascii=False)

    token_stream = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs["streamer"] = token_stream

    # Run generation in the background; this generator consumes its output
    # through the streamer.
    worker = Thread(target=model.generate, kwargs=gen_kwargs)
    worker.start()

    # Opening chunk: role only.
    yield _encode_chunk(None, role="assistant")

    # Content chunks: empty pieces are not forwarded.
    for piece in token_stream:
        if piece:
            yield _encode_chunk(None, content=piece)

    # Closing chunk: empty delta with the finish reason, then the end marker.
    yield _encode_chunk("stop")
    yield "[DONE]"


if __name__ == "__main__":
    # Parse the inference arguments and load the model and tokenizer from them.
    model_args, data_args, finetuning_args, generating_args = prepare_infer_args()
    model, tokenizer = load_pretrained(model_args, finetuning_args)

    # Module-level state read by the request handlers: the prompt template and
    # the default prefix (empty string when none is configured).
    prompt_template = Template(data_args.prompt_template)
    source_prefix = data_args.source_prefix or ""

    # Serve the app on all interfaces, port 8000, with a single worker.
    uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)
add API demo from #1 2023-06-05 13:32:18 +00:00			`# coding=utf-8`
update readme 2023-06-22 16:17:05 +00:00			`# Implements API for fine-tuned models in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)`
add API demo from #1 2023-06-05 13:32:18 +00:00			`# Usage: python api_demo.py --model_name_or_path path_to_model --checkpoint_dir path_to_checkpoint`
update API 2023-06-22 12:46:24 +00:00			`# Visit http://localhost:8000/docs for document.`
add API demo from #1 2023-06-05 13:32:18 +00:00

match api with OpenAI format 2023-06-22 12:27:00 +00:00			`import time`
add API demo from #1 2023-06-05 13:32:18 +00:00			`import torch`
			`import uvicorn`
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00			`from threading import Thread`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`from pydantic import BaseModel, Field`
update readme 2023-06-22 16:17:05 +00:00			`from fastapi import FastAPI, HTTPException`
fix typo 2023-06-30 02:09:59 +00:00			`from fastapi.middleware.cors import CORSMiddleware`
update readme 2023-06-22 16:17:05 +00:00			`from contextlib import asynccontextmanager`
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00			`from transformers import TextIteratorStreamer`
fix streaming response in API 2023-07-05 14:42:31 +00:00			`from sse_starlette import EventSourceResponse`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`from typing import Any, Dict, List, Literal, Optional, Union`
add API demo from #1 2023-06-05 13:32:18 +00:00
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`from utils import (`
			`Template,`
			`load_pretrained,`
			`prepare_infer_args,`
			`get_logits_processor`
			`)`
add API demo from #1 2023-06-05 13:32:18 +00:00

match api with OpenAI format 2023-06-22 12:27:00 +00:00			`@asynccontextmanager`
			`async def lifespan(app: FastAPI): # collects GPU memory`
			`yield`
tiny fix 2023-06-07 08:02:07 +00:00			`if torch.cuda.is_available():`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`torch.cuda.empty_cache()`
			`torch.cuda.ipc_collect()`
add API demo from #1 2023-06-05 13:32:18 +00:00

match api with OpenAI format 2023-06-22 12:27:00 +00:00			`app = FastAPI(lifespan=lifespan)`
add API demo from #1 2023-06-05 13:32:18 +00:00

fix typo 2023-06-30 02:09:59 +00:00			`app.add_middleware(`
			`CORSMiddleware,`
			`allow_origins=["*"],`
			`allow_credentials=True,`
			`allow_methods=["*"],`
			`allow_headers=["*"],`
			`)`


support prefixes, loading multiple local files 2023-06-26 07:32:40 +00:00			`class ModelCard(BaseModel):`
			`id: str`
			`object: str = "model"`
			`created: int = Field(default_factory=lambda: int(time.time()))`
			`owned_by: str = "owner"`
			`root: Optional[str] = None`
			`parent: Optional[str] = None`
			`permission: Optional[list] = None`


			`class ModelList(BaseModel):`
			`object: str = "list"`
			`data: List[ModelCard] = []`


match api with OpenAI format 2023-06-22 12:27:00 +00:00			`class ChatMessage(BaseModel):`
update API 2023-06-22 12:46:24 +00:00			`role: Literal["user", "assistant", "system"]`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`content: str`
add API demo from #1 2023-06-05 13:32:18 +00:00
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`class DeltaMessage(BaseModel):`
update API 2023-06-22 12:46:24 +00:00			`role: Optional[Literal["user", "assistant", "system"]] = None`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`content: Optional[str] = None`


			`class ChatCompletionRequest(BaseModel):`
			`model: str`
			`messages: List[ChatMessage]`
			`temperature: Optional[float] = None`
			`top_p: Optional[float] = None`
update api 2023-06-26 05:39:57 +00:00			`max_length: Optional[int] = None`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`max_new_tokens: Optional[int] = None`
			`stream: Optional[bool] = False`


			`class ChatCompletionResponseChoice(BaseModel):`
			`index: int`
			`message: ChatMessage`
			`finish_reason: Literal["stop", "length"]`


			`class ChatCompletionResponseStreamChoice(BaseModel):`
			`index: int`
			`delta: DeltaMessage`
			`finish_reason: Optional[Literal["stop", "length"]]`

Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`class ChatCompletionResponse(BaseModel):`
			`model: str`
update readme 2023-06-22 16:17:05 +00:00			`object: Literal["chat.completion", "chat.completion.chunk"]`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]`
			`created: Optional[int] = Field(default_factory=lambda: int(time.time()))`


support prefixes, loading multiple local files 2023-06-26 07:32:40 +00:00			`@app.get("/v1/models", response_model=ModelList)`
			`async def list_models():`
			`global model_args`
			`model_card = ModelCard(id="gpt-3.5-turbo")`
			`return ModelList(data=[model_card])`


match api with OpenAI format 2023-06-22 12:27:00 +00:00			`@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)`
			`async def create_chat_completion(request: ChatCompletionRequest):`
update readme 2023-06-22 16:17:05 +00:00			`global model, tokenizer, source_prefix, generating_args`
match api with OpenAI format 2023-06-22 12:27:00 +00:00
update API 2023-06-22 12:46:24 +00:00			`if request.messages[-1].role != "user":`
			`raise HTTPException(status_code=400, detail="Invalid request")`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`query = request.messages[-1].content`
update API 2023-06-22 12:46:24 +00:00
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`prev_messages = request.messages[:-1]`
			`if len(prev_messages) > 0 and prev_messages[0].role == "system":`
update readme 2023-06-22 16:17:05 +00:00			`prefix = prev_messages.pop(0).content`
			`else:`
			`prefix = source_prefix`
match api with OpenAI format 2023-06-22 12:27:00 +00:00
			`history = []`
			`if len(prev_messages) % 2 == 0:`
			`for i in range(0, len(prev_messages), 2):`
			`if prev_messages[i].role == "user" and prev_messages[i+1].role == "assistant":`
			`history.append([prev_messages[i].content, prev_messages[i+1].content])`

update readme 2023-06-22 16:17:05 +00:00			`inputs = tokenizer([prompt_template.get_prompt(query, history, prefix)], return_tensors="pt")`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`inputs = inputs.to(model.device)`
add API demo from #1 2023-06-05 13:32:18 +00:00
support RM metrics, add generating Args 2023-06-12 07:48:48 +00:00			`gen_kwargs = generating_args.to_dict()`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`gen_kwargs.update({`
			`"input_ids": inputs["input_ids"],`
			`"temperature": request.temperature if request.temperature else gen_kwargs["temperature"],`
			`"top_p": request.top_p if request.top_p else gen_kwargs["top_p"],`
			`"logits_processor": get_logits_processor()`
			`})`
update api 2023-06-26 05:39:57 +00:00			`if request.max_length:`
			`gen_kwargs.pop("max_new_tokens", None)`
			`gen_kwargs["max_length"] = request.max_length`
			`if request.max_new_tokens:`
			`gen_kwargs.pop("max_length", None)`
			`gen_kwargs["max_new_tokens"] = request.max_new_tokens`
match api with OpenAI format 2023-06-22 12:27:00 +00:00
			`if request.stream:`
			`generate = predict(gen_kwargs, request.model)`
fix streaming response in API 2023-07-05 14:42:31 +00:00			`return EventSourceResponse(generate, media_type="text/event-stream")`
add API demo from #1 2023-06-05 13:32:18 +00:00
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00			`generation_output = model.generate(**gen_kwargs)`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`outputs = generation_output.tolist()[0][len(inputs["input_ids"][0]):]`
add API demo from #1 2023-06-05 13:32:18 +00:00			`response = tokenizer.decode(outputs, skip_special_tokens=True)`

match api with OpenAI format 2023-06-22 12:27:00 +00:00			`choice_data = ChatCompletionResponseChoice(`
			`index=0,`
			`message=ChatMessage(role="assistant", content=response),`
			`finish_reason="stop"`
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00			`)`
match api with OpenAI format 2023-06-22 12:27:00 +00:00
			`return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion")`


			`async def predict(gen_kwargs: Dict[str, Any], model_id: str):`
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00			`global model, tokenizer`
add source prefix 2023-06-16 08:32:17 +00:00
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00			`streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`gen_kwargs["streamer"] = streamer`
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00
			`thread = Thread(target=model.generate, kwargs=gen_kwargs)`
			`thread.start()`

match api with OpenAI format 2023-06-22 12:27:00 +00:00			`choice_data = ChatCompletionResponseStreamChoice(`
			`index=0,`
			`delta=DeltaMessage(role="assistant"),`
			`finish_reason=None`
			`)`
update readme 2023-06-22 16:17:05 +00:00			`chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")`
fix streaming response in API 2023-07-05 14:42:31 +00:00			`yield chunk.json(exclude_unset=True, ensure_ascii=False)`
match api with OpenAI format 2023-06-22 12:27:00 +00:00
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00			`for new_text in streamer:`
match api with OpenAI format 2023-06-22 12:27:00 +00:00			`if len(new_text) == 0:`
			`continue`

			`choice_data = ChatCompletionResponseStreamChoice(`
			`index=0,`
			`delta=DeltaMessage(content=new_text),`
			`finish_reason=None`
			`)`
update readme 2023-06-22 16:17:05 +00:00			`chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")`
fix streaming response in API 2023-07-05 14:42:31 +00:00			`yield chunk.json(exclude_unset=True, ensure_ascii=False)`
match api with OpenAI format 2023-06-22 12:27:00 +00:00
			`choice_data = ChatCompletionResponseStreamChoice(`
			`index=0,`
			`delta=DeltaMessage(),`
			`finish_reason="stop"`
			`)`
update readme 2023-06-22 16:17:05 +00:00			`chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")`
fix streaming response in API 2023-07-05 14:42:31 +00:00			`yield chunk.json(exclude_unset=True, ensure_ascii=False)`
			`yield "[DONE]"`
Compatible with OpenAI API. 2023-06-21 06:45:04 +00:00

			`if __name__ == "__main__":`
support RM metrics, add generating Args 2023-06-12 07:48:48 +00:00			`model_args, data_args, finetuning_args, generating_args = prepare_infer_args()`
add API demo from #1 2023-06-05 13:32:18 +00:00			`model, tokenizer = load_pretrained(model_args, finetuning_args)`
add source prefix 2023-06-16 08:32:17 +00:00
add prompt template class 2023-06-07 03:55:25 +00:00			`prompt_template = Template(data_args.prompt_template)`
add source prefix 2023-06-16 08:32:17 +00:00			`source_prefix = data_args.source_prefix if data_args.source_prefix else ""`
add API demo from #1 2023-06-05 13:32:18 +00:00
fix streaming response in API 2023-07-05 14:42:31 +00:00			`uvicorn.run(app, host="0.0.0.0", port=8000, workers=1)`