✨ feat: add deploy #177
Open
Ling-yunchi wants to merge 4 commits into mymusise:master from Ling-yunchi:master
Changes from 3 commits
@@ -0,0 +1,76 @@
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModel
import uvicorn
import json
import datetime
import torch
from peft import get_peft_model, LoraConfig, TaskType

DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE


def torch_gc():
    # Free cached GPU memory after each request.
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()


app = FastAPI()


@app.post("/")
async def chat(request: Request):
    global model, tokenizer
    json_post_raw = await request.json()
    json_post = json.dumps(json_post_raw)
    json_post_list = json.loads(json_post)
    prompt = json_post_list.get('prompt')
    history = json_post_list.get('history')
    max_length = json_post_list.get('max_length')
    top_p = json_post_list.get('top_p')
    temperature = json_post_list.get('temperature')
    response, history = model.chat(tokenizer,
                                   prompt,
                                   history=history,
                                   max_length=max_length if max_length else 2048,
                                   top_p=top_p if top_p else 0.7,
                                   temperature=temperature if temperature else 0.95)
    now = datetime.datetime.now()
    time = now.strftime("%Y-%m-%d %H:%M:%S")
    answer = {
        "response": response,
        "history": history,
        "status": 200,
        "time": time
    }
    log = "[" + time + "] " + 'prompt:"' + \
        prompt + '", response:"' + repr(response) + '"'
    print(log)
    torch_gc()
    return answer


if __name__ == '__main__':
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    tokenizer = AutoTokenizer.from_pretrained(
        "THUDM/chatglm-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained(
        "THUDM/chatglm-6b", trust_remote_code=True).half().cuda()

    peft_path = "output/adapter_model.bin"

    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, inference_mode=True,
        r=8,
        lora_alpha=32, lora_dropout=0.1
    )

    # Wrap the base model with the LoRA config and load the fine-tuned adapter weights.
    model = get_peft_model(model, peft_config)
    model.load_state_dict(torch.load(peft_path), strict=False)

    model.eval()

    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
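For reference, a minimal client call against this endpoint could look like the sketch below. It is illustrative only: it assumes the server above is running locally on port 8000, and the JSON fields simply mirror what the handler reads.

```python
import requests

# Hypothetical client for the deploy API above; adjust host/port to your setup.
resp = requests.post(
    "http://127.0.0.1:8000/",
    json={
        "prompt": "Hello",
        "history": [],
        "max_length": 2048,
        "top_p": 0.7,
        "temperature": 0.95,
    },
)
print(resp.json()["response"])
```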
@@ -0,0 +1,116 @@
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import mdtex2html
from peft import get_peft_model, LoraConfig, TaskType
import torch

tokenizer = AutoTokenizer.from_pretrained(
    "THUDM/chatglm-6b", trust_remote_code=True)
model = AutoModel.from_pretrained(
    "THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
peft_path = "output/adapter_model.bin"
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=True,
    r=8,
    lora_alpha=32, lora_dropout=0.1
)
model = get_peft_model(model, peft_config)
model.load_state_dict(torch.load(peft_path), strict=False)
model.eval()

"""Override Chatbot.postprocess"""


def postprocess(self, y):
    if y is None:
        return []
    for i, (message, response) in enumerate(y):
        y[i] = (
            None if message is None else mdtex2html.convert(message),
            None if response is None else mdtex2html.convert(response),
        )
    return y


gr.Chatbot.postprocess = postprocess


def parse_text(text):
    """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/"""
    lines = text.split("\n")
    lines = [line for line in lines if line != ""]
    count = 0
    for i, line in enumerate(lines):
        if "```" in line:
            count += 1
            items = line.split('`')
            if count % 2 == 1:
                lines[i] = f'<pre><code class="language-{items[-1]}">'
            else:
                lines[i] = '<br></code></pre>'
        else:
            if i > 0:
                if count % 2 == 1:
                    # Escape Markdown/HTML special characters inside code blocks.
                    line = line.replace("`", "\\`")
                    line = line.replace("<", "&lt;")
                    line = line.replace(">", "&gt;")
                    line = line.replace(" ", "&nbsp;")
                    line = line.replace("*", "&ast;")
                    line = line.replace("_", "&lowbar;")
                    line = line.replace("-", "&#45;")
                    line = line.replace(".", "&#46;")
                    line = line.replace("!", "&#33;")
                    line = line.replace("(", "&#40;")
                    line = line.replace(")", "&#41;")
                    line = line.replace("$", "&#36;")
                lines[i] = "<br>" + line
    text = "".join(lines)
    return text


def predict(input, chatbot, max_length, top_p, temperature, history):
    chatbot.append((parse_text(input), ""))
    for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
                                               temperature=temperature):
        chatbot[-1] = (parse_text(input), parse_text(response))

        yield chatbot, history


def reset_user_input():
    return gr.update(value='')


def reset_state():
    return [], []


with gr.Blocks() as demo:
    gr.HTML("""<h1 align="center">ChatGLM</h1>""")

    chatbot = gr.Chatbot()
    with gr.Row():
        with gr.Column(scale=4):
            with gr.Column(scale=12):
                user_input = gr.Textbox(show_label=False, placeholder="Input...", lines=10).style(
                    container=False)
            with gr.Column(min_width=32, scale=1):
                submitBtn = gr.Button("Submit", variant="primary")
        with gr.Column(scale=1):
            emptyBtn = gr.Button("Clear History")
            max_length = gr.Slider(
                0, 4096, value=2048, step=1.0, label="Maximum length", interactive=True)
            top_p = gr.Slider(0, 1, value=0.7, step=0.01,
                              label="Top P", interactive=True)
            temperature = gr.Slider(
                0, 1, value=0.95, step=0.01, label="Temperature", interactive=True)

    history = gr.State([])

    submitBtn.click(predict, [user_input, chatbot, max_length, top_p, temperature, history], [chatbot, history],
                    show_progress=True)
    submitBtn.click(reset_user_input, [], [user_input])

    emptyBtn.click(reset_state, outputs=[chatbot, history], show_progress=True)

demo.queue().launch(share=False, inbrowser=True)
@@ -0,0 +1,82 @@
from transformers import AutoModel, AutoTokenizer
import streamlit as st
from streamlit_chat import message
from peft import get_peft_model, LoraConfig, TaskType
import torch

st.set_page_config(
    page_title="ChatGLM-6b 演示",
    page_icon=":robot:"
)


@st.cache_resource
def get_model():
    # Load the base model once per session and attach the LoRA adapter weights.
    tokenizer = AutoTokenizer.from_pretrained(
        "THUDM/chatglm-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained(
        "THUDM/chatglm-6b", trust_remote_code=True).half().cuda()
    peft_path = "output/adapter_model.bin"
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, inference_mode=True,
        r=8,
        lora_alpha=32, lora_dropout=0.1
    )
    model = get_peft_model(model, peft_config)
    model.load_state_dict(torch.load(peft_path), strict=False)
    model.eval()
    return tokenizer, model


MAX_TURNS = 20
MAX_BOXES = MAX_TURNS * 2


def predict(input, max_length, top_p, temperature, history=None):
    tokenizer, model = get_model()
    if history is None:
        history = []

    with container:
        if len(history) > 0:
            for i, (query, response) in enumerate(history):
                message(query, avatar_style="big-smile", key=str(i) + "_user")
                message(response, avatar_style="bottts", key=str(i))

        message(input, avatar_style="big-smile",
                key=str(len(history)) + "_user")
        st.write("AI正在回复:")
        with st.empty():
            for response, history in model.stream_chat(tokenizer, input, history, max_length=max_length, top_p=top_p,
                                                        temperature=temperature):
                query, response = history[-1]
                st.write(response)

    return history


container = st.container()

# create a prompt text for the text generation
prompt_text = st.text_area(label="用户命令输入",
                           height=100,
                           placeholder="请在这儿输入您的命令")

max_length = st.sidebar.slider(
    'max_length', 0, 4096, 2048, step=1
)
top_p = st.sidebar.slider(
    'top_p', 0.0, 1.0, 0.6, step=0.01
)
temperature = st.sidebar.slider(
    'temperature', 0.0, 1.0, 0.95, step=0.01
)

if 'state' not in st.session_state:
    st.session_state['state'] = []

if st.button("发送", key="predict"):
    with st.spinner("AI正在思考,请稍等........"):
        # text generation
        st.session_state["state"] = predict(
            prompt_text, max_length, top_p, temperature, st.session_state["state"])
A small suggestion: for settings such as the LoRA adapter path, the port, and the number of workers, it would be much more convenient if they could be read from command-line arguments at startup, for example:
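A minimal sketch of what such argument parsing could look like with argparse; the flag names and defaults below are only illustrative and are not part of this PR.

```python
import argparse

# Illustrative only: read deployment settings from the command line.
parser = argparse.ArgumentParser(description="ChatGLM LoRA deploy")
parser.add_argument("--peft-path", default="output/adapter_model.bin",
                    help="path to the trained LoRA adapter weights")
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--workers", type=int, default=1)
args = parser.parse_args()

# ...load the model with args.peft_path as in the deploy script above,
# then pass args.host, args.port, and args.workers to uvicorn.run(...).
```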