From fd89bcb23211b8aad047df2c5a07f883c082c7db Mon Sep 17 00:00:00 2001
From: Anthony Wang
Date: Mon, 29 May 2023 20:15:59 +0000
Subject: [PATCH] Hide stderr output

---
 gpt.py   | 54 +++++++++++++++++++++++++++++++++++
 llama.py | 50 ---------------------------------
 main.py  | 85 ++++++++++++++++++++++++++------------------------
 3 files changed, 94 insertions(+), 95 deletions(-)
 create mode 100644 gpt.py
 delete mode 100644 llama.py

diff --git a/gpt.py b/gpt.py
new file mode 100644
index 0000000..aa3aae1
--- /dev/null
+++ b/gpt.py
@@ -0,0 +1,54 @@
+from http.server import HTTPServer, BaseHTTPRequestHandler
+from os import chmod
+from pathlib import Path
+from socket import AF_UNIX
+from socketserver import UnixStreamServer
+from urllib.parse import unquote
+
+from torch import float16
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+# https://stackoverflow.com/questions/21650370/setting-up-an-http-server-that-listens-over-a-file-socket
+class UnixHTTPServer(UnixStreamServer):
+    def get_request(self):
+        request, client_address = super(UnixHTTPServer, self).get_request()
+        return (request, ["local", 0])
+
+
+class textgenHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        prompt = unquote(self.path[1:])
+        print('Prompt')
+        print(prompt)
+
+        if prompt == 'favicon.ico':
+            return
+
+        input = tokenizer.encode(prompt, return_tensors='pt').to('cuda')
+        output = tokenizer.decode(model.generate(
+            input, do_sample=True, max_length=500, top_p=0.9)[0])
+        print(output)
+
+        self.send_response(200)
+        self.send_header('Content-Type', 'text/plain')
+        self.send_header('Content-Length', str(len(output)))
+        self.end_headers()
+        self.wfile.write(output.encode('utf-8'))
+
+
+# Load model
+print('Loading model')
+tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')
+model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-2.7B',
+    torch_dtype=float16, low_cpu_mem_usage=True).to('cuda')
+
+
+# Create and start server
+print('Starting server')
+path = '/srv/http/pages/textgen'
+Path(path).unlink(missing_ok=True)
+server = UnixHTTPServer(path, textgenHandler)
+chmod(path, 666)
+print('Server ready')
+server.serve_forever()
diff --git a/llama.py b/llama.py
deleted file mode 100644
index 31cdc04..0000000
--- a/llama.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import os
-import subprocess
-import time
-import threading
-from flask import Flask, Response
-
-app = Flask(__name__)
-
-
-@app.route("/")
-def llama(prompt):
-    if prompt == "favicon.ico":
-        return Response(status=204)
-
-    def generate():
-        try:
-            process = subprocess.Popen(
-                [
-                    "/opt/llama.cpp/main",
-                    "-ngl",
-                    "32",
-                    "-m",
-                    "/opt/llama.cpp/models/ggml-vicuna-7b-1.1-q4_0.bin",
-                    "-n",
-                    "1024",
-                    "-p",
-                    f"### Human: {prompt}\n### Assistant:",
-                ],
-                stderr=subprocess.STDOUT,
-                stdout=subprocess.PIPE,
-            )
-            for c in iter(lambda: process.stdout.read(1), b""):
-                yield c
-        finally:
-            process.kill()
-
-    return Response(generate(), mimetype="text/plain")
-
-
-path = "/srv/http/pages/textgen"
-
-
-def fixperms():
-    time.sleep(0.1)
-    os.chmod(path, 660)
-
-
-threading.Thread(target=fixperms).start()
-
-app.run(host="unix://" + path)
diff --git a/main.py b/main.py
index aa3aae1..0cce538 100644
--- a/main.py
+++ b/main.py
@@ -1,54 +1,49 @@
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from os import chmod
-from pathlib import Path
-from socket import AF_UNIX
-from socketserver import UnixStreamServer
-from urllib.parse import unquote
+import os
+import subprocess
+import time
+import threading
+from flask import Flask, Response
 
-from torch import float16
-from transformers import AutoModelForCausalLM, AutoTokenizer
+app = Flask(__name__)
 
 
-# https://stackoverflow.com/questions/21650370/setting-up-an-http-server-that-listens-over-a-file-socket
-class UnixHTTPServer(UnixStreamServer):
-    def get_request(self):
-        request, client_address = super(UnixHTTPServer, self).get_request()
-        return (request, ["local", 0])
+@app.route("/")
+def llama(prompt):
+    if prompt == "favicon.ico":
+        return Response(status=204)
+
+    def generate():
+        try:
+            process = subprocess.Popen(
+                [
+                    "/opt/llama.cpp/main",
+                    "-ngl",
+                    "32",
+                    "-m",
+                    "/opt/llama.cpp/models/ggml-vicuna-7b-1.1-q4_0.bin",
+                    "-n",
+                    "1024",
+                    "-p",
+                    f"### Human: {prompt}\n### Assistant:",
+                ],
+                stdout=subprocess.PIPE,
+            )
+            for c in iter(lambda: process.stdout.read(1), b""):
+                yield c
+        finally:
+            process.kill()
+
+    return Response(generate(), mimetype="text/plain")
 
 
-class textgenHandler(BaseHTTPRequestHandler):
-    def do_GET(self):
-        prompt = unquote(self.path[1:])
-        print('Prompt')
-        print(prompt)
-
-        if prompt == 'favicon.ico':
-            return
-
-        input = tokenizer.encode(prompt, return_tensors='pt').to('cuda')
-        output = tokenizer.decode(model.generate(
-            input, do_sample=True, max_length=500, top_p=0.9)[0])
-        print(output)
-
-        self.send_response(200)
-        self.send_header('Content-Type', 'text/plain')
-        self.send_header('Content-Length', str(len(output)))
-        self.end_headers()
-        self.wfile.write(output.encode('utf-8'))
+path = "/srv/http/pages/textgen"
 
 
-# Load model
-print('Loading model')
-tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')
-model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-2.7B',
-    torch_dtype=float16, low_cpu_mem_usage=True).to('cuda')
+def fixperms():
+    time.sleep(0.1)
+    os.chmod(path, 660)
 
 
-# Create and start server
-print('Starting server')
-path = '/srv/http/pages/textgen'
-Path(path).unlink(missing_ok=True)
-server = UnixHTTPServer(path, textgenHandler)
-chmod(path, 666)
-print('Server ready')
-server.serve_forever()
+threading.Thread(target=fixperms).start()
+
+app.run(host="unix://" + path)
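
Note (not part of the commit): removing stderr=subprocess.STDOUT stops llama.cpp's
log output from being interleaved with the generated text in the HTTP response;
the subprocess's stderr now simply falls through to the Flask server's own stderr.
If the intent is to discard that output entirely, one alternative is to point
stderr at DEVNULL. A minimal standalone sketch, reusing the binary, model path
and prompt format from main.py (the example prompt itself is made up here):

    import subprocess

    # Run llama.cpp once, stream its stdout, and throw away its stderr.
    process = subprocess.Popen(
        [
            "/opt/llama.cpp/main",
            "-ngl", "32",
            "-m", "/opt/llama.cpp/models/ggml-vicuna-7b-1.1-q4_0.bin",
            "-n", "1024",
            "-p", "### Human: Say hello\n### Assistant:",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,  # hide the loading/progress chatter completely
    )
    # Same byte-by-byte streaming loop as the server uses, printed to the console.
    for c in iter(lambda: process.stdout.read(1), b""):
        print(c.decode("utf-8", errors="replace"), end="", flush=True)
    process.wait()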