From fd89bcb23211b8aad047df2c5a07f883c082c7db Mon Sep 17 00:00:00 2001
From: Anthony Wang
Date: Mon, 29 May 2023 20:15:59 +0000
Subject: [PATCH] Hide stderr output

---
 gpt.py   | 54 +++++++++++++++++++++++++++++++++++
 llama.py | 50 ---------------------------------
 main.py  | 85 ++++++++++++++++++++++++++------------------------
 3 files changed, 94 insertions(+), 95 deletions(-)
 create mode 100644 gpt.py
 delete mode 100644 llama.py

diff --git a/gpt.py b/gpt.py
new file mode 100644
index 0000000..aa3aae1
--- /dev/null
+++ b/gpt.py
@@ -0,0 +1,54 @@
+from http.server import HTTPServer, BaseHTTPRequestHandler
+from os import chmod
+from pathlib import Path
+from socket import AF_UNIX
+from socketserver import UnixStreamServer
+from urllib.parse import unquote
+
+from torch import float16
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+# https://stackoverflow.com/questions/21650370/setting-up-an-http-server-that-listens-over-a-file-socket
+class UnixHTTPServer(UnixStreamServer):
+    def get_request(self):
+        request, client_address = super(UnixHTTPServer, self).get_request()
+        return (request, ["local", 0])
+
+
+class textgenHandler(BaseHTTPRequestHandler):
+    def do_GET(self):
+        prompt = unquote(self.path[1:])
+        print('Prompt')
+        print(prompt)
+
+        if prompt == 'favicon.ico':
+            return
+
+        input = tokenizer.encode(prompt, return_tensors='pt').to('cuda')
+        output = tokenizer.decode(model.generate(
+            input, do_sample=True, max_length=500, top_p=0.9)[0])
+        print(output)
+
+        self.send_response(200)
+        self.send_header('Content-Type', 'text/plain')
+        self.send_header('Content-Length', str(len(output)))
+        self.end_headers()
+        self.wfile.write(output.encode('utf-8'))
+
+
+# Load model
+print('Loading model')
+tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')
+model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-2.7B',
+    torch_dtype=float16, low_cpu_mem_usage=True).to('cuda')
+
+
+# Create and start server
+print('Starting server')
+path = '/srv/http/pages/textgen'
+Path(path).unlink(missing_ok=True)
+server = UnixHTTPServer(path, textgenHandler)
+chmod(path, 666)
+print('Server ready')
+server.serve_forever()
diff --git a/llama.py b/llama.py
deleted file mode 100644
index 31cdc04..0000000
--- a/llama.py
+++ /dev/null
@@ -1,50 +0,0 @@
-import os
-import subprocess
-import time
-import threading
-from flask import Flask, Response
-
-app = Flask(__name__)
-
-
-@app.route("/")
-def llama(prompt):
-    if prompt == "favicon.ico":
-        return Response(status=204)
-
-    def generate():
-        try:
-            process = subprocess.Popen(
-                [
-                    "/opt/llama.cpp/main",
-                    "-ngl",
-                    "32",
-                    "-m",
-                    "/opt/llama.cpp/models/ggml-vicuna-7b-1.1-q4_0.bin",
-                    "-n",
-                    "1024",
-                    "-p",
-                    f"### Human: {prompt}\n### Assistant:",
-                ],
-                stderr=subprocess.STDOUT,
-                stdout=subprocess.PIPE,
-            )
-            for c in iter(lambda: process.stdout.read(1), b""):
-                yield c
-        finally:
-            process.kill()
-
-    return Response(generate(), mimetype="text/plain")
-
-
-path = "/srv/http/pages/textgen"
-
-
-def fixperms():
-    time.sleep(0.1)
-    os.chmod(path, 660)
-
-
-threading.Thread(target=fixperms).start()
-
-app.run(host="unix://" + path)
diff --git a/main.py b/main.py
index aa3aae1..0cce538 100644
--- a/main.py
+++ b/main.py
@@ -1,54 +1,49 @@
-from http.server import HTTPServer, BaseHTTPRequestHandler
-from os import chmod
-from pathlib import Path
-from socket import AF_UNIX
-from socketserver import UnixStreamServer
-from urllib.parse import unquote
+import os
+import subprocess
+import time
+import threading
+from flask import Flask, Response
 
-from torch import float16
-from transformers import AutoModelForCausalLM, AutoTokenizer
+app = Flask(__name__)
 
 
-# https://stackoverflow.com/questions/21650370/setting-up-an-http-server-that-listens-over-a-file-socket
-class UnixHTTPServer(UnixStreamServer):
-    def get_request(self):
-        request, client_address = super(UnixHTTPServer, self).get_request()
-        return (request, ["local", 0])
+@app.route("/")
+def llama(prompt):
+    if prompt == "favicon.ico":
+        return Response(status=204)
+
+    def generate():
+        try:
+            process = subprocess.Popen(
+                [
+                    "/opt/llama.cpp/main",
+                    "-ngl",
+                    "32",
+                    "-m",
+                    "/opt/llama.cpp/models/ggml-vicuna-7b-1.1-q4_0.bin",
+                    "-n",
+                    "1024",
+                    "-p",
+                    f"### Human: {prompt}\n### Assistant:",
+                ],
+                stdout=subprocess.PIPE,
+            )
+            for c in iter(lambda: process.stdout.read(1), b""):
+                yield c
+        finally:
+            process.kill()
+
+    return Response(generate(), mimetype="text/plain")
 
 
-class textgenHandler(BaseHTTPRequestHandler):
-    def do_GET(self):
-        prompt = unquote(self.path[1:])
-        print('Prompt')
-        print(prompt)
-
-        if prompt == 'favicon.ico':
-            return
-
-        input = tokenizer.encode(prompt, return_tensors='pt').to('cuda')
-        output = tokenizer.decode(model.generate(
-            input, do_sample=True, max_length=500, top_p=0.9)[0])
-        print(output)
-
-        self.send_response(200)
-        self.send_header('Content-Type', 'text/plain')
-        self.send_header('Content-Length', str(len(output)))
-        self.end_headers()
-        self.wfile.write(output.encode('utf-8'))
+path = "/srv/http/pages/textgen"
 
 
-# Load model
-print('Loading model')
-tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neo-2.7B')
-model = AutoModelForCausalLM.from_pretrained('EleutherAI/gpt-neo-2.7B',
-    torch_dtype=float16, low_cpu_mem_usage=True).to('cuda')
+def fixperms():
+    time.sleep(0.1)
+    os.chmod(path, 660)
 
 
-# Create and start server
-print('Starting server')
-path = '/srv/http/pages/textgen'
-Path(path).unlink(missing_ok=True)
-server = UnixHTTPServer(path, textgenHandler)
-chmod(path, 666)
-print('Server ready')
-server.serve_forever()
+threading.Thread(target=fixperms).start()
+
+app.run(host="unix://" + path)
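
Note (not part of the commit): removing stderr=subprocess.STDOUT stops llama.cpp's
log output from being interleaved with the generated text in the HTTP response;
the subprocess's stderr now simply falls through to the Flask server's own stderr.
If the intent is to discard that output entirely, one alternative is to point
stderr at DEVNULL. A minimal standalone sketch, reusing the binary, model path
and prompt format from main.py (the example prompt itself is made up here):

    import subprocess

    # Run llama.cpp once, stream its stdout, and throw away its stderr.
    process = subprocess.Popen(
        [
            "/opt/llama.cpp/main",
            "-ngl", "32",
            "-m", "/opt/llama.cpp/models/ggml-vicuna-7b-1.1-q4_0.bin",
            "-n", "1024",
            "-p", "### Human: Say hello\n### Assistant:",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,  # hide the loading/progress chatter completely
    )
    # Same byte-by-byte streaming loop as the server uses, printed to the console.
    for c in iter(lambda: process.stdout.read(1), b""):
        print(c.decode("utf-8", errors="replace"), end="", flush=True)
    process.wait()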