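# Minimal Flask front-end that shells out to llama.cpp and streams the
# completion back to the HTTP client, served over a unix socket.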
import os
import subprocess
import time
import threading

from flask import Flask, Response

app = Flask(__name__)


@app.route("/<prompt>")
def llama(prompt):
    # Browsers also request /favicon.ico; answer 204 No Content rather
    # than passing "favicon.ico" to the model as a prompt.
    if prompt == "favicon.ico":
        return Response(status=204)
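
    # generate() is a generator: Flask iterates it and streams each
    # yielded chunk to the client as soon as it is produced.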
    def generate():
        process = None  # defined before try so the finally block is safe
        try:
            process = subprocess.Popen(
                [
                    "/opt/llama.cpp/main",
                    "-ngl",
                    "32",  # offload 32 layers to the GPU
                    "-m",
                    "/opt/llama.cpp/models/ggml-vicuna-7b-1.1-q4_0.bin",
                    "-n",
                    "1024",  # cap generation at 1024 tokens
                    "-p",
                    f"### Human: {prompt}\n### Assistant:",
                ],
                stderr=subprocess.STDOUT,
                stdout=subprocess.PIPE,
            )
            # Relay stdout one byte at a time so tokens appear as generated.
            for c in iter(lambda: process.stdout.read(1), b""):
                yield c
        finally:
            # Kill llama.cpp when generation finishes or the client drops.
            if process is not None:
                process.kill()

    return Response(generate(), mimetype="text/plain")


path = "/srv/http/pages/textgen"


def fixperms():
    # app.run() below creates the socket; wait briefly for it to exist,
    # then loosen its permissions so the front-end web server can connect.
    time.sleep(0.1)
    os.chmod(path, 0o660)  # octal mode: a bare 660 would set the wrong bits


threading.Thread(target=fixperms).start()

# Serve on a unix socket instead of TCP; Werkzeug accepts "unix://" hosts.
app.run(host="unix://" + path)
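
# Quick way to exercise the service (a sketch; assumes curl is installed
# and the socket path above exists):
#
#   curl --unix-socket /srv/http/pages/textgen http://localhost/hello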