50 lines
1.1 KiB
Python
50 lines
1.1 KiB
Python
import os
|
|
import subprocess
|
|
import time
|
|
import threading
|
|
from flask import Flask, Response
|
|
|
|
# WSGI application object; served over a unix socket via app.run() at the
# bottom of this file.
app = Flask(__name__)
@app.route("/<prompt>")
def llama(prompt):
    """Stream a llama.cpp completion for *prompt* as a plain-text response.

    The URL path component is used verbatim as the model prompt. Browser
    favicon requests are answered with an empty 204 so they never trigger
    a model run.
    """
    if prompt == "favicon.ico":
        return Response(status=204)

    def generate():
        # Popen is called with an argument list (shell=False), so the
        # user-supplied prompt cannot be used for shell injection.
        #
        # Bound OUTSIDE the try block: in the original, a Popen failure
        # (e.g. missing binary) raised before `process` existed, and the
        # finally clause then crashed with NameError on process.kill().
        process = subprocess.Popen(
            [
                "/opt/llama.cpp/main",
                "-ngl",
                "32",
                "-m",
                "/opt/llama.cpp/models/wizardLM-7B.ggmlv3.q4_0.bin",
                "-n",
                "1024",
                "-p",
                f"{prompt}\n\n### Response:",
            ],
            stdout=subprocess.PIPE,
        )
        try:
            # Forward output byte-by-byte so the client sees tokens as
            # soon as llama.cpp emits them (no buffering latency).
            for c in iter(lambda: process.stdout.read(1), b""):
                yield c
        finally:
            # Runs on normal EOF and on client disconnect (GeneratorExit).
            # wait() reaps the killed child so it does not linger as a
            # zombie process.
            process.kill()
            process.wait()

    return Response(generate(), mimetype="text/plain")
# Filesystem location of the unix socket this app listens on.
# NOTE(review): /srv/http suggests a front-end web server proxies requests
# to this socket — confirm against the server config.
path = "/srv/http/pages/textgen"
def fixperms(target=None, mode=0o660, delay=0.1):
    """Relax permissions on the server's unix socket shortly after startup.

    app.run() blocks the main thread and only creates the socket file once
    it starts, so this is run on a background thread: sleep briefly to let
    the socket appear, then chmod it so the group (e.g. the reverse proxy)
    can connect.

    Args:
        target: Path to chmod; defaults to the module-level socket ``path``.
        mode: Permission bits to apply. Default ``0o660`` (rw-rw----).
            Bug fix: the original passed the decimal literal ``660``
            (== ``0o1224``), which set sticky/execute bits instead of
            owner/group read-write.
        delay: Seconds to wait before applying the chmod.
    """
    time.sleep(delay)
    os.chmod(path if target is None else target, mode)
# app.run() below blocks and is what creates the socket file, so the
# permission fix must happen on a background thread after a short delay.
threading.Thread(target=fixperms).start()

# Serve over a unix socket at `path` rather than a TCP port.
app.run(host="unix://" + path)