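# Minimal Flask front-end that shells out to llama.cpp and streams the
# completion back to the HTTP client, served over a unix socket.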
import os
import subprocess
import time
import threading

from flask import Flask, Response

app = Flask(__name__)


@app.route("/<prompt>")
def llama(prompt):
    # Browsers also request /favicon.ico; answer 204 No Content rather
    # than passing "favicon.ico" to the model as a prompt.
    if prompt == "favicon.ico":
        return Response(status=204)
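
    # generate() is a generator: Flask iterates it and streams each
    # yielded chunk to the client as soon as it is produced.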
    def generate():
        process = None  # defined before try so the finally block is safe
        try:
            process = subprocess.Popen(
                [
                    "/opt/llama.cpp/main",
                    "-ngl",
                    "32",  # offload 32 layers to the GPU
                    "-m",
                    "/opt/llama.cpp/models/ggml-vicuna-7b-1.1-q4_0.bin",
                    "-n",
                    "1024",  # cap generation at 1024 tokens
                    "-p",
                    f"### Human: {prompt}\n### Assistant:",
                ],
                stderr=subprocess.STDOUT,
                stdout=subprocess.PIPE,
            )
            # Relay stdout one byte at a time so tokens appear as generated.
            for c in iter(lambda: process.stdout.read(1), b""):
                yield c
        finally:
            # Kill llama.cpp when generation finishes or the client drops.
            if process is not None:
                process.kill()

    return Response(generate(), mimetype="text/plain")


path = "/srv/http/pages/textgen"


def fixperms():
    # app.run() below creates the socket; wait briefly for it to exist,
    # then loosen its permissions so the front-end web server can connect.
    time.sleep(0.1)
    os.chmod(path, 0o660)  # octal mode: a bare 660 would set the wrong bits


threading.Thread(target=fixperms).start()

# Serve on a unix socket instead of TCP; Werkzeug accepts "unix://" hosts.
app.run(host="unix://" + path)
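
# Quick way to exercise the service (a sketch; assumes curl is installed
# and the socket path above exists):
#
#   curl --unix-socket /srv/http/pages/textgen http://localhost/hello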