# 01/software/source/server/services/llm/llamaedge/llm.py

import json
import os
import shutil
import socket
import subprocess
import time

import requests

class Llm:
    """Local LLM service: a LlamaEdge API server run through WasmEdge.

    Constructing an instance performs the one-time setup (WasmEdge install,
    model and server-app download) if needed, then makes sure the API server
    is running in the background so that :meth:`llm` can talk to it.
    """

    WASMEDGE_INSTALLER_URL = "https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh"
    MODEL_URL = "https://huggingface.co/second-state/Qwen1.5-0.5B-Chat-GGUF/resolve/main/Qwen1.5-0.5B-Chat-Q5_K_M.gguf"
    APP_URL = "https://github.com/LlamaEdge/LlamaEdge/releases/latest/download/llama-api-server.wasm"
    MODEL_FILE = "Qwen1.5-0.5B-Chat-Q5_K_M.gguf"
    APP_FILE = "llama-api-server.wasm"
    SERVER_HOST = "localhost"
    SERVER_PORT = 8080
    # Upper bound (seconds) on how long construction waits for the server.
    SERVER_STARTUP_TIMEOUT = 120

    def __init__(self, config):
        """Set up and start the service.

        Args:
            config: Mapping with a ``"service_directory"`` entry naming the
                directory under which the ``llm`` folder is kept.
        """
        self.install(config["service_directory"])

    def install(self, service_directory):
        """Download everything on first use, then start the API server.

        The ``llm`` directory is only created once every download has
        succeeded (files are fetched into a sibling ``llm.partial`` staging
        directory and renamed at the end), so its existence reliably means
        "setup finished" and a failed attempt is retried on the next run.

        Args:
            service_directory: Parent directory of the ``llm`` folder.

        Raises:
            subprocess.CalledProcessError: If an install/download command
                exits with a non-zero status.
            RuntimeError: If the server process exits during start-up.
        """
        self.llm_directory = os.path.join(service_directory, 'llm')

        if not os.path.isdir(self.llm_directory):
            staging_directory = self.llm_directory + '.partial'
            os.makedirs(staging_directory, exist_ok=True)

            self._install_wasmedge()

            # Model GGUF file, then the llama-api-server.wasm app.
            # -f makes curl fail on HTTP errors instead of saving an error page.
            for download_url in (self.MODEL_URL, self.APP_URL):
                subprocess.run(['curl', '-fLO', download_url], cwd=staging_directory, check=True)

            os.replace(staging_directory, self.llm_directory)
            print("LLM setup completed.")
        else:
            print("LLM already set up. Skipping download.")

        # The server is needed on every run, not only right after installing.
        self._start_server()

    def _install_wasmedge(self):
        """Fetch the WasmEdge install script and feed it to bash on stdin."""
        # A '|' inside an argument list is not a pipe (no shell is involved),
        # so the two stages are connected explicitly.
        script = subprocess.run(
            ['curl', '-sSf', self.WASMEDGE_INSTALLER_URL],
            stdout=subprocess.PIPE,
            check=True,
        )
        subprocess.run(
            ['bash', '-s', '--', '--plugin', 'wasi_nn-ggml'],
            input=script.stdout,
            check=True,
        )

    @staticmethod
    def _wasmedge_command():
        """Return the executable to use for ``wasmedge``."""
        on_path = shutil.which('wasmedge')
        if on_path:
            return on_path
        # A fresh install does not change this process's PATH.
        # NOTE(review): assumes the installer's default prefix is ~/.wasmedge — confirm.
        fallback = os.path.join(os.path.expanduser('~'), '.wasmedge', 'bin', 'wasmedge')
        return fallback if os.path.isfile(fallback) else 'wasmedge'

    def _server_is_listening(self):
        """Return True if something accepts TCP connections on the server port."""
        try:
            with socket.create_connection((self.SERVER_HOST, self.SERVER_PORT), timeout=1):
                return True
        except OSError:
            return False

    def _start_server(self):
        """Launch the API server in the background and wait until it is reachable.

        Does nothing if the port is already being served. The process handle
        is kept in ``self.server_process`` (``None`` when nothing was launched).
        """
        self.server_process = None
        if self._server_is_listening():
            return

        # NOTE(review): the prompt template is 'llama-2-chat' although the model
        # is Qwen1.5 — confirm this is the intended template.
        self.server_process = subprocess.Popen(
            [
                self._wasmedge_command(),
                '--dir', '.:.',
                '--nn-preload', 'default:GGML:AUTO:' + self.MODEL_FILE,
                self.APP_FILE,
                '-p', 'llama-2-chat',
            ],
            cwd=self.llm_directory,
        )

        deadline = time.monotonic() + self.SERVER_STARTUP_TIMEOUT
        while time.monotonic() < deadline:
            exit_code = self.server_process.poll()
            if exit_code is not None:
                raise RuntimeError(
                    f"wasmedge exited with code {exit_code} before the API server became reachable"
                )
            if self._server_is_listening():
                return
            time.sleep(0.5)
        print("LLM server did not become reachable in time; requests may fail until it is up.")

    @staticmethod
    def _extract_json_payload(line):
        """Return the JSON text carried by one response line, or None.

        Plain JSON lines are returned unchanged (whitespace-stripped). A
        server-sent-event ``data:`` prefix is removed; blank lines, SSE
        comment lines (starting with ``:``) and the ``[DONE]`` terminator
        yield None.

        Args:
            line: One line of the response body, as ``bytes`` or ``str``.
        """
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        text = line.strip()
        if text.startswith(':'):
            return None
        if text.startswith('data:'):
            text = text[len('data:'):].strip()
        if not text or text == '[DONE]':
            return None
        return text

    def llm(self, messages):
        """Send a chat-completion request and yield the decoded response lines.

        Args:
            messages: Value placed under the ``"messages"`` key of the request.

        Yields:
            One decoded JSON value per payload-carrying line of the response.

        Raises:
            json.JSONDecodeError: If a payload line is not valid JSON.
        """
        url = "http://localhost:8080/v1/chat/completions"
        headers = {
            'accept': 'application/json',
            'Content-Type': 'application/json'
        }
        data = {
            "messages": messages,
            "model": "llama-2-chat"
        }
        with requests.post(url, headers=headers, data=json.dumps(data), stream=True) as response:
            for line in response.iter_lines():
                payload = self._extract_json_payload(line)
                if payload is not None:
                    yield json.loads(payload)