| """Hermes API 服务 - 兼容 OpenAI Chat Completions 格式""" import json import time import uuid from typing import Optional
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from llama_cpp import Llama

MODEL_PATH = "./Hermes-3-Llama-3.1-8B-Q5_K_M.gguf"
MODEL_NAME = "hermes-3-8b"

app = FastAPI(title="Hermes API", version="1.0.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the GGUF model once at startup; n_gpu_layers=-1 offloads all layers to the GPU,
# and chat_format="chatml" matches the Hermes prompt template.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=8192,
    n_gpu_layers=-1,
    chat_format="chatml",
    verbose=False,
)

class Message(BaseModel):
    role: str
    content: str

class ChatRequest(BaseModel):
    model: str = MODEL_NAME
    messages: list[Message]
    temperature: float = Field(default=0.7, ge=0, le=2)
    top_p: float = Field(default=0.9, ge=0, le=1)
    max_tokens: int = Field(default=1024, ge=1, le=8192)
    stream: bool = False
    stop: Optional[list[str]] = None

@app.get("/v1/models") async def list_models(): return {"object": "list", "data": [{"id": MODEL_NAME, "object": "model", "created": int(time.time()), "owned_by": "local"}]}
@app.post("/v1/chat/completions") async def chat_completions(request: ChatRequest): try: messages = [{"role": m.role, "content": m.content} for m in request.messages]
        if request.stream:
            return EventSourceResponse(
                stream_response(messages, request),
                media_type="text/event-stream",
            )
        response = llm.create_chat_completion(
            messages=messages,
            temperature=request.temperature,
            top_p=request.top_p,
            max_tokens=request.max_tokens,
            stop=request.stop,
        )
return { "id": f"chatcmpl-{uuid.uuid4().hex[:8]}", "object": "chat.completion", "created": int(time.time()), "model": request.model, "choices": [{ "index": 0, "message": {"role": "assistant", "content": response["choices"][0]["message"]["content"]}, "finish_reason": response["choices"][0].get("finish_reason", "stop") }], "usage": response["usage"] } except Exception as e: raise HTTPException(status_code=500, detail=str(e))
async def stream_response(messages, request):
    # Re-emit llama.cpp's streaming chunks as OpenAI-style SSE events, ending with [DONE].
    completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
    stream = llm.create_chat_completion(
        messages=messages,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens,
        stop=request.stop,
        stream=True,
    )
    for chunk in stream:
        delta = chunk["choices"][0].get("delta", {})
        finish_reason = chunk["choices"][0].get("finish_reason")
        data = {
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": request.model,
            "choices": [{"index": 0, "delta": delta, "finish_reason": finish_reason}],
        }
        yield {"data": json.dumps(data, ensure_ascii=False)}
    yield {"data": "[DONE]"}

@app.get("/health") async def health_check(): return {"status": "ok", "model": MODEL_NAME}
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
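
# A minimal client sketch (not part of the server): assuming the server is running on
# localhost:8000, any OpenAI-compatible client can point its base_url at /v1. The
# "not-needed" API key below is illustrative; this server does no authentication.
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")
#   resp = client.chat.completions.create(
#       model="hermes-3-8b",
#       messages=[{"role": "user", "content": "Hello!"}],
#   )
#   print(resp.choices[0].message.content)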