From ef45ca611e1e8eee77ea687035ddfb97dc33d991 Mon Sep 17 00:00:00 2001
From: bojiang
Date: Wed, 3 Jul 2024 20:28:46 +0800
Subject: [PATCH] refactor: use openai client in run

---
 openllm_next/local.py | 17 +++++++++++++----
 pyproject.toml        |  1 +
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/openllm_next/local.py b/openllm_next/local.py
index e68336c8..c0dd0f5a 100644
--- a/openllm_next/local.py
+++ b/openllm_next/local.py
@@ -46,7 +46,6 @@ async def _run_model(
         venv=venv,
         silent=False,
     ) as server_proc:
-        import bentoml
 
         output(f"Model server started {server_proc.pid}")
 
@@ -82,8 +81,12 @@ async def _run_model(
             stderr_streamer.cancel()
 
         output("Model is ready", style="green")
-        messages = []
-        client = bentoml.AsyncHTTPClient(f"http://localhost:{port}", timeout=timeout)
+        messages: list[dict[str, str]] = []
+
+        from openai import AsyncOpenAI
+
+        client = AsyncOpenAI(base_url=f"http://localhost:{port}/v1")
+        model_id = (await client.models.list()).data[0].id
         while True:
             try:
                 message = input("user: ")
@@ -93,7 +96,13 @@ async def _run_model(
             messages.append(dict(role="user", content=message))
             output("assistant: ", end="", style="lightgreen")
             assistant_message = ""
-            async for text in client.chat(messages=messages):  # type: ignore
+            stream = await client.chat.completions.create(
+                model=model_id,
+                messages=messages,  # type: ignore
+                stream=True,
+            )
+            async for chunk in stream:
+                text = chunk.choices[0].delta.content or ""
                 assistant_message += text
                 output(text, end="", style="lightgreen")
             messages.append(dict(role="assistant", content=assistant_message))
diff --git a/pyproject.toml b/pyproject.toml
index 341cd4dd..95968631 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "dulwich",
     "tabulate",
     "uv",
+    "openai==1.35.9",
 ]
 
 [project.scripts]