chore(runner): yield the outputs directly (#573)

update openai client examples to the openai>=1.0 API

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Authored by Aaron Pham on 2023-11-07 22:34:11 -05:00, committed by GitHub
parent b3c4e204b2, commit cfd09bfc47
18 changed files with 311 additions and 190 deletions


@@ -353,15 +353,13 @@ class LLM(t.Generic[M, T]):
     if request_id is None: request_id = openllm_core.utils.gen_random_uuid()
     previous_texts, previous_num_tokens = [''] * config['n'], [0] * config['n']
     async for out in self.runner.generate_iterator.async_stream(prompt_token_ids, request_id, stop, adapter_name, **config.model_dump(flatten=True)):
-      generated = GenerationOutput.from_sse(out).with_options(prompt=prompt)
+      generated = GenerationOutput.from_runner(out).with_options(prompt=prompt)
       delta_outputs = t.cast(t.List[CompletionChunk], [None] * len(generated.outputs))
       if generated.finished: break
       for output in generated.outputs:
         i = output.index
-        delta_tokens = output.token_ids[previous_num_tokens[i]:]
-        delta_text = output.text[len(previous_texts[i]):]
-        previous_texts[i] = output.text
-        previous_num_tokens[i] = len(output.token_ids)
+        delta_tokens, delta_text = output.token_ids[previous_num_tokens[i]:], output.text[len(previous_texts[i]):]
+        previous_texts[i], previous_num_tokens[i] = output.text, len(output.token_ids)
         delta_outputs[i] = output.with_options(text=delta_text, token_ids=delta_tokens)
       yield generated.with_options(outputs=delta_outputs)
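
For context, this loop turns the cumulative outputs carried by each streamed chunk into per-chunk deltas for every one of the config['n'] parallel completions. A minimal standalone sketch of that bookkeeping; the _Output dataclass and compute_deltas helper below are illustrative stand-ins, not part of this commit:

from dataclasses import dataclass
from typing import List

@dataclass
class _Output:
  # Illustrative stand-in for an OpenLLM completion output; text/token_ids are cumulative.
  index: int
  text: str
  token_ids: List[int]

def compute_deltas(outputs, previous_texts, previous_num_tokens):
  # Return only the text/tokens generated since the last chunk, per output index.
  deltas = [None] * len(outputs)
  for output in outputs:
    i = output.index
    delta_text, delta_tokens = output.text[len(previous_texts[i]):], output.token_ids[previous_num_tokens[i]:]
    previous_texts[i], previous_num_tokens[i] = output.text, len(output.token_ids)
    deltas[i] = _Output(index=i, text=delta_text, token_ids=delta_tokens)
  return deltas

# Two chunks of a single (n=1) stream: cumulative 'Hel' then 'Hello'.
texts, counts = [''], [0]
print(compute_deltas([_Output(0, 'Hel', [1, 2])], texts, counts)[0].text)       # -> 'Hel'
print(compute_deltas([_Output(0, 'Hello', [1, 2, 3])], texts, counts)[0].text)  # -> 'lo'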


@@ -80,7 +80,7 @@ class vLLMRunnable(bentoml.Runnable):
     async for request_output in self.model.generate(None, sampling_params, request_id, prompt_token_ids):
       # XXX: Need to write a hook for serialisation None correctly
       if request_output.prompt_logprobs is not None: request_output.prompt_logprobs = [it if it else {} for it in request_output.prompt_logprobs]
-      yield f'data: {GenerationOutput.from_vllm(request_output).model_dump_json()}\n\n'
+      yield GenerationOutput.from_vllm(request_output).model_dump_json()
 
 class PyTorchRunnable(bentoml.Runnable):
   SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu')
@@ -101,7 +101,7 @@ class PyTorchRunnable(bentoml.Runnable):
       **attrs: t.Any) -> t.AsyncGenerator[str, None]:
     if adapter_name is not None: self.model.set_adapter(adapter_name)
     async for generation_output in self.forward(prompt_token_ids, request_id, stop=stop, **attrs):
-      yield f'data: {generation_output.model_dump_json()}\n\n'
+      yield generation_output.model_dump_json()
 
   async def forward(self, prompt_token_ids: list[int], request_id: str, stop: str | t.Iterable[str] | None = None, **attrs: t.Any) -> t.AsyncGenerator[GenerationOutput, None]:
     from ._generation import is_partial_stop
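
In other words, the runnables now yield the serialized GenerationOutput JSON directly, and the consumer in LLM.generate_iterator parses it with GenerationOutput.from_runner (first hunk above); the `data: ...\n\n` SSE framing is left to whatever layer actually serves the HTTP stream. A rough sketch of the assumed contract, with plain dicts standing in for GenerationOutput:

import asyncio, json

async def runner_stream(steps):
  # Producer (runnable side): yield one plain JSON payload per generation step.
  # Previously each payload was pre-wrapped as an SSE frame: f'data: {payload}\n\n'.
  for step in steps:
    yield json.dumps(step)

def to_sse(json_payload: str) -> str:
  # SSE framing, when a client needs it, now happens at the HTTP layer instead.
  return f'data: {json_payload}\n\n'

async def consume() -> None:
  # Consumer (LLM side): parse each payload directly, roughly what
  # GenerationOutput.from_runner is assumed to do with the model_dump_json() output.
  async for raw in runner_stream([{'text': 'He', 'finished': False}, {'text': 'Hello', 'finished': True}]):
    out = json.loads(raw)
    print(out['text'], out['finished'])

asyncio.run(consume())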


@@ -128,7 +128,7 @@ async def create_chat_completions(req: Request, llm: openllm.LLM[M, T]) -> Respo
   model_name, request_id = request.model, gen_random_uuid('chatcmpl')
   created_time = int(time.monotonic())
-  prompt = llm.tokenizer.apply_chat_template(request.messages, tokenize=False)
+  prompt = llm.tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=llm.config['add_generation_prompt'])
   logger.debug('Prompt: %r', prompt)
   config = llm.config.with_openai_request(request)
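
For reference, add_generation_prompt asks the tokenizer's chat template to append the assistant-turn prefix so the model continues as the assistant rather than extending the user turn. A quick standalone sketch with transformers; the model id is only an example of a model that ships a chat template:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
messages = [{'role': 'user', 'content': 'What does OpenLLM do?'}]

# Without the flag the rendered prompt ends after the user turn;
# with it the template appends the assistant prefix (e.g. '<|assistant|>').
print(tokenizer.apply_chat_template(messages, tokenize=False))
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))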


@@ -40,8 +40,3 @@ async def main() -> int:
 def _mp_fn(index: t.Any): # type: ignore
   # For xla_spawn (TPUs)
   asyncio.run(main())
-
-if openllm.utils.in_notebook():
-  await main()
-else:
-  raise SystemExit(asyncio.run(main()))
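
Finally, with the example scripts moving to the openai>=1 client, a minimal async example against an OpenLLM server's OpenAI-compatible endpoint might look like the sketch below; the base_url, api_key, and served model are assumptions for a local server, not taken from this commit:

import asyncio
import openai  # openai>=1

# Assumed local OpenLLM server exposing the OpenAI-compatible /v1 routes.
client = openai.AsyncOpenAI(base_url='http://localhost:3000/v1', api_key='na')

async def main() -> int:
  model = (await client.models.list()).data[0].id  # whichever model the server is running
  stream = await client.chat.completions.create(
    model=model,
    messages=[{'role': 'user', 'content': 'Write a haiku about token streaming.'}],
    stream=True,
  )
  async for chunk in stream:
    print(chunk.choices[0].delta.content or '', end='', flush=True)
  print()
  return 0

if __name__ == '__main__':
  raise SystemExit(asyncio.run(main()))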