chore(runner): yield the outputs directly (#573)

Update OpenAI client examples to the openai>=1.0 SDK

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-11-07 22:34:11 -05:00
committed by GitHub
parent b3c4e204b2
commit cfd09bfc47
18 changed files with 311 additions and 190 deletions

View File

@@ -128,7 +128,7 @@ async def create_chat_completions(req: Request, llm: openllm.LLM[M, T]) -> Respo
model_name, request_id = request.model, gen_random_uuid('chatcmpl')
created_time = int(time.monotonic())
prompt = llm.tokenizer.apply_chat_template(request.messages, tokenize=False)
prompt = llm.tokenizer.apply_chat_template(request.messages, tokenize=False, add_generation_prompt=llm.config['add_generation_prompt'])
logger.debug('Prompt: %r', prompt)
config = llm.config.with_openai_request(request)