diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index 0263f21b..8ebe0e6a 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -38,7 +38,7 @@ _JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config
 @svc.api(route='/v1/generate', input=_JsonInput, output=bentoml.io.JSON.from_sample({'responses': [], 'configuration': llm_config.model_dump(flatten=True)}))
 async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerateOutput:
   echo = input_dict.pop('echo', False)
-  qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
+  qa_inputs = openllm.GenerateInput.from_llm_config(llm_config)(**input_dict)
   config = qa_inputs.llm_config.model_dump()
   if runner.backend == 'vllm':
     async for output in runner.vllm_generate.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, request_id=openllm_core.utils.gen_random_uuid(), **config):
@@ -51,7 +51,7 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerateOutput:
 @svc.api(route='/v1/generate_stream', input=_JsonInput, output=bentoml.io.Text(content_type='text/event-stream'))
 async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
   echo = input_dict.pop('echo', False)
-  qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict)
+  qa_inputs = openllm.GenerateInput.from_llm_config(llm_config)(**input_dict)
   if runner.backend == 'vllm':
     return runner.vllm_generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name,
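
For reference, a minimal client sketch against the /v1/generate endpoint touched by this diff. Only the route, the 'prompt'/'llm_config'/'echo' request keys, and the 'responses'/'configuration' response keys come from the service code above; the host/port, example prompt, and use of the requests library are illustrative assumptions, not part of the change.

import requests  # assumption: any HTTP client works; requests is used here for brevity

payload = {
  'prompt': 'What is the capital of France?',  # illustrative prompt
  'llm_config': {},                            # optional per-request generation-config overrides
  'echo': False,                               # popped server-side before GenerateInput is built
}
resp = requests.post('http://localhost:3000/v1/generate', json=payload, timeout=60)  # host/port assumed
resp.raise_for_status()
out = resp.json()
print(out['responses'])      # list of generated responses
print(out['configuration'])  # flattened llm_config used for this request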