From d3328343d7f59c92737a777e83e21989c6cd4a0b Mon Sep 17 00:00:00 2001
From: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Date: Tue, 12 Dec 2023 01:33:13 -0500
Subject: [PATCH] feat: mixtral support (#770)

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
---
 openllm-python/pyproject.toml                 |  2 +-
 openllm-python/src/openllm/_service.py        | 15 +++++++--------
 openllm-python/src/openllm/bundle/_package.py |  4 ++--
 .../src/openllm/entrypoints/_openapi.py       |  2 ++
 openllm-python/src/openllm/protocol/openai.py |  1 +
 tools/dependencies.py                         |  2 +-
 6 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 6d89f7ba..7853262d 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -119,7 +119,7 @@ openai = ["openai[datalib]>=1", "tiktoken"]
 playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
 qwen = ["cpm-kernels", "tiktoken"]
 starcoder = ["bitsandbytes"]
-vllm = ["vllm>=0.2.4"]
+vllm = ["vllm>=0.2.4", "megablocks", "stanford-stk", "ray==2.6.0"]
 
 [tool.hatch.version]
 fallback-version = "0.0.0"
diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py
index 46c119d9..f99f4339 100644
--- a/openllm-python/src/openllm/_service.py
+++ b/openllm-python/src/openllm/_service.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 import logging, typing as t
-import _service_vars as svars
-import bentoml, openllm
+import bentoml, openllm, _service_vars as svars
 from openllm_core._schemas import MessageParam
 from bentoml.io import JSON, Text
 
@@ -16,23 +15,23 @@ llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
 @svc.api(
   route='/v1/generate',
-  input=JSON.from_sample(llm_model_class.examples()), output=JSON.from_sample(openllm.GenerationOutput.examples()), #
+  input=JSON.from_sample(llm_model_class.examples()),
+  output=JSON.from_sample(openllm.GenerationOutput.examples()),
 )
 async def generate_v1(input_dict: dict[str, t.Any]) -> dict[str, t.Any]:
   return (await llm.generate(**llm_model_class(**input_dict).model_dump())).model_dump()
 
 @svc.api(
   route='/v1/generate_stream',
-  input=JSON.from_sample(llm_model_class.examples()), output=Text(content_type='text/event-stream'), #
+  input=JSON.from_sample(llm_model_class.examples()),
+  output=Text(content_type='text/event-stream'),
 )
 async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]:
-  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()):
-    yield f'data: {it.model_dump_json()}\n\n'
+  async for it in llm.generate_iterator(**llm_model_class(**input_dict).model_dump()): yield f'data: {it.model_dump_json()}\n\n'
   yield 'data: [DONE]\n\n'
 
 _Metadata = openllm.MetadataOutput(
   timeout=llm.config['timeout'], model_name=llm.config['model_name'], #
-  backend=llm.__llm_backend__, model_id=llm.model_id, #
-  configuration=llm.config.model_dump_json().decode(),
+  backend=llm.__llm_backend__, model_id=llm.model_id, configuration=llm.config.model_dump_json().decode(), #
 )
 
 @svc.api(route='/v1/metadata', input=Text(), output=JSON.from_sample(_Metadata.model_dump()))
diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py
index e83f0a53..227ce50e 100644
--- a/openllm-python/src/openllm/bundle/_package.py
+++ b/openllm-python/src/openllm/bundle/_package.py
@@ -31,7 +31,7 @@ def build_editable(path, package='openllm'):
   raise RuntimeError('Please install OpenLLM from PyPI or built it from Git source.')
 def construct_python_options(llm, llm_fs, extra_dependencies=None, adapter_map=None):
   from . import RefResolver
-  packages = ['scipy', 'bentoml[tracing]>=1.1.10', 'vllm==0.2.4', 'ray==2.6.0', f'openllm>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
+  packages = ['scipy', 'bentoml[tracing]>=1.1.10', f'openllm[vllm]>={RefResolver.from_strategy("release").version}'] # apparently bnb misses this one
   if adapter_map is not None: packages += ['openllm[fine-tune]']
   if extra_dependencies is not None: packages += [f'openllm[{k}]' for k in extra_dependencies]
   if llm.config['requirements'] is not None: packages.extend(llm.config['requirements'])
@@ -50,7 +50,7 @@ def construct_docker_options(llm, _, quantize, adapter_map, dockerfile_template,
   environ['OPENLLM_CONFIG'] = f"'{environ['OPENLLM_CONFIG']}'"
   environ.pop('BENTOML_HOME', None) # NOTE: irrelevant in container
   environ['NVIDIA_DRIVER_CAPABILITIES'] = 'compute,utility'
-  return DockerOptions(cuda_version='12.1', env=environ, dockerfile_template=dockerfile_template)
+  return DockerOptions(cuda_version='12.1', python_version='3.11', env=environ, dockerfile_template=dockerfile_template)
 @inject
 def create_bento(
   bento_tag, llm_fs, llm, #
diff --git a/openllm-python/src/openllm/entrypoints/_openapi.py b/openllm-python/src/openllm/entrypoints/_openapi.py
index db1abf36..05d5111b 100644
--- a/openllm-python/src/openllm/entrypoints/_openapi.py
+++ b/openllm-python/src/openllm/entrypoints/_openapi.py
@@ -73,6 +73,7 @@ requestBody:
           stream: false
           chat_template: __chat_template__
           add_generation_prompt: __add_generation_prompt__
+          echo: false
       streaming:
         summary: Streaming input example
         value:
@@ -92,6 +93,7 @@ requestBody:
             - "<|endoftext|>"
           chat_template: __chat_template__
           add_generation_prompt: __add_generation_prompt__
+          echo: false
   schema:
     $ref: '#/components/schemas/ChatCompletionRequest'
 responses:
diff --git a/openllm-python/src/openllm/protocol/openai.py b/openllm-python/src/openllm/protocol/openai.py
index b9b1b422..390f437f 100644
--- a/openllm-python/src/openllm/protocol/openai.py
+++ b/openllm-python/src/openllm/protocol/openai.py
@@ -57,6 +57,7 @@ class ChatCompletionRequest:
   max_tokens: t.Optional[int] = attr.field(default=None)
   presence_penalty: t.Optional[float] = attr.field(default=None)
   frequency_penalty: t.Optional[float] = attr.field(default=None)
+  echo: t.Optional[bool] = attr.field(default=False)
   logit_bias: t.Optional[t.Dict[str, float]] = attr.field(default=None)
   user: t.Optional[str] = attr.field(default=None)
   # supported by vLLM and us
diff --git a/tools/dependencies.py b/tools/dependencies.py
index 74e640bd..e8312b61 100755
--- a/tools/dependencies.py
+++ b/tools/dependencies.py
@@ -155,7 +155,7 @@ GGML_DEPS = ['ctransformers']
 CTRANSLATE_DEPS = ['ctranslate2>=3.22.0']
 AWQ_DEPS = ['autoawq']
 GPTQ_DEPS = ['auto-gptq[triton]>=0.4.2']
-VLLM_DEPS = ['vllm>=0.2.4']
+VLLM_DEPS = ['vllm>=0.2.4', 'megablocks', 'stanford-stk', 'ray==2.6.0']
 
 _base_requirements: dict[str, t.Any] = {
   inflection.dasherize(name): config_cls.__openllm_requirements__
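
Below the patch, a minimal client sketch of the SSE framing that generate_stream_v1 now emits: one 'data: <GenerationOutput JSON>' event per chunk, then a 'data: [DONE]' sentinel. The server address, the requests dependency, the prompt text, and the llm_config keys are illustrative assumptions; only the event framing and the /v1/generate_stream route come from the patch itself.

import json

import requests  # assumed client-side dependency, not part of the patch

ADDR = 'http://localhost:3000'  # assumed default `openllm start` address

# Per-request overrides go under llm_config; max_new_tokens is an
# illustrative key, not taken from the diff.
payload = {'prompt': 'What is a mixture-of-experts model?', 'llm_config': {'max_new_tokens': 128}}

with requests.post(f'{ADDR}/v1/generate_stream', json=payload, stream=True) as resp:
  resp.raise_for_status()
  for raw in resp.iter_lines():
    if not raw:
      continue  # SSE events are separated by blank lines
    line = raw.decode('utf-8')
    if not line.startswith('data: '):
      continue
    data = line[len('data: '):]
    if data == '[DONE]':
      break  # sentinel yielded after the generator finishes
    print(json.loads(data))  # one GenerationOutput dump per event

The same framing applies to the OpenAI-compatible chat route, where the new echo field rides along in the ChatCompletionRequest body and defaults to false, matching the attrs definition above.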