perf(build): locking and improve build speed (#669)

* revert(build): not locking packages

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

* perf: improve svars generation and unify envvar parsing (a sketch of the generated module follows this message)

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

* docs: update changelog

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

* chore: update stubs check for mypy

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>

---------

Signed-off-by: Aaron Pham <29749331+aarnphm@users.noreply.github.com>
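
For context, here is a minimal sketch of what the generated _service_vars.py module consumed by the service diff below might look like. The attribute names mirror the ones referenced in the diff; the concrete values and the flat-module layout are assumptions for illustration, not the actual build output.

# _service_vars.py -- hypothetical example of the module a build step could emit,
# so _service.py no longer parses environment variables at import time.
# Every value here is a placeholder; the real file would be written during bento build.
model_id = 'mistralai/Mistral-7B-Instruct-v0.1'  # placeholder model id
model_tag = None                                 # resolved BentoML model tag, if any
prompt_template = None                           # optional prompt template string
system_message = None                            # optional system message string
serialization = 'safetensors'                    # serialisation format baked in at build time
adapter_map = {}                                 # adapter name -> adapter id mapping, already decoded
trust_remote_code = False                        # boolean resolved once, no env parsing at runtime

Baking these values in at build time removes the per-import os.getenv and orjson.loads round trips shown as removed lines in the diff below.
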

commit 8fdfd0491f (parent fce8f223f3)
Author: Aaron Pham
Date:   2023-11-16 06:27:45 -05:00
Committed by: GitHub

9 changed files with 138 additions and 74 deletions


@@ -1,11 +1,8 @@
 # mypy: disable-error-code="call-arg,misc,attr-defined,type-abstract,type-arg,valid-type,arg-type"
 from __future__ import annotations
 import logging
-import os
 import typing as t
 import _service_vars as svars
-import orjson
 import bentoml
 import openllm
@@ -14,18 +11,17 @@ from bentoml.io import JSON, Text
 logger = logging.getLogger(__name__)
 llm = openllm.LLM[t.Any, t.Any](
-  svars.model_id,
+  model_id=svars.model_id,
   model_tag=svars.model_tag,
-  prompt_template=openllm.utils.first_not_none(os.getenv('OPENLLM_PROMPT_TEMPLATE'), None),
-  system_message=openllm.utils.first_not_none(os.getenv('OPENLLM_SYSTEM_MESSAGE'), None),
-  serialisation=openllm.utils.first_not_none(os.getenv('OPENLLM_SERIALIZATION'), 'safetensors'),
-  adapter_map=orjson.loads(svars.adapter_map),
-  trust_remote_code=openllm.utils.check_bool_env('TRUST_REMOTE_CODE', default=False),
+  prompt_template=svars.prompt_template,
+  system_message=svars.system_message,
+  serialisation=svars.serialization,
+  adapter_map=svars.adapter_map,
+  trust_remote_code=svars.trust_remote_code,
 )
-llm_config = llm.config
-svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=[llm.runner])
+svc = bentoml.Service(name=f"llm-{llm.config['start_name']}-service", runners=[llm.runner])
-llm_model_class = openllm.GenerationInput.from_llm_config(llm_config)
+llm_model_class = openllm.GenerationInput.from_llm_config(llm.config)
 @svc.api(
@@ -49,11 +45,11 @@ async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[s
 _Metadata = openllm.MetadataOutput(
-  timeout=llm_config['timeout'],
-  model_name=llm_config['model_name'],
+  timeout=llm.config['timeout'],
+  model_name=llm.config['model_name'],
   backend=llm.__llm_backend__,
   model_id=llm.model_id,
-  configuration=llm_config.model_dump_json().decode(),
+  configuration=llm.config.model_dump_json().decode(),
   prompt_template=llm.runner.prompt_template,
   system_message=llm.runner.system_message,
 )
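
To illustrate the "unify envvar parsing" part of the change, here is a hedged sketch of how a build step could resolve the relevant environment variables once and write the generated module used above. The file, function, and template names are hypothetical and not taken from the OpenLLM codebase; only the environment variable names mirror the ones removed from the service in this diff.

# build_service_vars.py -- hypothetical build-time generator for _service_vars.py.
from __future__ import annotations

import os
import orjson

_TEMPLATE = '''\
model_id = {model_id!r}
model_tag = {model_tag!r}
prompt_template = {prompt_template!r}
system_message = {system_message!r}
serialization = {serialization!r}
adapter_map = {adapter_map!r}
trust_remote_code = {trust_remote_code!r}
'''

def write_service_vars(path: str, model_id: str, model_tag: str | None = None, adapter_map: str | None = None) -> None:
  # Resolve every environment variable once, at build time, instead of at service import time.
  content = _TEMPLATE.format(
    model_id=model_id,
    model_tag=model_tag,
    prompt_template=os.getenv('OPENLLM_PROMPT_TEMPLATE'),
    system_message=os.getenv('OPENLLM_SYSTEM_MESSAGE'),
    serialization=os.getenv('OPENLLM_SERIALIZATION', 'safetensors'),
    adapter_map=orjson.loads(adapter_map) if adapter_map else {},  # decode the JSON adapter map once
    trust_remote_code=os.getenv('TRUST_REMOTE_CODE', 'false').lower() in ('1', 'true', 'yes'),
  )
  with open(path, 'w') as f:
    f.write(content)

With this shape, the service module only imports plain Python constants, so service startup stays free of environment parsing and JSON decoding.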