diff --git a/openllm-python/src/_openllm_tiny/Dockerfile.j2 b/openllm-python/src/_openllm_tiny/Dockerfile.j2 new file mode 100644 index 00000000..f2dbdc3d --- /dev/null +++ b/openllm-python/src/_openllm_tiny/Dockerfile.j2 @@ -0,0 +1,4 @@ +{% extends bento_base_template %} +{% block SETUP_BENTO_BASE_IMAGE %} +{{ super() }} +{% endblock %} diff --git a/openllm-python/src/_openllm_tiny/_entrypoint.py b/openllm-python/src/_openllm_tiny/_entrypoint.py index 55570cd1..f4b57101 100644 --- a/openllm-python/src/_openllm_tiny/_entrypoint.py +++ b/openllm-python/src/_openllm_tiny/_entrypoint.py @@ -51,12 +51,6 @@ max_model_len=orjson.loads(coreutils.getenv('max_model_len', default=orjson.dump gpu_memory_utilization=orjson.loads(coreutils.getenv('gpu_memory_utilization', default=orjson.dumps({__gpu_memory_utilization__}), var=['GPU_MEMORY_UTILISATION'])) services_config=orjson.loads(coreutils.getenv('services_config',"""{__services_config__}""")) ''' -_DOCKERFILE_TEMPLATE = """\ -{% extends bento_base_template %} -{% block SETUP_BENTO_BASE_IMAGE %} -{{ super() }} -{% endblock %} -""" class ItemState(enum.Enum): @@ -235,6 +229,7 @@ def start_command( """ import transformers + from _bentoml_impl.server import serve_http from bentoml._internal.service.loader import load from bentoml._internal.log import configure_server_logging @@ -284,9 +279,8 @@ def start_command( working_dir = os.path.abspath(os.path.dirname(__file__)) if sys.path[0] != working_dir: sys.path.insert(0, working_dir) - load('.', working_dir=working_dir).serve_http( - working_dir=working_dir, reload=check_bool_env('RELOAD', default=False), development_mode=DEBUG - ) + load('.', working_dir=working_dir).inject_config() + serve_http('.', working_dir=working_dir, reload=check_bool_env('RELOAD', default=False), development_mode=DEBUG) def construct_python_options(llm_config, llm_fs):