diff --git a/local.sh b/local.sh
index f91b82ed..28d85d2c 100755
--- a/local.sh
+++ b/local.sh
@@ -104,9 +104,14 @@ else
   EXTENSIONS_STR=${EXTENSIONS_STR// /,} # Replace spaces with commas
 fi

-uv pip install --editable "$GIT_ROOT/openllm-python$EXTENSIONS_STR"
-uv pip install --editable "$GIT_ROOT/openllm-client"
-uv pip install --editable "$GIT_ROOT/openllm-core"
+PRERELEASE=${PRERELEASE:-false}
+
+ARGS=()
+[[ "${PRERELEASE}" == "true" ]] && ARGS+=("--prerelease=allow")
+
+uv pip install "${ARGS[@]}" --editable "$GIT_ROOT/openllm-python$EXTENSIONS_STR"
+uv pip install "${ARGS[@]}" --editable "$GIT_ROOT/openllm-client"
+uv pip install "${ARGS[@]}" --editable "$GIT_ROOT/openllm-core"

 echo "Instaling development dependencies..."
 uv pip install -r "$GIT_ROOT/tools/requirements.txt"
diff --git a/openllm-python/pyproject.toml b/openllm-python/pyproject.toml
index 944efdec..cdb24897 100644
--- a/openllm-python/pyproject.toml
+++ b/openllm-python/pyproject.toml
@@ -40,8 +40,8 @@ classifiers = [
 dependencies = [
   "bentoml[io]>=1.2",
   "transformers[torch,tokenizers]>=4.36.0",
-  "openllm-client>=0.5.0-alpha",
-  "openllm-core>=0.5.0-alpha",
+  "openllm-client>=0.4.44",
+  "openllm-core>=0.4.44",
   "safetensors",
   "optimum>=1.12.0",
   "accelerate",
diff --git a/openllm-python/src/_openllm_tiny/_entrypoint.py b/openllm-python/src/_openllm_tiny/_entrypoint.py
index ee292db2..f535687b 100644
--- a/openllm-python/src/_openllm_tiny/_entrypoint.py
+++ b/openllm-python/src/_openllm_tiny/_entrypoint.py
@@ -240,7 +240,7 @@ def start_command(
     bentomodel = bentoml.models.get(model_id.lower())
     model_id = bentomodel.path
   except (ValueError, bentoml.exceptions.NotFound):
-    pass
+    bentomodel = None
   config = transformers.AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
   for arch in config.architectures:
     if arch in openllm_core.AutoConfig._architecture_mappings:
@@ -250,6 +250,8 @@ def start_command(
     raise RuntimeError(f'Failed to determine config class for {model_id}')

   llm_config = openllm_core.AutoConfig.for_model(model_name).model_construct_env()
+  if serialisation is None:
+    serialisation = llm_config['serialisation']

   # TODO: support LoRA adapters
   os.environ.update({
diff --git a/openllm-python/src/_openllm_tiny/_llm.py b/openllm-python/src/_openllm_tiny/_llm.py
index e95b62d5..b6742f8b 100644
--- a/openllm-python/src/_openllm_tiny/_llm.py
+++ b/openllm-python/src/_openllm_tiny/_llm.py
@@ -11,7 +11,6 @@ from openllm_core.utils import (
 )
 from openllm_core._typing_compat import LiteralQuantise, LiteralSerialisation, LiteralDtype
 from openllm_core._schemas import GenerationOutput, GenerationInput
-from _bentoml_sdk.service import ServiceConfig

 Dtype = t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']]

@@ -39,7 +38,6 @@ class LLM:
   quantise: t.Optional[LiteralQuantise] = attr.field(default=None)
   trust_remote_code: bool = attr.field(default=False)
   engine_args: t.Dict[str, t.Any] = attr.field(factory=dict, validator=check_engine_args)
-  service_config: t.Optional[ServiceConfig] = attr.field(factory=dict)

   _path: str = attr.field(
     init=False,
@@ -106,7 +104,6 @@ class LLM:
     quantise: LiteralQuantise | None = None,
     trust_remote_code: bool = False,
     llm_config: openllm_core.LLMConfig | None = None,
-    service_config: ServiceConfig | None = None,
     **engine_args: t.Any,
   ) -> LLM:
     return cls(
@@ -119,7 +116,6 @@ class LLM:
       dtype=dtype,
       engine_args=engine_args,
       trust_remote_code=trust_remote_code,
-      service_config=service_config,
     )

   @property
diff --git a/openllm-python/src/_openllm_tiny/_service.py b/openllm-python/src/_openllm_tiny/_service.py
index cab3c140..86c1ebe0 100644
--- a/openllm-python/src/_openllm_tiny/_service.py
+++ b/openllm-python/src/_openllm_tiny/_service.py
@@ -5,7 +5,7 @@ from starlette.requests import Request
 from starlette.responses import JSONResponse, StreamingResponse
 import openllm, bentoml, logging, openllm_core as core
 import _service_vars as svars, typing as t
-from openllm_core._typing_compat import Annotated, Unpack
+from openllm_core._typing_compat import Annotated
 from openllm_core._schemas import MessageParam, MessagesConverterInput
 from openllm_core.protocol.openai import ModelCard, ModelList, ChatCompletionRequest
 from _openllm_tiny._helpers import OpenAI, Error
@@ -43,19 +43,48 @@ class LLMService:
       quantise=svars.quantise,
       llm_config=llm_config,
       trust_remote_code=svars.trust_remote_code,
-      services_config=svars.services_config,
       max_model_len=svars.max_model_len,
       gpu_memory_utilization=svars.gpu_memory_utilization,
     )
     self.openai = OpenAI(self.llm)

   @core.utils.api(route='/v1/generate')
-  async def generate_v1(self, **parameters: Unpack[core.GenerationInputDict]) -> core.GenerationOutput:
-    return await self.llm.generate(**GenerationInput.from_dict(parameters).model_dump())
+  async def generate_v1(
+    self,
+    llm_config: t.Dict[str, t.Any],
+    prompt: str = 'What is the meaning of life?',
+    prompt_token_ids: t.Optional[t.List[int]] = None,
+    stop: t.Optional[t.List[str]] = None,
+    stop_token_ids: t.Optional[t.List[int]] = None,
+    request_id: t.Optional[str] = None,
+  ) -> core.GenerationOutput:
+    return await self.llm.generate(
+      prompt=prompt,
+      prompt_token_ids=prompt_token_ids,
+      llm_config=llm_config,
+      stop=stop,
+      stop_token_ids=stop_token_ids,
+      request_id=request_id,
+    )

   @core.utils.api(route='/v1/generate_stream')
-  async def generate_stream_v1(self, **parameters: Unpack[core.GenerationInputDict]) -> t.AsyncGenerator[str, None]:
-    async for generated in self.llm.generate_iterator(**GenerationInput.from_dict(parameters).model_dump()):
+  async def generate_stream_v1(
+    self,
+    llm_config: t.Dict[str, t.Any],
+    prompt: str = 'What is the meaning of life?',
+    prompt_token_ids: t.Optional[t.List[int]] = None,
+    stop: t.Optional[t.List[str]] = None,
+    stop_token_ids: t.Optional[t.List[int]] = None,
+    request_id: t.Optional[str] = None,
+  ) -> t.AsyncGenerator[str, None]:
+    async for generated in self.llm.generate_iterator(
+      prompt=prompt,
+      prompt_token_ids=prompt_token_ids,
+      llm_config=llm_config,
+      stop=stop,
+      stop_token_ids=stop_token_ids,
+      request_id=request_id,
+    ):
       yield f'data: {generated.model_dump_json()}\n\n'
     yield 'data: [DONE]\n\n'
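
For reference, a minimal client sketch against the reworked /v1/generate route (not part of the patch): it assumes the service is running locally on http://localhost:3000 and that the endpoint accepts a JSON body keyed by the handler's parameters, as BentoML 1.2 JSON APIs normally do; the empty llm_config and the stop sequence are placeholder values.

import requests

# POST the new explicit fields rather than the previous **parameters payload.
resp = requests.post(
  'http://localhost:3000/v1/generate',  # assumed local serving address
  json={
    'prompt': 'What is the meaning of life?',
    'llm_config': {},  # per-request generation overrides; empty keeps server defaults
    'stop': ['\n\n'],  # placeholder stop sequence
  },
  timeout=300,
)
resp.raise_for_status()
print(resp.json())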