diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py
index 148f7b63..e250992f 100644
--- a/openllm-core/src/openllm_core/_configuration.py
+++ b/openllm-core/src/openllm_core/_configuration.py
@@ -234,6 +234,7 @@ class GenerationConfig(pydantic.BaseModel):
   logits_processors: t.Optional[t.List[LogitsProcessor]] = pydantic.Field(
     None, description='List of functions that modify logits based on previously generated tokens.'
   )
+  seed: t.Optional[int] = pydantic.Field(None, description='Random seed for generation.')
 
   def __getitem__(self, item: str) -> t.Any:
     if hasattr(self, item):
diff --git a/openllm-python/src/_openllm_tiny/_entrypoint.py b/openllm-python/src/_openllm_tiny/_entrypoint.py
index 887c685f..bbedc1f2 100644
--- a/openllm-python/src/_openllm_tiny/_entrypoint.py
+++ b/openllm-python/src/_openllm_tiny/_entrypoint.py
@@ -289,7 +289,7 @@ def construct_python_options(llm_config, llm_fs):
 
   # TODO: Add this line back once 0.5 is out, for now depends on OPENLLM_DEV_BUILD
   # packages = ['scipy', 'bentoml[tracing]>=1.2.8', 'openllm[vllm]>0.4', 'vllm>=0.3']
-  packages = ['scipy', 'bentoml[tracing]>=1.2.8', 'vllm>=0.3']
+  packages = ['scipy', 'bentoml[tracing]>=1.2.8', 'vllm>=0.3', 'flash-attn']
   if llm_config['requirements'] is not None:
     packages.extend(llm_config['requirements'])
   built_wheels = [build_editable(llm_fs.getsyspath('/'), p) for p in ('openllm_core', 'openllm_client', 'openllm')]
@@ -462,7 +462,7 @@ def build_command(
       labels=labels,
       models=models,
       envs=[
-        EnvironmentEntry(name='OPENLLM_CONFIG', value=llm_config.model_dump_json()),
+        EnvironmentEntry(name='OPENLLM_CONFIG', value=f"'{llm_config.model_dump_json()}'"),
         EnvironmentEntry(name='NVIDIA_DRIVER_CAPABILITIES', value='compute,utility'),
       ],
       description=f"OpenLLM service for {llm_config['start_name']}",
diff --git a/openllm-python/src/_openllm_tiny/_llm.py b/openllm-python/src/_openllm_tiny/_llm.py
index 541c82d7..8b6a0c6d 100644
--- a/openllm-python/src/_openllm_tiny/_llm.py
+++ b/openllm-python/src/_openllm_tiny/_llm.py
@@ -10,7 +10,7 @@ from openllm_core.utils import (
   dict_filter_none,
 )
 from openllm_core._typing_compat import LiteralQuantise, LiteralSerialisation, LiteralDtype
-from openllm_core._schemas import GenerationOutput, GenerationInput
+from openllm_core._schemas import GenerationOutput
 
 Dtype = t.Union[LiteralDtype, t.Literal['auto', 'half', 'float']]
 
@@ -149,7 +149,7 @@ class LLM:
   ) -> t.AsyncGenerator[RequestOutput, None]:
     from vllm import SamplingParams
 
-    config = self.config.generation_config.model_copy(update=dict_filter_none(attrs))
+    config = self.config.model_construct_env(**dict_filter_none(attrs))
 
     stop_token_ids = stop_token_ids or []
     eos_token_id = attrs.get('eos_token_id', config['eos_token_id'])
@@ -172,12 +172,14 @@ class LLM:
     top_p = 1.0 if config['temperature'] <= 1e-5 else config['top_p']
     config = config.model_copy(update=dict(stop=list(stop), stop_token_ids=stop_token_ids, top_p=top_p))
 
-    params = {k: getattr(config, k, None) for k in set(inspect.signature(SamplingParams).parameters.keys())}
-    sampling_params = SamplingParams(**{k: v for k, v in params.items() if v is not None})
-
     try:
       async for it in self._model.generate(
-        prompt, sampling_params=sampling_params, request_id=request_id, prompt_token_ids=prompt_token_ids
+        prompt,
+        sampling_params=SamplingParams(**{
+          k: config.__getitem__(k) for k in set(inspect.signature(SamplingParams).parameters.keys())
+        }),
+        request_id=request_id,
+        prompt_token_ids=prompt_token_ids,
       ):
         yield it
     except Exception as err:
@@ -191,15 +193,13 @@ class LLM:
     stop_token_ids: list[int] | None = None,
     request_id: str | None = None,
     adapter_name: str | None = None,
-    *,
-    _generated: GenerationInput | None = None,
     **attrs: t.Any,
   ) -> GenerationOutput:
     if stop is not None:
       attrs.update({'stop': stop})
     if stop_token_ids is not None:
       attrs.update({'stop_token_ids': stop_token_ids})
-    config = self.config.model_copy(update=attrs)
+    config = self.config.model_construct_env(**attrs)
     texts, token_ids = [[]] * config['n'], [[]] * config['n']
     async for result in self.generate_iterator(
       prompt,
diff --git a/openllm-python/src/_openllm_tiny/_service.py b/openllm-python/src/_openllm_tiny/_service.py
index 3a9c737c..0f4096f3 100644
--- a/openllm-python/src/_openllm_tiny/_service.py
+++ b/openllm-python/src/_openllm_tiny/_service.py
@@ -58,39 +58,31 @@ class LLMService:
   @core.utils.api(route='/v1/generate')
   async def generate_v1(
     self,
-    llm_config: t.Dict[str, t.Any] = pydantic.Field(default_factory=lambda: llm_config, description='LLM Config'),
     prompt: str = pydantic.Field(default='What is the meaning of life?', description='Given prompt to generate from'),
     prompt_token_ids: t.Optional[t.List[int]] = None,
     stop: t.Optional[t.List[str]] = None,
     stop_token_ids: t.Optional[t.List[int]] = None,
     request_id: t.Optional[str] = None,
+    llm_config: t.Dict[str, t.Any] = pydantic.Field(default=llm_config, description='LLM Config'),
   ) -> core.GenerationOutput:
+    llm_config.update(stop=stop, stop_token_ids=stop_token_ids)
     return await self.llm.generate(
-      prompt=prompt,
-      prompt_token_ids=prompt_token_ids,
-      llm_config=llm_config,
-      stop=stop,
-      stop_token_ids=stop_token_ids,
-      request_id=request_id,
+      prompt=prompt, prompt_token_ids=prompt_token_ids, request_id=request_id, **llm_config
     )
 
   @core.utils.api(route='/v1/generate_stream')
   async def generate_stream_v1(
     self,
-    llm_config: t.Dict[str, t.Any] = pydantic.Field(default_factory=lambda: llm_config, description='LLM Config'),
     prompt: str = pydantic.Field(default='What is the meaning of life?', description='Given prompt to generate from'),
     prompt_token_ids: t.Optional[t.List[int]] = None,
     stop: t.Optional[t.List[str]] = None,
     stop_token_ids: t.Optional[t.List[int]] = None,
     request_id: t.Optional[str] = None,
+    llm_config: t.Dict[str, t.Any] = pydantic.Field(default=llm_config, description='LLM Config'),
   ) -> t.AsyncGenerator[str, None]:
+    llm_config.update(stop=stop, stop_token_ids=stop_token_ids)
     async for generated in self.llm.generate_iterator(
-      prompt=prompt,
-      prompt_token_ids=prompt_token_ids,
-      llm_config=llm_config,
-      stop=stop,
-      stop_token_ids=stop_token_ids,
-      request_id=request_id,
+      prompt=prompt, prompt_token_ids=prompt_token_ids, request_id=request_id, **llm_config
     ):
       yield f'data: {core.GenerationOutput.from_vllm(generated).model_dump_json()}\n\n'
     yield 'data: [DONE]\n\n'
@@ -108,13 +100,15 @@ class LLMService:
   @core.utils.api(route='/v1/helpers/messages')
   def helpers_messages_v1(
     self,
-    message: Annotated[t.Dict[str, t.Any], MessagesConverterInput] = MessagesConverterInput(
-      add_generation_prompt=False,
-      messages=[
-        MessageParam(role='system', content='You are acting as Ernest Hemmingway.'),
-        MessageParam(role='user', content='Hi there!'),
-        MessageParam(role='assistant', content='Yes?'),
-      ],
+    message: Annotated[t.Dict[str, t.Any], MessagesConverterInput] = pydantic.Field(
+      default=MessagesConverterInput(
+        add_generation_prompt=False,
+        messages=[
+          MessageParam(role='system', content='You are acting as Ernest Hemmingway.'),
+          MessageParam(role='user', content='Hi there!'),
+          MessageParam(role='assistant', content='Yes?'),
+        ],
+      )
     ),
   ) -> str:
     return self.llm._tokenizer.apply_chat_template(
@@ -136,6 +130,7 @@ class LLMService:
         MessageParam(role='system', content='You are acting as Ernest Hemmingway.'),
         MessageParam(role='user', content='Hi there!'),
         MessageParam(role='assistant', content='Yes?'),
+        MessageParam(role='user', content='What is the meaning of life?'),
       ],
       model=core.utils.normalise_model_name(model_id),
      n=1,
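
For quick manual verification, a minimal client sketch follows (illustration only, not part of this diff). It calls the reworked `/v1/generate` endpoint and passes the new `seed` option through `llm_config`; the local address, port 3000, and use of `requests` are assumptions about a default local deployment, and the accepted `llm_config` keys depend on the model's `GenerationConfig`.

```python
# Hypothetical smoke test for the reworked /v1/generate endpoint.
# Assumes an OpenLLM server is already running on localhost:3000; the
# payload keys mirror the generate_v1 signature above, and 'seed' is the
# new GenerationConfig field introduced in this change.
import requests

payload = {
  'prompt': 'What is the meaning of life?',
  'stop': ['\n\n'],
  'llm_config': {'temperature': 0.7, 'seed': 42},
}
resp = requests.post('http://localhost:3000/v1/generate', json=payload, timeout=60)
resp.raise_for_status()
print(resp.json())  # serialised GenerationOutput
```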