diff --git a/changelog.d/618.feature.md b/changelog.d/618.feature.md
new file mode 100644
index 00000000..bf1807b2
--- /dev/null
+++ b/changelog.d/618.feature.md
@@ -0,0 +1,14 @@
+Certain warnings can now be disabled with `OPENLLM_DISABLE_WARNING=True` in the environment.
+
+`openllm.LLM` now also brings an `embedded` mode. By default this is False. If `embedded=True`, then
+the model will be loaded eagerly. This should only be used during development.
+
+```python
+
+import openllm
+
+llm = openllm.LLM('HuggingFaceH4/zephyr-7b-beta', backend='vllm', embedded=True)
+```
+
+The default behaviour of loading the model the first time `llm.generate` or `llm.generate_iterator` is called is unchanged.
+The `embedded` option is mainly for backward compatibility and a more explicit definition.
diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py
index ecaeb7fb..8bb93ea0 100644
--- a/openllm-core/src/openllm_core/_typing_compat.py
+++ b/openllm-core/src/openllm_core/_typing_compat.py
@@ -13,9 +13,14 @@ if t.TYPE_CHECKING:
   from transformers import PreTrainedTokenizerFast
 
   from .utils.lazy import VersionInfo
+else:
+  # NOTE: t.Any is also a type
+  PeftModel = PreTrainedModel = PreTrainedTokenizer = PreTrainedTokenizerBase = PreTrainedTokenizerFast = t.Any
+  # NOTE: that VersionInfo is from openllm.utils.lazy.VersionInfo
+  VersionInfo = t.Any
 
-M = t.TypeVar('M', bound='t.Union[PreTrainedModel, PeftModel]')
-T = t.TypeVar('T', bound='t.Union[PreTrainedTokenizerFast, PreTrainedTokenizer, PreTrainedTokenizerBase]')
+M = t.TypeVar('M', bound=t.Union[PreTrainedModel, PeftModel])
+T = t.TypeVar('T', bound=t.Union[PreTrainedTokenizerFast, PreTrainedTokenizer, PreTrainedTokenizerBase])
 
 
 def get_literal_args(typ: t.Any) -> tuple[str, ...]:
diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py
index da04f2f2..71bd72da 100644
--- a/openllm-core/src/openllm_core/utils/__init__.py
+++ b/openllm-core/src/openllm_core/utils/__init__.py
@@ -120,6 +120,7 @@ def set_debug_mode(enabled: bool, level: int = 1) -> None:
     os.environ[DEV_DEBUG_VAR] = str(level)
   os.environ[DEBUG_ENV_VAR] = str(enabled)
   os.environ[_GRPC_DEBUG_ENV_VAR] = 'DEBUG' if enabled else 'ERROR'
+  set_disable_warnings(enabled)
 
 
 def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.Any], ...] | None) -> bool:
@@ -188,6 +189,21 @@ def set_quiet_mode(enabled: bool) -> None:
   # do not log setting quiet mode
   os.environ[QUIET_ENV_VAR] = str(enabled)
   os.environ[_GRPC_DEBUG_ENV_VAR] = 'NONE'
+  set_disable_warnings(enabled)
+
+
+WARNING_ENV_VAR = 'OPENLLM_DISABLE_WARNING'
+
+
+def get_disable_warnings() -> bool:
+  if get_debug_mode():
+    return False
+  return check_bool_env(WARNING_ENV_VAR, False)
+
+
+def set_disable_warnings(disable: bool = True) -> None:
+  if disable:  # act on the argument, not the current state; never overwrite a user-set 'True' with 'False'
+    os.environ[WARNING_ENV_VAR] = str(disable)
 
 
 class ExceptionFilter(logging.Filter):
diff --git a/openllm-python/src/openllm/_deprecated.py b/openllm-python/src/openllm/_deprecated.py
index 104f8b16..0488b7b3 100644
--- a/openllm-python/src/openllm/_deprecated.py
+++ b/openllm-python/src/openllm/_deprecated.py
@@ -87,9 +87,7 @@ def Runner(
   )
 
   backend = t.cast(LiteralBackend, first_not_none(backend, default='vllm' if is_vllm_available() else 'pt'))
-  llm = LLM[t.Any, t.Any](backend=backend, llm_config=llm_config, **attrs)
-  if init_local:
-    llm.runner.init_local(quiet=True)
+  llm = LLM[t.Any, t.Any](backend=backend, llm_config=llm_config, embedded=init_local, **attrs)
   return llm.runner
 
 
diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py
index abf1aee8..1d800d16 100644
--- a/openllm-python/src/openllm/_llm.py
+++ b/openllm-python/src/openllm/_llm.py
@@ -34,7 +34,6 @@ from openllm_core._typing_compat import TupleAny
 from openllm_core.exceptions import MissingDependencyError
 from openllm_core.prompts import PromptTemplate
 from openllm_core.utils import DEBUG
-from openllm_core.utils import LazyLoader
 from openllm_core.utils import ReprMixin
 from openllm_core.utils import apply
 from openllm_core.utils import check_bool_env
@@ -43,6 +42,9 @@ from openllm_core.utils import converter
 from openllm_core.utils import first_not_none
 from openllm_core.utils import flatten_attrs
 from openllm_core.utils import generate_hash_from_file
+from openllm_core.utils import get_debug_mode
+from openllm_core.utils import get_disable_warnings
+from openllm_core.utils import get_quiet_mode
 from openllm_core.utils import is_peft_available
 from openllm_core.utils import resolve_filepath
 from openllm_core.utils import validate_is_path
@@ -55,9 +57,13 @@ from .serialisation.constants import PEFT_CONFIG_NAME
 
 
 if t.TYPE_CHECKING:
-  import peft
   import transformers
 
+  from peft.config import PeftConfig
+  from peft.peft_model import PeftModel
+  from peft.peft_model import PeftModelForCausalLM
+  from peft.peft_model import PeftModelForSeq2SeqLM
+
   from bentoml._internal.runner.runnable import RunnableMethod
   from bentoml._internal.runner.runner import RunnerMethod
   from bentoml._internal.runner.runner_handle import RunnerHandle
@@ -65,10 +71,7 @@ if t.TYPE_CHECKING:
   from openllm_core._configuration import LLMConfig
   from openllm_core.utils.representation import ReprArgs
 
-else:
-  peft = LazyLoader('peft', globals(), 'peft')
-
-ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['peft.PeftConfig', str]]]
+ResolvedAdapterMap = t.Dict[AdapterType, t.Dict[str, t.Tuple['PeftConfig', str]]]
 
 P = ParamSpec('P')
 
@@ -159,6 +162,7 @@ class LLM(t.Generic[M, T], ReprMixin):
     adapter_map: dict[str, str] | None = None,
     serialisation: LiteralSerialisation = 'safetensors',
     trust_remote_code: bool = False,
+    embedded: bool = False,
     **attrs: t.Any,
   ):
     # low_cpu_mem_usage is only available for model this is helpful on system with low memory to avoid OOM
@@ -215,6 +219,14 @@ class LLM(t.Generic[M, T], ReprMixin):
     # resolve the tag
     self._tag = model.tag
 
+    if embedded:
+      if not get_disable_warnings() and not get_quiet_mode():
+        logger.warning('You are using embedded mode, which means the models will be loaded into memory. This is often not recommended in production and should only be used for local development only.')
+        if not get_debug_mode():
+          logger.info("To disable this warning, set 'OPENLLM_DISABLE_WARNING=True'")
+      # Eager loading is the contract of embedded=True; it must not depend on whether the warning is shown.
+      self.runner.init_local(quiet=True)
+
   @apply(lambda val: tuple(str.lower(i) if i else i for i in val))
   def _make_tag_components(self, model_id, model_version, backend) -> tuple[str, str | None]:
     model_id, *maybe_revision = model_id.rsplit(':')
@@ -401,9 +413,9 @@ class LLM(t.Generic[M, T], ReprMixin):
 
   def prepare_for_training(
     self, adapter_type: AdapterType = 'lora', use_gradient_checking: bool = True, **attrs: t.Any
-  ) -> tuple[peft.PeftModel | peft.PeftModelForCausalLM | peft.PeftModelForSeq2SeqLM, T]:
-    from peft import get_peft_model
-    from peft import prepare_model_for_kbit_training
+  ) -> tuple[PeftModel | PeftModelForCausalLM | PeftModelForSeq2SeqLM, T]:
+    from peft.mapping import get_peft_model
+    from peft.utils.other import prepare_model_for_kbit_training
 
     peft_config = (
       self.config['fine_tune_strategies']
diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py
index beda442b..bcc5d8f3 100644
--- a/openllm-python/src/openllm/cli/_sdk.py
+++ b/openllm-python/src/openllm/cli/_sdk.py
@@ -18,8 +18,10 @@ import openllm_core
 from bentoml._internal.configuration.containers import BentoMLContainer
 from openllm_core._typing_compat import LiteralSerialisation
 from openllm_core.exceptions import OpenLLMException
+from openllm_core.utils import WARNING_ENV_VAR
 from openllm_core.utils import codegen
 from openllm_core.utils import first_not_none
+from openllm_core.utils import get_disable_warnings
 from openllm_core.utils import is_vllm_available
 
 
@@ -197,11 +199,8 @@ def _build(
     model_id,
     '--machine',
     '--serialisation',
-    t.cast(
-      LiteralSerialisation,
-      first_not_none(
-        serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
-      ),
+    first_not_none(
+      serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
     ),
   ]
   if quantize:
@@ -237,7 +236,11 @@ def _build(
   args.extend(['--container-registry', container_registry, '--container-version-strategy', container_version_strategy])
   if additional_args:
     args.extend(additional_args)
+  if force_push:
+    args.append('--force-push')
 
+  current_disable_warning = get_disable_warnings()
+  os.environ[WARNING_ENV_VAR] = str(True)
   try:
     output = subprocess.check_output(args, env=os.environ.copy(), cwd=build_ctx or os.getcwd())
   except subprocess.CalledProcessError as e:
@@ -250,6 +253,7 @@ def _build(
     raise ValueError(
       f"Failed to find tag from output: {output.decode('utf-8').strip()}\nNote: Output from 'openllm build' might not be correct. Please open an issue on GitHub."
     )
+  os.environ[WARNING_ENV_VAR] = str(current_disable_warning)
   try:
     result = orjson.loads(matched.group(1))
   except orjson.JSONDecodeError as e:
diff --git a/openllm-python/src/openllm/cli/entrypoint.py b/openllm-python/src/openllm/cli/entrypoint.py
index bd83e4da..d3641890 100644
--- a/openllm-python/src/openllm/cli/entrypoint.py
+++ b/openllm-python/src/openllm/cli/entrypoint.py
@@ -77,6 +77,7 @@ from openllm_core.utils import compose
 from openllm_core.utils import configure_logging
 from openllm_core.utils import first_not_none
 from openllm_core.utils import get_debug_mode
+from openllm_core.utils import get_disable_warnings
 from openllm_core.utils import get_quiet_mode
 from openllm_core.utils import is_torch_available
 from openllm_core.utils import resolve_user_filepath
@@ -141,19 +142,22 @@ _object_setattr = object.__setattr__
 _EXT_FOLDER = os.path.abspath(os.path.join(os.path.dirname(__file__), 'extension'))
 
 
-def backend_warning(backend: LiteralBackend):
-  if backend == 'pt' and check_bool_env('OPENLLM_BACKEND_WARNING') and not get_quiet_mode():
+def backend_warning(backend: LiteralBackend, build: bool = False) -> None:
+  if backend == 'pt' and (not get_disable_warnings()) and not get_quiet_mode():
     if openllm.utils.is_vllm_available():
       termui.warning(
-        '\nvLLM is available, but using PyTorch backend instead. Note that vLLM is a lot more performant and should always be used in production (by explicitly set --backend vllm).'
+        'vLLM is available, but using PyTorch backend instead. Note that vLLM is a lot more performant and should always be used in production (by explicitly set --backend vllm).'
       )
     else:
       termui.warning(
-        '\nvLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.'
+        'vLLM is not available. Note that PyTorch backend is not as performant as vLLM and you should always consider using vLLM for production.'
       )
-    termui.debug(
-      content="\nTip: if you are running 'openllm build' you can set '--backend vllm' to package your Bento with vLLM backend. To hide these messages, set 'OPENLLM_BACKEND_WARNING=False'\n"
-    )
+    if build:
+      termui.info(
+        "Tip: You can set '--backend vllm' to package your Bento with vLLM backend regardless if vLLM is available locally."
+      )
+    if not get_debug_mode():
+      termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")
 
 
 class Extensions(click.MultiCommand):
@@ -425,13 +429,14 @@ def start_command(
       serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
     ),
   )
-  if serialisation == 'safetensors' and quantize is not None and check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
-    termui.warning(
-      f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation."
-    )
+  if serialisation == 'safetensors' and quantize is not None and not get_disable_warnings() and not get_quiet_mode():
+    termui.warning(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format.")
     termui.warning(
       f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure."
     )
+    termui.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
+    if not get_debug_mode():
+      termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")
 
   llm = openllm.LLM[t.Any, t.Any](
     model_id=model_id,
@@ -542,19 +547,17 @@ def start_grpc_command(
 
   from ..serialisation.transformers.weights import has_safetensors_weights
 
-  serialisation = t.cast(
-    LiteralSerialisation,
-    first_not_none(
-      serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
-    ),
+  serialisation = first_not_none(
+    serialisation, default='safetensors' if has_safetensors_weights(model_id, model_version) else 'legacy'
   )
-  if serialisation == 'safetensors' and quantize is not None and check_bool_env('OPENLLM_SERIALIZATION_WARNING'):
-    termui.warning(
-      f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation."
-    )
+  if serialisation == 'safetensors' and quantize is not None and not get_disable_warnings() and not get_quiet_mode():
+    termui.warning(f"'--quantize={quantize}' might not work with 'safetensors' serialisation format.")
     termui.warning(
       f"Make sure to check out '{model_id}' repository to see if the weights is in '{serialisation}' format if unsure."
     )
+    termui.info("Tip: You can always fallback to '--serialisation legacy' when running quantisation.")
+    if not get_debug_mode():
+      termui.info("To disable these warnings, set 'OPENLLM_DISABLE_WARNING=True'")
 
   llm = openllm.LLM[t.Any, t.Any](
     model_id=model_id,
@@ -824,9 +827,26 @@ def import_command(
   return response
 
 
-class DeploymentInstruction(t.TypedDict):
+@attr.define(auto_attribs=True)
+class _Content:
+  instr: str
+  cmd: str
+
+  def __str__(self) -> str:
+    return self.instr.format(cmd=self.cmd)
+
+
+@attr.define(auto_attribs=True)
+class DeploymentInstruction:
   type: t.Literal['container', 'bentocloud']
-  content: str
+  content: _Content
+
+  @classmethod
+  def from_content(cls, type: t.Literal['container', 'bentocloud'], instr: str, cmd: str) -> DeploymentInstruction:
+    return cls(type=type, content=_Content(instr=instr, cmd=cmd))
+
+  def __getitem__(self, key: str) -> str:
+    return getattr(self, key)
 
 
 class BuildBentoOutput(t.TypedDict):
@@ -985,7 +1005,7 @@ def build_command(
       ),
     ),
   )
-  backend_warning(llm.__llm_backend__)
+  backend_warning(llm.__llm_backend__, build=True)
 
   os.environ.update(
     {
@@ -1069,21 +1089,36 @@ def build_command(
     traceback.print_exc()
     raise click.ClickException('Exception caught while building BentoLLM:\n' + str(err)) from err
 
+  def get_current_bentocloud_context() -> str:
+    passed = t.cast(t.Optional[str], ctx.obj.cloud_context)
+    if passed:
+      return passed
+    else:
+      return t.cast(
+        str, orjson.loads(subprocess.check_output(['bentoml', 'cloud', 'current-context'], env=os.environ))['name']
+      )
+
   response = BuildBentoOutput(
     state=state,
     tag=str(bento_tag),
     backend=llm.__llm_backend__,
     instructions=[
-      DeploymentInstruction(
-        type='bentocloud', content=f"Push to BentoCloud with 'bentoml push': `bentoml push {bento_tag}`"
+      DeploymentInstruction.from_content(
+        type='bentocloud',
+        instr="☁️  Push to BentoCloud with 'bentoml push':\n    $ {cmd}",
+        cmd=f'bentoml push {bento_tag} --context {get_current_bentocloud_context()}',
       ),
-      DeploymentInstruction(
+      DeploymentInstruction.from_content(
         type='container',
-        content=f"Container BentoLLM with 'bentoml containerize': `bentoml containerize {bento_tag} --opt progress=plain`",
+        instr="🐳 Container BentoLLM with 'bentoml containerize':\n    $ {cmd}",
+        cmd=f'bentoml containerize {bento_tag} --opt progress=plain',
       ),
     ],
   )
 
+  plain_instruction = {i.type: i['content'].cmd for i in response['instructions']}
+  if machine or get_debug_mode():
+    response['instructions'] = plain_instruction
   if machine:
     termui.echo(f'__object__:{orjson.dumps(response).decode()}\n\n', fg='white')
   elif not get_quiet_mode() and (not push or not containerize):
@@ -1093,9 +1128,9 @@ def build_command(
       termui.warning(f"Bento for '{model_id}' already exists [{bento}]. To overwrite it pass '--overwrite'.\n")
     if not get_debug_mode():
       termui.echo(OPENLLM_FIGLET)
-      termui.echo('\n📖 Next steps:\n\n', nl=False)
+      termui.echo('📖 Next steps:\n', nl=False)
       for instruction in response['instructions']:
-        termui.echo(f"* {instruction['content']}\n", nl=False)
+        termui.echo(f"  * {instruction['content']}\n", nl=False)
 
   if push:
     BentoMLContainer.bentocloud_client.get().push_bento(
@@ -1112,7 +1147,6 @@ def build_command(
     except Exception as err:
       raise OpenLLMException(f"Exception caught while containerizing '{bento.tag!s}':\n{err}") from err
 
-  response.pop('instructions')
   if get_debug_mode():
     termui.echo('\n' + orjson.dumps(response).decode(), fg=None)
   return response
diff --git a/openllm-python/src/openllm/cli/termui.py b/openllm-python/src/openllm/cli/termui.py
index 23ea3d9c..e275b644 100644
--- a/openllm-python/src/openllm/cli/termui.py
+++ b/openllm-python/src/openllm/cli/termui.py
@@ -11,7 +11,6 @@ import orjson
 
 from openllm_core._typing_compat import DictStrAny
 from openllm_core.utils import get_debug_mode
-from openllm_core.utils import get_quiet_mode
 
 
 logger = logging.getLogger('openllm')
@@ -53,7 +52,10 @@ class JsonLog(t.TypedDict):
 
 
 def log(content: str, level: Level = Level.INFO, fg: str | None = None) -> None:
-  echo(orjson.dumps(JsonLog(log_level=level, content=content)).decode(), fg=fg, json=True)
+  if get_debug_mode():
+    echo(content, fg=fg)
+  else:
+    echo(orjson.dumps(JsonLog(log_level=level, content=content)).decode(), fg=fg, json=True)
 
 
 warning = functools.partial(log, level=Level.WARNING)
@@ -64,7 +66,7 @@ info = functools.partial(log, level=Level.INFO)
 notset = functools.partial(log, level=Level.NOTSET)
 
 
-def echo(text: t.Any, fg: str | None = None, _with_style: bool = True, json: bool = False, **attrs: t.Any) -> None:
+def echo(text: t.Any, fg: str | None = None, *, _with_style: bool = True, json: bool = False, **attrs: t.Any) -> None:
   if json:
     text = orjson.loads(text)
     if 'content' in text and 'log_level' in text:
@@ -77,8 +79,7 @@ def echo(text: t.Any, fg: str | None = None, _with_style: bool = True, json: boo
     content = t.cast(str, text)
   attrs['fg'] = fg
 
-  if not get_quiet_mode():
-    t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(content, **attrs)
+  (click.echo if not _with_style else click.secho)(content, **attrs)
 
 
 COLUMNS: int = int(os.environ.get('COLUMNS', str(120)))
diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py
index d02f594a..abc99c7f 100644
--- a/openllm-python/src/openllm/utils/__init__.py
+++ b/openllm-python/src/openllm/utils/__init__.py
@@ -40,6 +40,7 @@ if t.TYPE_CHECKING:
   from openllm_core.utils import generate_context as generate_context
   from openllm_core.utils import generate_hash_from_file as generate_hash_from_file
   from openllm_core.utils import get_debug_mode as get_debug_mode
+  from openllm_core.utils import get_disable_warnings as get_disable_warnings
   from openllm_core.utils import get_quiet_mode as get_quiet_mode
   from openllm_core.utils import in_notebook as in_notebook
   from openllm_core.utils import is_autoawq_available as is_autoawq_available
@@ -61,6 +62,7 @@ if t.TYPE_CHECKING:
   from openllm_core.utils import resolve_user_filepath as resolve_user_filepath
   from openllm_core.utils import serde as serde
   from openllm_core.utils import set_debug_mode as set_debug_mode
+  from openllm_core.utils import set_disable_warnings as set_disable_warnings
   from openllm_core.utils import set_quiet_mode as set_quiet_mode
   from openllm_core.utils import validate_is_path as validate_is_path
   from openllm_core.utils.serde import converter as converter