diff --git a/openllm_next/cloud.py b/openllm_next/cloud.py
index df16bf2f..56168b42 100644
--- a/openllm_next/cloud.py
+++ b/openllm_next/cloud.py
@@ -79,6 +79,7 @@ async def _run_model(model: str, timeout: int = 600):
     )
 
     import bentoml
+    from httpx import ReadError
 
     try:
         questionary.print("Model loading...", style="green")
@@ -92,6 +93,8 @@ async def _run_model(model: str, timeout: int = 600):
             break
         except bentoml.exceptions.BentoMLException:
             await asyncio.sleep(1)
+        except ReadError:
+            await asyncio.sleep(1)
     else:
         questionary.print("Model failed to load", style="red")
         return
diff --git a/openllm_next/common.py b/openllm_next/common.py
index b17d7a93..c9c49568 100644
--- a/openllm_next/common.py
+++ b/openllm_next/common.py
@@ -1,4 +1,5 @@
 import functools
+import hashlib
 import json
 import os
 import pathlib
@@ -9,6 +10,7 @@ from contextlib import contextmanager
 from types import SimpleNamespace
 
 import questionary
+import typer
 
 ERROR_STYLE = "red"
 SUCCESS_STYLE = "green"
@@ -107,6 +109,9 @@ class BentoInfo(SimpleNamespace):
     repo: RepoInfo
     path: pathlib.Path
 
+    def __hash__(self):
+        return md5(str(self.path))
+
     @property
     def tag(self) -> str:
         return f"{self.path.parent.name}:{self.path.name}"
@@ -166,17 +171,47 @@
     )
 
 
+@typing.overload
 def run_command(
     cmd,
     cwd=None,
     env=None,
     copy_env=True,
+    venv=None,
     silent=False,
-    check=True,
-) -> subprocess.CompletedProcess | subprocess.Popen | None:
+    background: typing.Literal[False] = False,
+) -> subprocess.CompletedProcess: ...
+
+
+@typing.overload
+def run_command(
+    cmd,
+    cwd=None,
+    env=None,
+    copy_env=True,
+    venv=None,
+    silent=False,
+    background: typing.Literal[True] = True,
+) -> subprocess.Popen: ...
+
+
+def run_command(
+    cmd,
+    cwd=None,
+    env=None,
+    copy_env=True,
+    venv=None,
+    silent=False,
+    background=False,
+) -> subprocess.CompletedProcess | subprocess.Popen:
+    if background:
+        run_func = subprocess.Popen
+    else:
+        run_func = subprocess.run
     import shlex
 
     env = env or {}
+    cmd = [str(c) for c in cmd]
     if not silent:
         questionary.print("\n")
         if cwd:
@@ -184,25 +219,41 @@ def run_command(
         if env:
             for k, v in env.items():
                 questionary.print(f"$ export {k}={shlex.quote(v)}", style="bold")
+        if venv:
+            questionary.print(f"$ source {venv / 'bin' / 'activate'}", style="bold")
         questionary.print(f"$ {' '.join(cmd)}", style="bold")
+
+    if venv:
+        py = venv / "bin" / "python"
+    else:
+        py = sys.executable
+
     if copy_env:
         env = {**os.environ, **env}
+
     if cmd and cmd[0] == "bentoml":
-        cmd = [sys.executable, "-m", "bentoml"] + cmd[1:]
+        cmd = [py, "-m", "bentoml"] + cmd[1:]
     if cmd and cmd[0] == "python":
-        cmd = [sys.executable] + cmd[1:]
+        cmd = [py] + cmd[1:]
+
     try:
         if silent:
-            return subprocess.run(
+            return run_func(  # type: ignore
                 cmd,
                 cwd=cwd,
                 env=env,
-                check=check,
                 stdout=subprocess.DEVNULL,
                 stderr=subprocess.DEVNULL,
             )
         else:
-            return subprocess.run(cmd, cwd=cwd, env=env, check=check)
+            return run_func(cmd, cwd=cwd, env=env)
     except subprocess.CalledProcessError:
         questionary.print("Command failed", style=ERROR_STYLE)
-        return None
+        raise typer.Exit(1)
+
+
+def md5(*strings: str) -> int:
+    m = hashlib.md5()
+    for s in strings:
+        m.update(s.encode())
+    return int(m.hexdigest(), 16)
diff --git a/openllm_next/model.py b/openllm_next/model.py
index dc29bd23..cebcc4ee 100644
--- a/openllm_next/model.py
+++ b/openllm_next/model.py
@@ -7,6 +7,7 @@
 import typer
 
 from openllm_next.common import ERROR_STYLE, VERBOSE_LEVEL, BentoInfo, load_config
 from openllm_next.repo import parse_repo_url
+from openllm_next.venv import ensure_venv
 
 app = typer.Typer()
@@ -83,10 +84,7 @@ def pick_bento(tag) -> BentoInfo:
     return model
 
 
-def get_serve_cmd(tag: str):
-    if ":" not in tag:
-        tag = f"{tag}:latest"
-    bento = pick_bento(tag)
+def get_serve_cmd(bento: BentoInfo):
     cmd = ["bentoml", "serve", bento.tag]
     env = {
         "BENTOML_HOME": f"{bento.repo.path}/bentoml",
diff --git a/openllm_next/serve.py b/openllm_next/serve.py
index cba519ad..17124455 100644
--- a/openllm_next/serve.py
+++ b/openllm_next/serve.py
@@ -1,30 +1,38 @@
 import asyncio
-import os
-import subprocess
 
 import questionary
 import typer
 
 from openllm_next.common import run_command
-from openllm_next.model import get_serve_cmd
+from openllm_next.model import get_serve_cmd, pick_bento
+from openllm_next.venv import ensure_venv
 
 app = typer.Typer()
 
 
 @app.command()
 def serve(model: str):
-    cmd, env, cwd = get_serve_cmd(model)
-    run_command(cmd, env=env, cwd=cwd)
+    if ":" not in model:
+        model = f"{model}:latest"
+    bento = pick_bento(model)
+    venv = ensure_venv(bento)
+    cmd, env, cwd = get_serve_cmd(bento)
+    run_command(cmd, env=env, cwd=cwd, venv=venv)
 
 
 async def _run_model(model: str, timeout: int = 600):
-    cmd, env, cwd = get_serve_cmd(model)
-    server_proc = subprocess.Popen(
+    if ":" not in model:
+        model = f"{model}:latest"
+    bento = pick_bento(model)
+    venv = ensure_venv(bento)
+    cmd, env, cwd = get_serve_cmd(bento)
+    server_proc = run_command(
         cmd,
-        env={**os.environ, **env},
+        env=env,
         cwd=cwd,
-        stdout=subprocess.DEVNULL,
-        stderr=subprocess.DEVNULL,
+        venv=venv,
+        silent=True,
+        background=True,
     )
 
     import bentoml
diff --git a/openllm_next/venv.py b/openllm_next/venv.py
index 254f7a86..a715036e 100644
--- a/openllm_next/venv.py
+++ b/openllm_next/venv.py
@@ -1,8 +1,20 @@
-def _resolve_package_versions(requirement: str) -> dict[str, str]:
+import functools
+import pathlib
+import shutil
+from types import SimpleNamespace
+from typing import Iterable
+
+import questionary
+import typer
+
+from openllm_next.common import VENV_DIR, VERBOSE_LEVEL, BentoInfo, md5, run_command
+
+
+def _resolve_packages(requirement: str | pathlib.Path) -> dict[str, str]:
     from pip_requirements_parser import RequirementsFile
 
     requirements_txt = RequirementsFile.from_file(
-        requirement,
+        str(requirement),
         include_nested=True,
     )
     deps: dict[str, str] = {}
@@ -22,3 +34,110 @@
             deps[req.name] = req.line
             break
     return deps
+
+
+class EnvSpec(SimpleNamespace):
+    python_version: str
+    python_packages: dict[str, str]
+    name_prefix = ""
+
+    def __hash__(self):
+        return md5(
+            self.python_version,
+            *sorted(self.python_packages.values()),
+        )
+
+
+@functools.lru_cache
+def _resolve_bento_env_specs(bento: BentoInfo):
+    ver_file = bento.path / "env" / "python" / "version.txt"
+    assert ver_file.exists(), f"cannot find version file in {bento.path}"
+
+    lock_file = bento.path / "env" / "python" / "requirements.lock.txt"
+    if not lock_file.exists():
+        lock_file = bento.path / "env" / "python" / "requirements.txt"
+
+    python_packages = _resolve_packages(lock_file)
+    PREHEAT_PIP_PACKAGES = ["torch", "vllm"]
+    preheat_packages = {
+        k: v for k, v in python_packages.items() if k in PREHEAT_PIP_PACKAGES
+    }
+    ver = ver_file.read_text().strip()
+    return (
+        EnvSpec(
+            python_version=ver,
+            python_packages=preheat_packages,
+            name_prefix=f"{bento.tag.replace(':', '_')}-1-",
+        ),
+        EnvSpec(
+            python_version=ver,
+            python_packages=python_packages,
+            name_prefix=f"{bento.tag.replace(':', '_')}-2-",
+        ),
+    )
+
+
+def _ensure_venv(
+    env_spec: EnvSpec, parent_venv: pathlib.Path | None = None
+) -> pathlib.Path:
+    venv = VENV_DIR / str(hash(env_spec))
+    if not venv.exists():
+        questionary.print(f"Installing model dependencies({venv})...", style="green")
+        try:
+            run_command(["python", "-m", "venv", venv], silent=VERBOSE_LEVEL.get() < 1)
+            pyver = next(venv.glob("lib/python*")).name
+            if parent_venv is not None:
+                with open(
+                    venv / "lib" / pyver / "site-packages" / f"{parent_venv.name}.pth",
+                    "w+",
+                ) as f:
+                    f.write(str(parent_venv / "lib" / pyver / "site-packages"))
+            with open(venv / "requirements.txt", "w") as f:
+                f.write("\n".join(sorted(env_spec.python_packages.values())))
+            run_command(
+                [
+                    venv / "bin" / "pip",
+                    "install",
+                    "-r",
+                    venv / "requirements.txt",
+                    "--upgrade-strategy",
+                    "only-if-needed",
+                ],
+                silent=VERBOSE_LEVEL.get() < 1,
+            )
+            run_command(
+                [
+                    venv / "bin" / "pip",
+                    "install",
+                    "bentoml",
+                    "--upgrade-strategy",
+                    "only-if-needed",
+                    "--upgrade",
+                ],
+                silent=VERBOSE_LEVEL.get() < 1,
+            )
+        except Exception:
+            shutil.rmtree(venv, ignore_errors=True)
+            questionary.print(
+                f"Failed to install dependencies to {venv}. Cleaned up.",
+                style="red",
+            )
+            raise typer.Exit(1)
+        questionary.print(
+            f"Successfully installed dependencies to {venv}.", style="green"
+        )
+        return venv
+    else:
+        return venv
+
+
+def _ensure_venvs(env_spec_list: Iterable[EnvSpec]) -> pathlib.Path:
+    last_venv = None
+    for env_spec in env_spec_list:
+        last_venv = _ensure_venv(env_spec, last_venv)
+    assert last_venv is not None
+    return last_venv
+
+
+def ensure_venv(bento: BentoInfo) -> pathlib.Path:
+    return _ensure_venvs(_resolve_bento_env_specs(bento))
diff --git a/pyproject.toml b/pyproject.toml
index 78a484d3..2ad37831 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,7 +16,6 @@ dependencies = [
     "psutil",
     "pathlib",
     "pip_requirements_parser",
-    "venv",
 ]
 
 [tool.typer]