chore(cli): add a smol helper to generate README.md tables

Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Aaron Pham
2025-02-15 07:38:33 +00:00
parent 509e9690d0
commit bd7966fca0
6 changed files with 2776 additions and 37 deletions

134
README.md
View File

@@ -1,4 +1,6 @@
# 🦾 OpenLLM: Self-Hosting LLMs Made Easy
<div align="center">
<h1>🦾 OpenLLM: Self-Hosting LLMs Made Easy</h1>
</div>
[![License: Apache-2.0](https://img.shields.io/badge/License-Apache%202-green.svg)](https://github.com/bentoml/OpenLLM/blob/main/LICENSE)
[![Releases](https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold)](https://pypi.org/project/openllm)
@@ -25,16 +27,110 @@ openllm hello
OpenLLM supports a wide range of state-of-the-art open-source LLMs. You can also add a [model repository to run custom models](#set-up-a-custom-repository) with OpenLLM.
| Model | Parameters | Quantization | Required GPU | Start a Server |
| ---------------- | ---------- | ------------ | ------------- | ----------------------------------- |
| Llama 3.3 | 70B | - | 80Gx2 | `openllm serve llama3.3:70b` |
| Llama 3.2 | 3B | - | 12G | `openllm serve llama3.2:3b` |
| Llama 3.2 Vision | 11B | - | 80G | `openllm serve llama3.2:11b-vision` |
| Mistral | 7B | - | 24G | `openllm serve mistral:7b` |
| Qwen 2.5 | 1.5B | - | 12G | `openllm serve qwen2.5:1.5b` |
| Qwen 2.5 Coder | 7B | - | 24G | `openllm serve qwen2.5-coder:7b` |
| Gemma 2 | 9B | - | 24G | `openllm serve gemma2:9b` |
| Phi3 | 3.8B | - | 12G | `openllm serve phi3:3.8b` |
<table>
<tr>
<th>Model</th>
<th>Parameters</th>
<th>Required GPU</th>
<th>Start a Server</th>
</tr>
<tr>
<td>deepseek-r1</td>
<td>671B</td>
<td>80Gx16</td>
<td><code>openllm serve deepseek-r1:671b-fc3d</code></td>
</tr>
<tr>
<td>deepseek-r1-distill</td>
<td>14B</td>
<td>80G</td>
<td><code>openllm serve deepseek-r1-distill:qwen2.5-14b-98a9</code></td>
</tr>
<tr>
<td>deepseek-v3</td>
<td>671B</td>
<td>80Gx16</td>
<td><code>openllm serve deepseek-v3:671b-instruct-d7ec</code></td>
</tr>
<tr>
<td>gemma2</td>
<td>2B</td>
<td>12G</td>
<td><code>openllm serve gemma2:2b-instruct-747d</code></td>
</tr>
<tr>
<td>llama3.1</td>
<td>8B</td>
<td>24G</td>
<td><code>openllm serve llama3.1:8b-instruct-3c0c</code></td>
</tr>
<tr>
<td>llama3.2</td>
<td>1B</td>
<td>24G</td>
<td><code>openllm serve llama3.2:1b-instruct-f041</code></td>
</tr>
<tr>
<td>llama3.3</td>
<td>70B</td>
<td>80Gx2</td>
<td><code>openllm serve llama3.3:70b-instruct-b850</code></td>
</tr>
<tr>
<td>mistral</td>
<td>8B</td>
<td>24G</td>
<td><code>openllm serve mistral:8b-instruct-50e8</code></td>
</tr>
<tr>
<td>mistral-large</td>
<td>123B</td>
<td>80Gx4</td>
<td><code>openllm serve mistral-large:123b-instruct-1022</code></td>
</tr>
<tr>
<td>mistralai</td>
<td>24B</td>
<td>80G</td>
<td><code>openllm serve mistralai:24b-small-instruct-2501-0e69</code></td>
</tr>
<tr>
<td>mixtral</td>
<td>7B</td>
<td>80Gx2</td>
<td><code>openllm serve mixtral:8x7b-instruct-v0.1-b752</code></td>
</tr>
<tr>
<td>phi4</td>
<td>14B</td>
<td>80G</td>
<td><code>openllm serve phi4:14b-c12d</code></td>
</tr>
<tr>
<td>pixtral</td>
<td>12B</td>
<td>80G</td>
<td><code>openllm serve pixtral:12b-240910-c344</code></td>
</tr>
<tr>
<td>qwen2.5</td>
<td>7B</td>
<td>24G</td>
<td><code>openllm serve qwen2.5:7b-instruct-3260</code></td>
</tr>
<tr>
<td>qwen2.5-coder</td>
<td>7B</td>
<td>24G</td>
<td><code>openllm serve qwen2.5-coder:7b-instruct-e75d</code></td>
</tr>
<tr>
<td>qwen2.5vl</td>
<td>3B</td>
<td>24G</td>
<td><code>openllm serve qwen2.5vl:3b-instruct-4686</code></td>
</tr>
</table>
...
@@ -46,15 +142,16 @@ To start an LLM server locally, use the `openllm serve` command and specify the
> [!NOTE]
> OpenLLM does not store model weights. A Hugging Face token (HF_TOKEN) is required for gated models.
>
> 1. Create your Hugging Face token [here](https://huggingface.co/settings/tokens).
> 2. Request access to the gated model, such as [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B).
> 2. Request access to the gated model, such as [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct).
> 3. Set your token as an environment variable by running:
> ```bash
> export HF_TOKEN=<your token>
> ```
```bash
openllm serve llama3:8b
openllm serve llama3.2:1b-instruct-f041
```
The server will be accessible at [http://localhost:3000](http://localhost:3000/), providing OpenAI-compatible APIs for interaction. You can call the endpoints with different frameworks and tools that support OpenAI-compatible APIs. Typically, you may need to specify the following:
@@ -79,7 +176,7 @@ client = OpenAI(base_url='http://localhost:3000/v1', api_key='na')
# print(model_list)
chat_completion = client.chat.completions.create(
model="meta-llama/Meta-Llama-3-8B-Instruct",
model="meta-llama/Llama-3.2-1B-Instruct",
messages=[
{
"role": "user",
@@ -94,7 +191,6 @@ for chunk in chat_completion:
</details>
<details>
<summary>LlamaIndex</summary>
@@ -102,9 +198,10 @@ for chunk in chat_completion:
```python
from llama_index.llms.openai import OpenAI
llm = OpenAI(api_bese="http://localhost:3000/v1", model="meta-llama/Meta-Llama-3-8B-Instruct", api_key="dummy")
llm = OpenAI(api_base="http://localhost:3000/v1", model="meta-llama/Llama-3.2-1B-Instruct", api_key="dummy")
...
```
</details>
## Chat UI
@@ -138,7 +235,7 @@ openllm repo update
To review a model's information, run:
```bash
openllm model get llama3:8b
openllm model get llama3.2:1b-instruct-f041
```
### Add a model to the default model repository
@@ -166,7 +263,7 @@ OpenLLM supports LLM cloud deployment via BentoML, the unified model serving fra
[Sign up for BentoCloud](https://www.bentoml.com/) for free and [log in](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html). Then, run `openllm deploy` to deploy a model to BentoCloud:
```bash
openllm deploy llama3:8b
openllm deploy llama3.2:1b-instruct-f041
```
> [!NOTE]
@@ -196,7 +293,6 @@ This project uses the following open-source projects:
- [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving
- [vllm-project/vllm](https://github.com/vllm-project/vllm) for production level LLM backend
- [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI
- [chujiezheng/chat_templates](https://github.com/chujiezheng/chat_templates)
- [astral-sh/uv](https://github.com/astral-sh/uv) for blazing-fast installation of model requirements
We are grateful to the developers and contributors of these projects for their hard work and dedication.

210
README.md.tpl Normal file
View File

@@ -0,0 +1,210 @@
<div align="center">
<h1>🦾 OpenLLM: Self-Hosting LLMs Made Easy</h1>
</div>
[![License: Apache-2.0](https://img.shields.io/badge/License-Apache%202-green.svg)](https://github.com/bentoml/OpenLLM/blob/main/LICENSE)
[![Releases](https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold)](https://pypi.org/project/openllm)
[![CI](https://results.pre-commit.ci/badge/github/bentoml/OpenLLM/main.svg)](https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main)
[![X](https://badgen.net/badge/icon/@bentomlai/000000?icon=twitter&label=Follow)](https://twitter.com/bentomlai)
[![Community](https://badgen.net/badge/icon/Community/562f5d?icon=slack&label=Join)](https://l.bentoml.com/join-slack)
OpenLLM allows developers to run **any open-source LLMs** (Llama 3.3, Qwen2.5, Phi3 and [more](#supported-models)) or **custom models** as **OpenAI-compatible APIs** with a single command. It features a [built-in chat UI](#chat-ui), state-of-the-art inference backends, and a simplified workflow for creating enterprise-grade cloud deployment with Docker, Kubernetes, and [BentoCloud](#deploy-to-bentocloud).
Understand the [design philosophy of OpenLLM](https://www.bentoml.com/blog/from-ollama-to-openllm-running-llms-in-the-cloud).
## Get Started
Run the following commands to install OpenLLM and explore it interactively.
```bash
pip install openllm # or pip3 install openllm
openllm hello
```
![hello](https://github.com/user-attachments/assets/5af19f23-1b34-4c45-b1e0-a6798b4586d1)
## Supported models
OpenLLM supports a wide range of state-of-the-art open-source LLMs. You can also add a [model repository to run custom models](#set-up-a-custom-repository) with OpenLLM.
<table>
<tr>
<th>Model</th>
<th>Parameters</th>
<th>Required GPU</th>
<th>Start a Server</th>
</tr>
{%- for key, value in model_dict|items %}
<tr>
<td>{{key}}</td>
<td>{{value['version'] | upper}}</td>
<td>{{value['pretty_gpu']}}</td>
<td><code>{{value['command']}}</code></td>
</tr>
{%- endfor %}
</table>
...
For the full model list, see the [OpenLLM models repository](https://github.com/bentoml/openllm-models).
## Start an LLM server
To start an LLM server locally, use the `openllm serve` command and specify the model version.
> [!NOTE]
> OpenLLM does not store model weights. A Hugging Face token (HF_TOKEN) is required for gated models.
>
> 1. Create your Hugging Face token [here](https://huggingface.co/settings/tokens).
> 2. Request access to the gated model, such as [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct).
> 3. Set your token as an environment variable by running:
> ```bash
> export HF_TOKEN=<your token>
> ```
```bash
{{model_dict.get("llama3.2")["command"]}}
```
The server will be accessible at [http://localhost:3000](http://localhost:3000/), providing OpenAI-compatible APIs for interaction. You can call the endpoints with different frameworks and tools that support OpenAI-compatible APIs. Typically, you may need to specify the following:
- **The API host address**: By default, the LLM is hosted at [http://localhost:3000](http://localhost:3000/).
- **The model name:** The name can be different depending on the tool you use.
- **The API key**: The API key used for client authentication. This is optional.
Here are some examples:
<details>
<summary>OpenAI Python client</summary>
```python
from openai import OpenAI
client = OpenAI(base_url='http://localhost:3000/v1', api_key='na')
# Use the following func to get the available models
# model_list = client.models.list()
# print(model_list)
chat_completion = client.chat.completions.create(
model="meta-llama/Llama-3.2-1B-Instruct",
messages=[
{
"role": "user",
"content": "Explain superconductors like I'm five years old"
}
],
stream=True,
)
for chunk in chat_completion:
print(chunk.choices[0].delta.content or "", end="")
```
</details>
<details>
<summary>LlamaIndex</summary>
```python
from llama_index.llms.openai import OpenAI
llm = OpenAI(api_base="http://localhost:3000/v1", model="meta-llama/Llama-3.2-1B-Instruct", api_key="dummy")
...
```
</details>
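You can also hit the server directly with `curl`. This is a minimal sketch assuming the standard OpenAI-compatible `/v1/chat/completions` route on the default port:
<details>
<summary>curl</summary>
```bash
# Send a chat completion request to the locally running server.
# Assumes the server was started with the llama3.2 Bento shown above.
curl http://localhost:3000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.2-1B-Instruct",
    "messages": [{"role": "user", "content": "Explain superconductors like I am five years old"}]
  }'
```
</details>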
## Chat UI
OpenLLM provides a chat UI at the `/chat` endpoint for the launched LLM server at http://localhost:3000/chat.
<img width="800" alt="openllm_ui" src="https://github.com/bentoml/OpenLLM/assets/5886138/8b426b2b-67da-4545-8b09-2dc96ff8a707">
## Chat with a model in the CLI
To start a chat conversation in the CLI, use the `openllm run` command and specify the model version.
```bash
openllm run {{model_dict.get("llama3.2")["tag"]}}
```
## Model repository
A model repository in OpenLLM represents a catalog of available LLMs that you can run. OpenLLM provides a default model repository that includes the latest open-source LLMs like Llama 3, Mistral, and Qwen2, hosted at [this GitHub repository](https://github.com/bentoml/openllm-models). To see all available models from the default and any added repository, use:
```bash
openllm model list
```
To ensure your local list of models is synchronized with the latest updates from all connected repositories, run:
```bash
openllm repo update
```
To review a model's information, run:
```bash
openllm model get {{model_dict.get("llama3.2")["tag"]}}
```
### Add a model to the default model repository
You can contribute to the default model repository by adding new models that others can use. This involves creating and submitting a Bento of the LLM. For more information, check out this [example pull request](https://github.com/bentoml/openllm-models/pull/1).
### Set up a custom repository
You can add your own repository to OpenLLM with custom models. To do so, follow the format in the default OpenLLM model repository with a `bentos` directory to store custom LLMs. You need to [build your Bentos with BentoML](https://docs.bentoml.com/en/latest/guides/build-options.html) and submit them to your model repository.
First, prepare your custom models in a `bentos` directory following the guidelines provided by [BentoML to build Bentos](https://docs.bentoml.com/en/latest/guides/build-options.html). Check out the [default model repository](https://github.com/bentoml/openllm-repo) for an example and read the [Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md) for details.
Then, register your custom model repository with OpenLLM:
```bash
openllm repo add <repo-name> <repo-url>
```
**Note**: Currently, OpenLLM only supports adding public repositories.
## Deploy to BentoCloud
OpenLLM supports LLM cloud deployment via BentoML, the unified model serving framework, and BentoCloud, an AI inference platform for enterprise AI teams. BentoCloud provides fully-managed infrastructure optimized for LLM inference with autoscaling, model orchestration, and observability, allowing you to run any AI model in the cloud.
[Sign up for BentoCloud](https://www.bentoml.com/) for free and [log in](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html). Then, run `openllm deploy` to deploy a model to BentoCloud:
```bash
openllm deploy {{model_dict.get("llama3.2")["tag"]}}
```
> [!NOTE]
> If you are deploying a gated model, make sure to set HF_TOKEN in your environment variables.
Once the deployment is complete, you can run model inference on the BentoCloud console:
<img width="800" alt="bentocloud_ui" src="https://github.com/bentoml/OpenLLM/assets/65327072/4f7819d9-73ea-488a-a66c-f724e5d063e6">
## Community
OpenLLM is actively maintained by the BentoML team. Feel free to reach out and join us in our pursuit to make LLMs more accessible and easy to use 👉 [Join our Slack community!](https://l.bentoml.com/join-slack)
## Contributing
As an open-source project, we welcome contributions of all kinds, such as new features, bug fixes, and documentation. Here are some of the ways to contribute:
- Report a bug by [creating a GitHub issue](https://github.com/bentoml/OpenLLM/issues/new/choose).
- [Submit a pull request](https://github.com/bentoml/OpenLLM/compare) or help review other developers' [pull requests](https://github.com/bentoml/OpenLLM/pulls).
- Add an LLM to the OpenLLM default model repository so that other users can run your model. See the [pull request template](https://github.com/bentoml/openllm-models/pull/1).
- Check out the [Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md) to learn more.
## Acknowledgements
This project uses the following open-source projects:
- [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving
- [vllm-project/vllm](https://github.com/vllm-project/vllm) for production level LLM backend
- [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI
- [astral-sh/uv](https://github.com/astral-sh/uv) for blazing-fast installation of model requirements
We are grateful to the developers and contributors of these projects for their hard work and dedication.

22
gen_readme.py Normal file
View File

@@ -0,0 +1,22 @@
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "jinja2",
# "uv",
# ]
# ///
import subprocess, sys, pathlib, json
from jinja2 import Environment, FileSystemLoader

wd = pathlib.Path('.').parent

# Ask the OpenLLM CLI for the model catalog as JSON (the hidden `--output readme` mode).
model_dict = subprocess.run(
    [sys.executable, '-m', 'uv', 'run', '--with-editable', '.', 'openllm', 'model', 'list', '--output', 'readme'],
    capture_output=True,
    text=True,
    check=True,
)

E = Environment(loader=FileSystemLoader('.'))

# Render README.md.tpl with the model table and overwrite README.md in place.
with (wd / 'README.md').open('w') as f:
    f.write(E.get_template('README.md.tpl').render(model_dict=json.loads(model_dict.stdout.strip())))
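A likely way to run the generator, assuming `uv` is installed and the command is issued from the repository root (the inline script metadata pulls in `jinja2`):
```bash
# Regenerate README.md from README.md.tpl using the current model catalog.
uv run gen_readme.py
```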

View File

@@ -230,7 +230,7 @@ def run(
local_run(bento, port=port, timeout=timeout)
@app.command(help='deploy an production-ready OpenAI API compatible chat server to bentocloud ($100 free credit)')
@app.command(help='deploy a production-ready OpenAI API-compatible server to BentoCloud')
def deploy(
model: Annotated[str, typer.Argument()] = '',
instance_type: Optional[str] = None,

View File

@@ -1,29 +1,33 @@
import re
import typing
from typing import Optional
from __future__ import annotations
import tabulate
import typer
import re, typing, json
import tabulate, questionary, typer
from openllm.accelerator_spec import DeploymentTarget, can_run
from openllm.analytic import OpenLLMTyper
from openllm.common import VERBOSE_LEVEL, BentoInfo, output
from openllm.common import VERBOSE_LEVEL, BentoInfo, output as output_
from openllm.repo import ensure_repo_updated, list_repo
app = OpenLLMTyper(help='manage models')
@app.command(help='get model')
def get(tag: str, repo: Optional[str] = None, verbose: bool = False):
def get(tag: str, repo: typing.Optional[str] = None, verbose: bool = False):
if verbose:
VERBOSE_LEVEL.set(20)
bento_info = ensure_bento(tag, repo_name=repo)
if bento_info:
output(bento_info)
output_(bento_info)
@app.command(name='list', help='list available models')
def list_model(tag: Optional[str] = None, repo: Optional[str] = None, verbose: bool = False):
def list_model(
tag: typing.Optional[str] = None,
repo: typing.Optional[str] = None,
verbose: bool = False,
output: typing.Optional[str] = typer.Option(None, hidden=True),
):
if verbose:
VERBOSE_LEVEL.set(20)
@@ -38,6 +42,23 @@ def list_model(tag: Optional[str] = None, repo: Optional[str] = None, verbose: b
seen.add(value)
return False
if output == 'readme':
# Extract the parameter count from bento.tag (e.g. 'model:671b-it' -> '671b', 'model:something-long-78b' -> '78b')
version_pattern = re.compile(r'(\d+b|-[a-z]+b)')
questionary.print(
json.dumps({
f'{bento.name}': dict(
tag=bento.tag,
version=version_pattern.search(bento.tag).group(1),
pretty_gpu=bento.pretty_gpu,
command=f'openllm serve {bento.tag}',
)
for bento in bentos
if not is_seen(bento.name) and version_pattern.search(bento.tag)
})
)
return
table = tabulate.tabulate(
[
[
@@ -51,19 +72,21 @@ def list_model(tag: Optional[str] = None, repo: Optional[str] = None, verbose: b
],
headers=['model', 'version', 'repo', 'required GPU RAM', 'platforms'],
)
output(table)
output_(table)
def ensure_bento(model: str, target: Optional[DeploymentTarget] = None, repo_name: Optional[str] = None) -> BentoInfo:
def ensure_bento(
model: str, target: typing.Optional[DeploymentTarget] = None, repo_name: typing.Optional[str] = None
) -> BentoInfo:
bentos = list_bento(model, repo_name=repo_name)
if len(bentos) == 0:
output(f'No model found for {model}', style='red')
output_(f'No model found for {model}', style='red')
raise typer.Exit(1)
if len(bentos) == 1:
output(f'Found model {bentos[0]}', style='green')
output_(f'Found model {bentos[0]}', style='green')
if target is not None and can_run(bentos[0], target) <= 0:
output(
output_(
f'The machine({target.name}) with {target.accelerators_repr} does not appear to have sufficient '
f'resources to run model {bentos[0]}\n',
style='yellow',
@@ -71,7 +94,7 @@ def ensure_bento(model: str, target: Optional[DeploymentTarget] = None, repo_nam
return bentos[0]
# multiple models, pick one according to target
output(f'Multiple models match {model}, did you mean one of these?', style='red')
output_(f'Multiple models match {model}, did you mean one of these?', style='red')
list_model(model, repo=repo_name)
raise typer.Exit(1)
@@ -99,9 +122,9 @@ def list_bento(
if repo_name is not None:
repo_map = {repo.name: repo for repo in repo_list}
if repo_name not in repo_map:
output(f'Repo `{repo_name}` not found, did you mean one of these?')
output_(f'Repo `{repo_name}` not found, did you mean one of these?')
for repo_name in repo_map:
output(f' {repo_name}')
output_(f' {repo_name}')
raise typer.Exit(1)
if not tag:
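For reference, the hidden `--output readme` mode above emits the JSON mapping that `gen_readme.py` feeds into `README.md.tpl`. A sketch of the shape, with illustrative values taken from the generated table:
```bash
# Print the model catalog as JSON (tag, version, pretty_gpu, command per model).
uv run --with-editable . openllm model list --output readme
# => {"llama3.2": {"tag": "llama3.2:1b-instruct-f041", "version": "1b",
#                  "pretty_gpu": "24G", "command": "openllm serve llama3.2:1b-instruct-f041"}, ...}
```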

2388
uv.lock generated Normal file
View File

File diff suppressed because it is too large.