Mirror of https://github.com/bentoml/OpenLLM.git (synced 2026-01-21 22:10:45 -05:00)
chore: update instruction for dependencies · feat(experimental): CTranslate2 (Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>)
130 lines
5.1 KiB
Python
Executable File
#!/usr/bin/env python3
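# Regenerate the model support matrix in README.md from CONFIG_MAPPING and
# mirror the result into openllm-python/README.md.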
import os
import shutil
import sys

import tomlkit

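# Markers delimiting the auto-generated section of README.md.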
START_COMMENT = f'<!-- {os.path.basename(__file__)}: start -->\n'
END_COMMENT = f'<!-- {os.path.basename(__file__)}: stop -->\n'

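# Repository root; put openllm-core's sources on sys.path so CONFIG_MAPPING
# can be imported straight from the source tree.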
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(ROOT, 'openllm-core', 'src'))

from openllm_core.config import CONFIG_MAPPING


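# Helpers for GitHub-flavoured Markdown callout blocks.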
def markdown_noteblock(text: str):
  return ['\n', f'> **Note:** {text}\n']


def markdown_importantblock(text: str):
  return ['\n', f'> **Important:** {text}\n']


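# Rebuild the per-model sections of README.md between START_COMMENT and END_COMMENT,
# using CONFIG_MAPPING and the optional dependencies declared in openllm-python/pyproject.toml.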
def main() -> int:
  with open(os.path.join(ROOT, 'openllm-python', 'pyproject.toml'), 'r') as f:
    deps = tomlkit.parse(f.read()).value['project']['optional-dependencies']
  with open(os.path.join(ROOT, 'README.md'), 'r') as f:
    readme = f.readlines()

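  # Locate the previously generated block between the start/stop markers.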
  start_index, stop_index = readme.index(START_COMMENT), readme.index(END_COMMENT)

  content = []

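  # Build a collapsible <details> section for every supported architecture.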
  for it in CONFIG_MAPPING.values():
    it = it()
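    # Config class names look like `<Arch>Config`; strip the 'Config' suffix.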
    architecture_name = it.__class__.__name__[:-6]
    details_block = ['<details>\n', f'<summary>{architecture_name}</summary>\n\n', '### Quickstart\n']
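    # If the model has a matching optional-dependencies extra, note the pip install command.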
    if it['start_name'] in deps:
      instruction = f'> ```bash\n> pip install "openllm[{it["start_name"]}]"\n> ```'
      details_block.extend(markdown_noteblock(f'{architecture_name} requires installing with:\n{instruction}\n'))
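    # Quickstart: how to start the server and query it from another terminal.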
    details_block.extend(
      [
        f'Run the following command to quickly spin up a {architecture_name} server:\n',
        f"""\
```bash
{'' if not it['trust_remote_code'] else 'TRUST_REMOTE_CODE=True '}openllm start {it['default_id']}
```""",
        'In a different terminal, run the following command to interact with the server:\n',
        """\
```bash
export OPENLLM_ENDPOINT=http://localhost:3000
openllm query 'What are large language models?'
```""",
        *markdown_noteblock(
          f"Any {architecture_name} variants can be deployed with OpenLLM. Visit the [HuggingFace Model Hub](https://huggingface.co/models?sort=trending&search={it['model_name']}) to see more {architecture_name}-compatible models.\n"
        ),
        '\n### Supported models\n',
        f'You can specify any of the following {architecture_name} models via `openllm start`:\n\n',
      ]
    )
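    # Link every known model id to its page on the HuggingFace Hub.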
    list_ids = [f'- [{model_id}](https://huggingface.co/{model_id})' for model_id in it['model_ids']]
    details_block.extend(list_ids)
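    # Backend overview and the recommendation to pass --backend explicitly.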
    details_block.extend(
      [
        '\n### Supported backends\n',
        'OpenLLM supports vLLM and PyTorch as backends. By default, it uses vLLM if vLLM is available, otherwise it falls back to PyTorch.\n',
        *markdown_importantblock(
          'We recommend users explicitly specify `--backend` to choose the desired backend to run the model. If you have access to a GPU, always use `--backend vllm`.\n'
        ),
      ]
    )
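    # Per-backend launch instructions (vLLM, PyTorch, CTranslate2).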
    if 'vllm' in it['backend']:
      details_block.extend(
        [
          '\n- vLLM (Recommended):\n\n',
          'To install vLLM, run `pip install "openllm[vllm]"`\n',
          f"""\
```bash
{'' if not it['trust_remote_code'] else 'TRUST_REMOTE_CODE=True '}openllm start {it['model_ids'][0]} --backend vllm
```""",
          *markdown_importantblock(
            'Using vLLM requires a GPU with an architecture newer than 8.0 to get the best serving performance. For all production serving use cases, you should choose vLLM.'
          ),
          *markdown_noteblock('Adapters are not yet supported with vLLM.'),
        ]
      )
    if 'pt' in it['backend']:
      details_block.extend(
        [
          '\n- PyTorch:\n\n',
          f"""\
```bash
{'' if not it['trust_remote_code'] else 'TRUST_REMOTE_CODE=True '}openllm start {it['model_ids'][0]} --backend pt
```""",
        ]
      )
    if 'ctranslate' in it['backend']:
      details_block.extend(
        [
          '\n- CTranslate2 (*experimental*):\n\n',
          f"""\
```bash
{'' if not it['trust_remote_code'] else 'TRUST_REMOTE_CODE=True '}openllm start {it['model_ids'][0]} --backend ctranslate
```""",
          *markdown_noteblock(
            'All quantization methods from CTranslate2 are currently supported, including int8, int8_float16, and int8_bfloat16.'
          ),
          *markdown_noteblock(
            'We recommend converting models beforehand and providing the directory of the converted model to `openllm start`. See [CTranslate2](https://opennmt.net/CTranslate2/conversion.html) for more information.'
          ),
          *markdown_importantblock(
            'CTranslate2 is an experimental backend and is not yet fully supported. It is recommended to use vLLM for all production use cases.'
          ),
        ]
      )

    details_block.append('\n</details>\n\n')
    content.append('\n'.join(details_block))

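  # Splice the regenerated content between the markers and write README.md back.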
  readme = readme[:start_index] + [START_COMMENT] + content + [END_COMMENT] + readme[stop_index + 1 :]
  with open(os.path.join(ROOT, 'README.md'), 'w') as f:
    f.writelines(readme)

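  # Copy the updated README into openllm-python/ so the package README stays in sync.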
  shutil.copyfile(os.path.join(ROOT, 'README.md'), os.path.join(ROOT, 'openllm-python', 'README.md'))
  return 0


if __name__ == '__main__':
  raise SystemExit(main())