refactor: packages (#249)

This commit is contained in:
Aaron Pham
2023-08-22 08:55:46 -04:00
committed by GitHub
parent a964e659c1
commit 3ffb25a872
148 changed files with 2899 additions and 1937 deletions

2
.gitattributes vendored
View File

@@ -2,7 +2,7 @@
contrib/clojure/pnpm-lock.yaml linguist-generated=true
contrib/clojure/src/generated/** linguist-generated=true
# Python core
# Python sdk
openllm-python/tests/models/__snapshots__/* linguist-generated=true
openllm-python/src/openllm/utils/dummy_*.py linguist-generated=true
openllm-python/src/openllm/models/__init__.py linguist-generated=true

View File

@@ -75,7 +75,9 @@ jobs:
- name: Install build frontend
run: python -m pip install --upgrade build
- name: Build
run: python -m build -sw openllm-python/
run: |
bash local.sh
python -m build -sw openllm-python/
- name: Upload artifacts
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3
with:

View File

@@ -112,7 +112,7 @@ jobs:
- name: Build OPT Bento with base embeddings
id: bento-tag
run: |
pip install -e ${{ github.workspace }}/openllm-python
bash local.sh
openllm build opt --serialisation legacy --bento-version sha-${{ env.GITHUB_SHA_SHORT }} --machine --dockerfile-template - <<EOF
{% extends "python_debian.j2" %}
{% block SETUP_BENTO_BASE_IMAGE %}

View File

@@ -116,7 +116,7 @@ jobs:
bentoml-version: 'main'
python-version-file: .python-version-default
- name: Install self
run: pip install ./openllm-python
run: bash local.sh
- name: Speed
run: hyperfine -m 100 --warmup 10 openllm
brew-dry-run:

View File

@@ -66,11 +66,15 @@ jobs:
echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT
echo github.ref ${{ github.ref }}
pure-wheels-sdist:
name: Pure wheels and sdist distribution
name: Pure wheels and sdist distribution (${{ matrix.directory }})
runs-on: ubuntu-latest
needs: get_commit_message
if: >-
contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
strategy:
fail-fast: false
matrix:
directory: ["openllm-core", "openllm-python", "openllm-client"]
steps:
- uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # ratchet:actions/checkout@v3
with:
@@ -82,37 +86,27 @@ jobs:
python-version-file: .python-version-default
- name: Build
run: hatch build
working-directory: openllm-python
working-directory: ${{ matrix.directory }}
- name: Upload artifacts
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3
with:
name: python-artefacts
path: openllm-python/dist/*
path: ${{ matrix.directory }}/dist/*
if-no-files-found: error
mypyc:
name: Compiled mypyc wheels (${{ matrix.name }})
runs-on: ${{ matrix.os }}
defaults:
run:
working-directory: ./openllm-python
name: Compiled wheels (${{ matrix.buildplatform[1] }}-${{ matrix.directory }})
runs-on: ${{ matrix.buildplatform[0] }}
strategy:
fail-fast: false
matrix:
include:
- os: ubuntu-latest
name: linux-x86_64
# NOTE: darwin amd64
- os: macos-latest
name: macos-x86_64
macos_arch: "x86_64"
# NOTE: darwin arm64
- os: macos-latest
name: macos-arm64
macos_arch: "arm64"
# NOTE: darwin universal2
- os: macos-latest
name: macos-universal2
macos_arch: "universal2"
# Github Actions doesn't support pairing matrix values together, let's improvise
# https://github.com/github/feedback/discussions/7835#discussioncomment-1769026
buildplatform:
- [ubuntu-latest, linux-x86_64, ""]
- [macos-latest, macos-x86_64, "x86_64"]
- [macos-latest, macos-arm64, "arm64"]
- [macos-latest, macos-universal2, "universal2"]
directory: ["openllm-core", "openllm-python", "openllm-client"]
needs: get_commit_message
if: >-
contains(needs.get_commit_message.outputs.message, '[wheel build]') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_call' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, '02 - Wheel Build')) || (github.event_name == 'push' && (startsWith(github.ref, 'refs/tags/v') || startsWith(github.ref, 'refs/heads/main')))
@@ -128,17 +122,16 @@ jobs:
- name: Build wheels via cibuildwheel
uses: pypa/cibuildwheel@39a63b5912f086dd459cf6fcb13dcdd3fe3bc24d # ratchet:pypa/cibuildwheel@v2.15.0
with:
package-dir: openllm-python
package-dir: ${{ matrix.directory }}
config-file: pyproject.toml
env:
CIBW_PRERELEASE_PYTHONS: True
CIBW_BEFORE_BUILD_MACOS: "rustup target add aarch64-apple-darwin"
CIBW_ARCHS_MACOS: "${{ matrix.macos_arch }}"
CIBW_ARCHS_MACOS: "${{ matrix.buildplatform[2] }}"
MYPYPATH: /project/typings
- name: Upload wheels as workflow artifacts
uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # ratchet:actions/upload-artifact@v3
with:
name: ${{ matrix.name }}-mypyc-wheels
name: ${{ matrix.buildplatform[1] }}-mypyc-wheels
path: ./wheelhouse/*.whl
check-download-artefacts:
name: dry-run for downloading artefacts

2
.gitignore vendored
View File

@@ -141,4 +141,4 @@ pyapp
/target
.pdm-python
/openllm-python/src/openllm/_version.py
**/_version.py

View File

@@ -72,7 +72,7 @@ Before you can start developing, you'll need to set up your environment:
dependencies.
> [!NOTE]
> If you want to install editable, make sure to install it from `openllm-python` folder
> If you don't want to work with hatch, you can use the editable workflow by running `bash local.sh`.
## Project Structure

View File

@@ -0,0 +1,5 @@
OpenLLM now comprises three packages:
- openllm-core: the main building blocks of OpenLLM, which don't depend on transformers or other heavy DL libraries
- openllm-client: The implementation of `openllm.client`
- openllm: = openllm-core + openllm-client + DL features

View File

@@ -1,4 +1,7 @@
#!/usr/bin/env bash
# Build a wheel for each OpenLLM package with the hatch build hooks enabled.
# Any extra command-line arguments are forwarded to every `python -m build` call.

# Absolute path of the directory that contains this script.
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)

# Install the pinned build tooling.
pip install -U mypy==1.5.1 build==0.10.0

# Packages are built in this fixed order: openllm-python, openllm-core, openllm-client.
for package in openllm-python openllm-core openllm-client; do
  HATCH_VERBOSE=3 MYPY_CONFIG_FILE_DIR="${SCRIPT_DIR}" HATCH_BUILD_HOOKS_ENABLE=1 MYPYPATH="${SCRIPT_DIR}/typings" \
    python -m build "${package}" -w -C--global-option=--verbose "$@"
done

hatch clean

10
cz.py
View File

@@ -4,21 +4,25 @@ import itertools, os, token, tokenize
from tabulate import tabulate
TOKEN_WHITELIST = [token.OP, token.NAME, token.NUMBER, token.STRING]
def main() -> int:
def run_cz(dir: str, package: str):
headers = ["Name", "Lines", "Tokens/Line"]
table = []
for path, _, files in os.walk(os.path.join("openllm-python", "src", "openllm")):
for path, _, files in os.walk(os.path.join(dir, "src", package)):
for name in files:
if not name.endswith(".py"): continue
filepath = os.path.join(path, name)
with tokenize.open(filepath) as file_:
tokens = [t for t in tokenize.generate_tokens(file_.readline) if t.type in TOKEN_WHITELIST]
token_count, line_count = len(tokens), len(set([t.start[0] for t in tokens]))
table.append([filepath.replace(os.path.join("openllm-python","src"), ""), line_count, token_count / line_count if line_count != 0 else 0])
table.append([filepath.replace(os.path.join(dir ,"src"), ""), line_count, token_count / line_count if line_count != 0 else 0])
print(tabulate([headers, *sorted(table, key=lambda x: -x[1])], headers="firstrow", floatfmt=".1f") + "\n")
for dir_name, group in itertools.groupby(sorted([(x[0].rsplit("/", 1)[0], x[1]) for x in table]), key=lambda x: x[0]):
print(f"{dir_name:35s} : {sum([x[1] for x in group]):6d}")
print(f"\ntotal line count: {sum([x[1] for x in table])}")
def main() -> int:
run_cz("openllm-python", "openllm")
run_cz("openllm-core", "openllm_core")
run_cz("openllm-client", "openllm_client")
return 0
if __name__ == "__main__": raise SystemExit(main())

View File

@@ -1,5 +1,7 @@
[envs.default]
dependencies = [
"openllm-core @ {root:uri}/openllm-core",
"openllm-client @ {root:uri}/openllm-client",
"openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python",
# NOTE: To run all hooks
"pre-commit",
@@ -26,7 +28,6 @@ check-stubs = [
"./tools/update-models-import.py",
"update-dummy",
]
compile = "bash ./compile.sh {args}"
inplace-changelog = "towncrier build --version main --keep"
quality = [
"./tools/dependencies.py",
@@ -36,13 +37,12 @@ quality = [
"check-stubs",
"- pre-commit run --all-files",
]
recompile = ["bash ./clean.sh", "compile"]
setup = [
"pre-commit install",
"- ln -s .python-version-default .python-version",
"curl -fsSL https://raw.githubusercontent.com/clj-kondo/clj-kondo/master/script/install-clj-kondo | bash -",
]
tool = ["quality", "recompile -nx"]
tool = ["quality", "bash ./clean.sh", "bash ./compile.sh {args}"]
typing = [
"- pre-commit run mypy {args:-a}",
"- pre-commit run pyright {args:-a}",
@@ -50,6 +50,8 @@ typing = [
update-dummy = ["- ./tools/update-dummy.py", "./tools/update-dummy.py"]
[envs.tests]
dependencies = [
"openllm-core @ {root:uri}/openllm-core",
"openllm-client @ {root:uri}/openllm-client",
"openllm[opt,chatglm,fine-tune] @ {root:uri}/openllm-python",
# NOTE: interact with docker for container tests.
"docker",
@@ -94,7 +96,10 @@ clojure = ["bash tools/run-clojure-ui.sh"]
[envs.ci]
detached = true
[envs.ci.scripts]
compile = "bash ./compile.sh {args}"
recompile = ["bash ./clean.sh", "compile"]
edi = "bash local.sh"
lock = [
"bash tools/lock-actions",
"bash tools/lock-actions.sh",
"pushd contrib/clojure && pnpm i --frozen-lockfile",
]

10
local.sh Executable file
View File

@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Install the three OpenLLM packages in editable mode from the current git checkout.
# -e aborts on the first failing command; -x echoes each command as it runs.
set -ex

# Resolve the repository root so the script works from any subdirectory.
GIT_ROOT=$(git rev-parse --show-toplevel)
cd "$GIT_ROOT" || exit 1

# Install order: openllm-core, openllm-client, then openllm-python.
for package in openllm-core openllm-client openllm-python; do
  pip install -e "$GIT_ROOT/$package" -v
done

View File

@@ -0,0 +1 @@
../.git_archival.txt

View File

@@ -0,0 +1 @@
../.python-version-default

194
openllm-client/LICENSE.md Normal file
View File

@@ -0,0 +1,194 @@
Apache License
==============
_Version 2.0, January 2004_
_&lt;<http://www.apache.org/licenses/>&gt;_
### Terms and Conditions for use, reproduction, and distribution
#### 1. Definitions
“License” shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.
“Licensor” shall mean the copyright owner or entity authorized by the copyright
owner that is granting the License.
“Legal Entity” shall mean the union of the acting entity and all other entities
that control, are controlled by, or are under common control with that entity.
For the purposes of this definition, “control” means **(i)** the power, direct or
indirect, to cause the direction or management of such entity, whether by
contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the
outstanding shares, or **(iii)** beneficial ownership of such entity.
“You” (or “Your”) shall mean an individual or Legal Entity exercising
permissions granted by this License.
“Source” form shall mean the preferred form for making modifications, including
but not limited to software source code, documentation source, and configuration
files.
“Object” form shall mean any form resulting from mechanical transformation or
translation of a Source form, including but not limited to compiled object code,
generated documentation, and conversions to other media types.
“Work” shall mean the work of authorship, whether in Source or Object form, made
available under the License, as indicated by a copyright notice that is included
in or attached to the work (an example is provided in the Appendix below).
“Derivative Works” shall mean any work, whether in Source or Object form, that
is based on (or derived from) the Work and for which the editorial revisions,
annotations, elaborations, or other modifications represent, as a whole, an
original work of authorship. For the purposes of this License, Derivative Works
shall not include works that remain separable from, or merely link (or bind by
name) to the interfaces of, the Work and Derivative Works thereof.
“Contribution” shall mean any work of authorship, including the original version
of the Work and any modifications or additions to that Work or Derivative Works
thereof, that is intentionally submitted to Licensor for inclusion in the Work
by the copyright owner or by an individual or Legal Entity authorized to submit
on behalf of the copyright owner. For the purposes of this definition,
“submitted” means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and
issue tracking systems that are managed by, or on behalf of, the Licensor for
the purpose of discussing and improving the Work, but excluding communication
that is conspicuously marked or otherwise designated in writing by the copyright
owner as “Not a Contribution.”
“Contributor” shall mean Licensor and any individual or Legal Entity on behalf
of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.
#### 2. Grant of Copyright License
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the Work and such
Derivative Works in Source or Object form.
#### 3. Grant of Patent License
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to make, have
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
such license applies only to those patent claims licensable by such Contributor
that are necessarily infringed by their Contribution(s) alone or by combination
of their Contribution(s) with the Work to which such Contribution(s) was
submitted. If You institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
Contribution incorporated within the Work constitutes direct or contributory
patent infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.
#### 4. Redistribution
You may reproduce and distribute copies of the Work or Derivative Works thereof
in any medium, with or without modifications, and in Source or Object form,
provided that You meet the following conditions:
* **(a)** You must give any other recipients of the Work or Derivative Works a copy of
this License; and
* **(b)** You must cause any modified files to carry prominent notices stating that You
changed the files; and
* **(c)** You must retain, in the Source form of any Derivative Works that You distribute,
all copyright, patent, trademark, and attribution notices from the Source form
of the Work, excluding those notices that do not pertain to any part of the
Derivative Works; and
* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any
Derivative Works that You distribute must include a readable copy of the
attribution notices contained within such NOTICE file, excluding those notices
that do not pertain to any part of the Derivative Works, in at least one of the
following places: within a NOTICE text file distributed as part of the
Derivative Works; within the Source form or documentation, if provided along
with the Derivative Works; or, within a display generated by the Derivative
Works, if and wherever such third-party notices normally appear. The contents of
the NOTICE file are for informational purposes only and do not modify the
License. You may add Your own attribution notices within Derivative Works that
You distribute, alongside or as an addendum to the NOTICE text from the Work,
provided that such additional attribution notices cannot be construed as
modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a whole,
provided Your use, reproduction, and distribution of the Work otherwise complies
with the conditions stated in this License.
#### 5. Submission of Contributions
Unless You explicitly state otherwise, any Contribution intentionally submitted
for inclusion in the Work by You to the Licensor shall be under the terms and
conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of
any separate license agreement you may have executed with Licensor regarding
such Contributions.
#### 6. Trademarks
This License does not grant permission to use the trade names, trademarks,
service marks, or product names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the NOTICE file.
#### 7. Disclaimer of Warranty
Unless required by applicable law or agreed to in writing, Licensor provides the
Work (and each Contributor provides its Contributions) on an “AS IS” BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE,
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
solely responsible for determining the appropriateness of using or
redistributing the Work and assume any risks associated with Your exercise of
permissions under this License.
#### 8. Limitation of Liability
In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special, incidental,
or consequential damages of any character arising as a result of this License or
out of the use or inability to use the Work (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has
been advised of the possibility of such damages.
#### 9. Accepting Warranty or Additional Liability
While redistributing the Work or Derivative Works thereof, You may choose to
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
other liability obligations and/or rights consistent with this License. However,
in accepting such obligations, You may act only on Your own behalf and on Your
sole responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason of your
accepting any such warranty or additional liability.
_END OF TERMS AND CONDITIONS_
### APPENDIX: How to apply the Apache License to your work
To apply the Apache License to your work, attach the following boilerplate
notice, with the fields enclosed by brackets `[]` replaced with your own
identifying information. (Don't include the brackets!) The text should be
enclosed in the appropriate comment syntax for the file format. We also
recommend that a file or class name and description of purpose be included on
the same “printed page” as the copyright notice for easier identification within
third-party archives.
Copyright 2023 Atalaya Tech Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

86
openllm-client/README.md Normal file
View File

@@ -0,0 +1,86 @@
![Banner for OpenLLM](/.github/assets/main-banner.png)
<!-- hatch-fancy-pypi-readme intro start -->
<div align="center">
<h1 align="center">👾 OpenLLM Client</h1>
<a href="https://pypi.org/project/openllm-client">
<img src="https://img.shields.io/pypi/v/openllm-client.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
</a><a href="https://test.pypi.org/project/openllm-client/">
<img src="https://img.shields.io/badge/Nightly-PyPI?logo=pypi&label=PyPI&color=gray&link=https%3A%2F%2Ftest.pypi.org%2Fproject%2Fopenllm-client%2F" alt="test_pypi_status" />
</a><a href="https://twitter.com/bentomlai">
<img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
</a><a href="https://l.bentoml.com/join-openllm-discord">
<img src="https://badgen.net/badge/icon/OpenLLM/7289da?icon=discord&label=Join%20Us" alt="Discord" />
</a><a href="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml">
<img src="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml/badge.svg?branch=main" alt="ci" />
</a><a href="https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main">
<img src="https://results.pre-commit.ci/badge/github/bentoml/OpenLLM/main.svg" alt="pre-commit.ci status" />
</a><br>
<a href="https://pypi.org/project/openllm-client">
<img src="https://img.shields.io/pypi/pyversions/openllm-client.svg?logo=python&label=Python&logoColor=gold" alt="python_version" />
</a><a href="https://github.com/pypa/hatch">
<img src="https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg" alt="Hatch" />
</a><a href="https://github.com/bentoml/OpenLLM/blob/main/STYLE.md">
<img src="https://img.shields.io/badge/code%20style-experimental-000000.svg" alt="code style" />
</a><a href="https://github.com/astral-sh/ruff">
<img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json" alt="Ruff" />
</a><a href="https://github.com/python/mypy">
<img src="https://img.shields.io/badge/types-mypy-blue.svg" alt="types - mypy" />
</a><a href="https://github.com/microsoft/pyright">
<img src="https://img.shields.io/badge/types-pyright-yellow.svg" alt="types - pyright" />
</a><br>
<p>OpenLLM Client: Interacting with OpenLLM HTTP/gRPC server, or any BentoML server.<br/></p>
<i></i>
</div>
## 📖 Introduction
With OpenLLM, you can run inference with any open-source large-language models,
deploy to the cloud or on-premises, and build powerful AI apps, and more.
To learn more about OpenLLM, please visit <a href="https://github.com/bentoml/OpenLLM">OpenLLM's README.md</a>.
This package holds the underlying client implementation for OpenLLM. If you are
coming from OpenLLM, the client can be accessed via `openllm.client`.
It provides APIs similar to [`bentoml.Client`](https://docs.bentoml.com/en/latest/guides/client.html)
(via `openllm_client.benmin`) for interacting with an OpenLLM server. It can also be extended to work with a general
BentoML server.
> [!NOTE]
> The interop component for generic BentoML servers is considered experimental and may be merged back into BentoML.
> If you are only using this package to interact with an OpenLLM server, nothing changes in the `openllm.client` namespace.
```python
import openllm
client = openllm.client.HTTPClient()
client.query('Explain to me the difference between "further" and "farther"')
```
<!-- hatch-fancy-pypi-readme intro stop -->
![Gif showing OpenLLM Intro](/.github/assets/output.gif)
<br/>
<!-- hatch-fancy-pypi-readme interim start -->
## 📔 Citation
If you use OpenLLM in your research, we provide a [citation](../CITATION.cff) to use:
```bibtex
@software{Pham_OpenLLM_Operating_LLMs_2023,
author = {Pham, Aaron and Yang, Chaoyu and Sheng, Sean and Zhao, Shenyang and Lee, Sauyon and Jiang, Bo and Dong, Fog and Guan, Xipeng and Ming, Frost},
license = {Apache-2.0},
month = jun,
title = {{OpenLLM: Operating LLMs in production}},
url = {https://github.com/bentoml/OpenLLM},
year = {2023}
}
```
<!-- hatch-fancy-pypi-readme interim stop -->

View File

@@ -0,0 +1,172 @@
[build-system]
build-backend = "hatchling.build"
requires = [
"hatchling==1.18.0",
"hatch-vcs==0.3.0",
"hatch-fancy-pypi-readme==23.1.0",
]
[project]
authors = [
{ name = "Aaron Pham", email = "aarnphm@bentoml.com" },
{ name = "BentoML Team", email = "contact@bentoml.com" },
]
dynamic = ['readme', 'version']
classifiers = [
"Development Status :: 5 - Production/Stable",
"Environment :: GPU :: NVIDIA CUDA",
"Environment :: GPU :: NVIDIA CUDA :: 12",
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
"Environment :: GPU :: NVIDIA CUDA :: 11.7",
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Intended Audience :: System Administrators",
"Typing :: Typed",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
description = "OpenLLM Client: Interacting with OpenLLM HTTP/gRPC server, or any BentoML server."
keywords = [
"MLOps",
"AI",
"BentoML",
"Model Serving",
"Model Deployment",
"LLMOps",
"Falcon",
"Vicuna",
"Llama 2",
"Fine tuning",
"Serverless",
"Large Language Model",
"Generative AI",
"StableLM",
"Alpaca",
"PyTorch",
"Transformers",
]
dependencies = ["openllm-core", "httpx"]
license = "Apache-2.0"
name = "openllm-client"
requires-python = ">=3.8"
[project.urls]
Blog = "https://modelserving.com"
Chat = "https://discord.gg/openllm"
Documentation = "https://github.com/bentoml/OpenLLM/blob/main/openllm-client/README.md"
GitHub = "https://github.com/bentoml/OpenLLM/blob/main/openllm-client"
History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md"
Homepage = "https://bentoml.com"
Tracker = "https://github.com/bentoml/OpenLLM/issues"
Twitter = "https://twitter.com/bentomlai"
[project.optional-dependencies]
full = ["openllm-client[grpc,agents]"]
grpc = ["bentoml[grpc]>=1.0.25"]
agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
[tool.hatch.version]
fallback-version = "0.0.0"
source = "vcs"
[tool.hatch.build.hooks.vcs]
version-file = "src/openllm_client/_version.py"
[tool.hatch.version.raw-options]
git_describe_command = [
"git",
"describe",
"--dirty",
"--tags",
"--long",
"--first-parent",
]
local_scheme = "no-local-version"
root = ".."
[tool.hatch.metadata]
allow-direct-references = true
[tool.hatch.build.targets.wheel]
only-include = ["src/openllm_client"]
sources = ["src"]
[tool.hatch.build.targets.sdist]
exclude = ["/.git_archival.txt", "tests", "/.python-version-default"]
[tool.hatch.build.targets.wheel.hooks.mypyc]
dependencies = [
"hatch-mypyc==0.16.0",
"mypy==1.5.1",
# avoid https://github.com/pallets/click/issues/2558
"click==8.1.3",
"bentoml==1.1.2",
"transformers>=4.31.0",
"pandas-stubs",
"types-psutil",
"types-tabulate",
"types-PyYAML",
"types-protobuf",
]
enable-by-default = false
include = ["src/openllm_client/__init__.py", "src/openllm_client/client.py"]
# NOTE: This is consistent with pyproject.toml
mypy-args = [
"--strict",
# this is needed because the transitive dependencies don't have types
"--allow-subclassing-any",
"--check-untyped-defs",
"--ignore-missing-imports",
"--no-warn-return-any",
"--warn-unreachable",
"--no-warn-no-return",
"--no-warn-unused-ignores",
]
options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
require-runtime-dependencies = true
[tool.hatch.metadata.hooks.fancy-pypi-readme]
content-type = "text/markdown"
# PyPI doesn't support the <picture> tag.
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
<p align="center">
<a href="https://github.com/bentoml/openllm">
<img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/main-banner.png" alt="Banner for OpenLLM" />
</a>
</p>
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
end-before = "\n<!-- hatch-fancy-pypi-readme intro stop -->"
path = "README.md"
start-after = "<!-- hatch-fancy-pypi-readme intro start -->\n"
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
<p align="center">
<img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/output.gif" alt="Gif showing OpenLLM Intro" />
</p>
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
<p align="center">
<img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/agent.gif" alt="Gif showing Agent integration" />
</p>
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
end-before = "\n<!-- hatch-fancy-pypi-readme interim stop -->"
path = "README.md"
start-after = "<!-- hatch-fancy-pypi-readme interim start -->\n"
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
---
[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md)
"""

View File

@@ -0,0 +1,5 @@
from __future__ import annotations
from . import benmin as benmin
from ._base import BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient
from .client import AsyncHTTPClient as AsyncHTTPClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient

View File

View File

@@ -0,0 +1,203 @@
# mypy: disable-error-code="override,no-redef"
from __future__ import annotations
import typing as t, functools, openllm_core, logging, httpx, orjson, attr, abc
from http import HTTPStatus
from urllib.parse import urljoin
from .benmin import Client as BentoClient, AsyncClient as AsyncBentoClient
from openllm_core.utils import is_transformers_supports_agent, is_transformers_available, bentoml_cattr, ensure_exec_coro
from openllm_core._typing_compat import overload, LiteralString
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralRuntime, DictStrAny
import transformers
logger = logging.getLogger(__name__)
@attr.define(slots=False, init=False)
class _ClientAttr:
  """State and metadata accessors shared by the synchronous and asynchronous clients.

  Stores the server address, a timeout and the API version suffix. The properties below read
  individual fields out of whatever ``self.call("metadata")`` returns. ``call``, ``query``,
  ``predict`` and ``_run_hf_agent`` only raise ``NotImplementedError`` here.
  """
  _address: str
  _timeout: float = attr.field(default=30)
  _api_version: str = attr.field(default="v1")

  def __init__(self, address: str, timeout: float = 30, api_version: str = "v1"):
    # The class is declared with 'init=False', so the attrs-generated initialiser is invoked by hand.
    self.__attrs_init__(address, timeout, api_version)

  @abc.abstractmethod
  def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any:
    """Invoke the remote API named ``api_name``; not implemented on this class."""
    raise NotImplementedError

  @abc.abstractmethod
  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
    """Run a task through the HF agent; not implemented on this class."""
    raise NotImplementedError

  @overload
  @abc.abstractmethod
  def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
  @overload
  @abc.abstractmethod
  def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
  @overload
  @abc.abstractmethod
  def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: ...
  @abc.abstractmethod
  def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any:
    """Send ``prompt`` for generation; not implemented on this class."""
    raise NotImplementedError

  # NOTE: Scikit interface
  @overload
  @abc.abstractmethod
  def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
  @overload
  @abc.abstractmethod
  def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
  @overload
  @abc.abstractmethod
  def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm_core.GenerationOutput: ...
  @abc.abstractmethod
  def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
    """Scikit-style counterpart of ``query``; not implemented on this class."""
    raise NotImplementedError

  @functools.cached_property
  def _hf_agent(self) -> transformers.HfAgent:
    """Build (once) a ``transformers.HfAgent`` that targets ``<address>/hf/agent``.

    Raises:
      RuntimeError: if transformers is missing, the server reports no HF agent support, or the
        installed transformers has no agent support.
    """
    if not is_transformers_available():
      raise RuntimeError("transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
    if not self.supports_hf_agent:
      raise RuntimeError(f"{self.model_name} ({self.framework}) does not support running HF agent.")
    if not is_transformers_supports_agent():
      raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
    import transformers
    agent_endpoint = urljoin(self._address, "/hf/agent")
    return transformers.HfAgent(agent_endpoint)

  @property
  def _metadata(self) -> t.Any:
    """Result of calling the ``metadata`` API; fetched again on every access."""
    return self.call("metadata")

  def _metadata_lookup(self, key: str, *, fallback_to_false: bool = False) -> t.Any:
    """Read ``key`` from the metadata, turning any ``KeyError`` into a ``RuntimeError``.

    With ``fallback_to_false`` the value is read via ``.get(key, False)`` instead of subscripting.
    """
    try:
      metadata = self._metadata
      if fallback_to_false:
        return metadata.get(key, False)
      return metadata[key]
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def model_name(self) -> str:
    return self._metadata_lookup("model_name")

  @property
  def model_id(self) -> str:
    return self._metadata_lookup("model_id")

  @property
  def framework(self) -> LiteralRuntime:
    return self._metadata_lookup("framework")

  @property
  def timeout(self) -> int:
    return self._metadata_lookup("timeout")

  @property
  def configuration(self) -> dict[str, t.Any]:
    """The ``configuration`` metadata field, decoded from JSON."""
    encoded = self._metadata_lookup("configuration")
    return orjson.loads(encoded)

  @property
  def supports_embeddings(self) -> bool:
    return self._metadata_lookup("supports_embeddings", fallback_to_false=True)

  @property
  def supports_hf_agent(self) -> bool:
    return self._metadata_lookup("supports_hf_agent", fallback_to_false=True)

  @property
  def config(self) -> openllm_core.LLMConfig:
    """LLM config for ``model_name``, constructed with the server-reported ``configuration``."""
    config_cls = openllm_core.AutoConfig.for_model(self.model_name)
    return config_cls.model_construct_env(**self.configuration)

  @functools.cached_property
  def inner(self) -> t.Any:
    """Underlying transport client; subclasses override this."""
    raise NotImplementedError("'inner' client is not implemented.")
class _Client(_ClientAttr):
  """Synchronous client plumbing built on the benmin ``Client``.

  ``_host`` and ``_port`` are read by ``inner`` but never assigned in this class
  (presumably set by concrete subclasses -- TODO confirm).
  """
  _host: str
  _port: str

  def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any:
    """Call ``<api_name>_<api_version>`` on the underlying client and return its result."""
    return self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs)

  def health(self) -> t.Any:
    """Return the underlying client's health-check result."""
    return self.inner.health()

  @functools.cached_property
  def inner(self) -> BentoClient:
    """Wait until the server is ready, then build (once) the underlying client from the address."""
    BentoClient.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout)
    return BentoClient.from_url(self._address)

  # Agent integration
  def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
    """Run ``task`` through an agent.

    Raises:
      RuntimeError: if ``agent_type`` is anything other than ``"hf"``.
    """
    if agent_type == "hf": return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")

  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
    """Forward a task to the HF agent's ``run``.

    The task may be given either as the single positional argument or as ``task=...``; the
    keyword wins when both are present. ``return_code`` and ``remote`` default to ``False``;
    any remaining keyword arguments are passed through to ``run``.

    Returns:
      The agent's result, or ``None`` when the agent raised (the error is logged, not re-raised).

    Raises:
      ValueError: if more than one positional argument is given, or no task is given at all.
    """
    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
    # 'kwargs.pop("task", args[0])' would evaluate 'args[0]' even when 'task' is passed by keyword,
    # raising IndexError on an empty 'args'; resolve the two sources explicitly instead.
    if "task" in kwargs: task = kwargs.pop("task")
    elif args: task = args[0]
    else: raise ValueError("'task' is required, either as the only positional argument or as a keyword argument.")
    return_code = kwargs.pop("return_code", False)
    remote = kwargs.pop("remote", False)
    try:
      return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs)
    except Exception as err:
      # Best effort: report the failure and fall through to an implicit 'None' result.
      logger.error("Exception caught while sending instruction to HF agent: %s", err, exc_info=err)
      logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address)
      return None
class _AsyncClient(_ClientAttr):
  """Asynchronous client plumbing built on the benmin ``AsyncClient``.

  ``_host`` and ``_port`` are read by ``inner`` but never assigned in this class
  (presumably set by concrete subclasses -- TODO confirm).

  NOTE(review): the metadata properties inherited from the base class go through ``call``, which is
  a coroutine function here -- confirm they resolve as intended in an async context.
  """
  _host: str
  _port: str

  def __init__(self, address: str, timeout: float = 30, api_version: str = "v1"):
    # Delegate to the base initialiser so that '_api_version' is populated as well; assigning only
    # '_address' and '_timeout' left 'call' failing on the missing '_api_version' attribute.
    super().__init__(address, timeout, api_version)

  async def call(self, api_name: str, *args: t.Any, **attrs: t.Any) -> t.Any:
    """Await ``<api_name>_<api_version>`` on the underlying client and return its result."""
    return await self.inner.call(f"{api_name}_{self._api_version}", *args, **attrs)

  async def health(self) -> t.Any:
    """Await the underlying client's health check."""
    return await self.inner.health()

  @functools.cached_property
  def inner(self) -> AsyncBentoClient:
    """Wait until the server is ready, then build (once) the underlying async client.

    Both coroutines are driven to completion through ``ensure_exec_coro``.
    """
    ensure_exec_coro(AsyncBentoClient.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout))
    return ensure_exec_coro(AsyncBentoClient.from_url(self._address))

  # Agent integration
  async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
    """Async version of agent.run.

    Raises:
      RuntimeError: if ``agent_type`` is anything other than ``"hf"``.
    """
    if agent_type == "hf": return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")

  async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
    """Send a task to the HF agent endpoint over HTTP and evaluate (or return) the generated code.

    The task may be given either as the single positional argument or as ``task=...``; the
    keyword wins when both are present.

    Returns:
      With ``return_code=False`` (default) the result of evaluating the generated code, with the
      remaining keyword arguments as initial state; otherwise the tool-creation code followed by
      the generated code, as a string.

    Raises:
      RuntimeError: if the installed transformers does not support agents.
      ValueError: on more than one positional argument, a missing task, or a non-OK HTTP status.
    """
    if not is_transformers_supports_agent(): raise RuntimeError("This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0")
    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
    from transformers.tools.agents import clean_code_for_run, get_tool_creation_code, resolve_tools
    from transformers.tools.python_interpreter import evaluate
    # 'kwargs.pop("task", args[0])' would evaluate 'args[0]' even when 'task' is passed by keyword,
    # raising IndexError on an empty 'args'; resolve the two sources explicitly instead.
    if "task" in kwargs: task = kwargs.pop("task")
    elif args: task = args[0]
    else: raise ValueError("'task' is required, either as the only positional argument or as a keyword argument.")
    return_code = kwargs.pop("return_code", False)
    remote = kwargs.pop("remote", False)
    stop = ["Task:"]
    prompt = t.cast(str, self._hf_agent.format_prompt(task))
    async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client:
      response = await client.post(self._hf_agent.url_endpoint, json={"inputs": prompt, "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop}})
      if response.status_code != HTTPStatus.OK: raise ValueError(f"Error {response.status_code}: {response.json()}")
    result = response.json()[0]["generated_text"]
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq):
        result = result[:-len(stop_seq)]
        break
    # the below have the same logic as agent.run API
    explanation, code = clean_code_for_run(result)
    self._hf_agent.log(f"==Explanation from the agent==\n{explanation}")
    self._hf_agent.log(f"\n\n==Code generated by the agent==\n{code}")
    if not return_code:
      self._hf_agent.log("\n\n==Result==")
      self._hf_agent.cached_tools = resolve_tools(code, self._hf_agent.toolbox, remote=remote, cached_tools=self._hf_agent.cached_tools)
      return evaluate(code, self._hf_agent.cached_tools, state=kwargs.copy())
    else:
      tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote)
      return f"{tool_code}\n{code}"
class BaseClient(_Client):
  """Synchronous client exposing the ``embed``, ``query`` and ``predict`` operations."""

  def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
    """Not implemented."""
    raise NotImplementedError

  def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput:
    """Call the ``embeddings`` API with one prompt or a sequence of prompts."""
    # A lone string is wrapped so that the API always receives a list.
    prompts = [prompt] if isinstance(prompt, str) else list(prompt)
    return openllm_core.EmbeddingsOutput(**self.call("embeddings", prompts))

  def predict(self, prompt: str, **attrs: t.Any) -> openllm_core.GenerationOutput | DictStrAny | str:
    """Alias of ``query``."""
    return self.query(prompt, **attrs)

  def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any:
    """Call the ``generate`` API for ``prompt``.

    ``return_response`` selects the result form: ``"attrs"`` gives the ``GenerationOutput``,
    ``"raw"`` its unstructured form, anything else the config's post-processed responses.
    The deprecated ``return_raw_response`` / ``return_attrs`` keywords are still honoured
    (with a warning) and, when ``True``, override ``return_response``.
    """
    legacy_raw = attrs.pop("return_raw_response", None)
    if legacy_raw is not None:
      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
      if legacy_raw is True:
        return_response = "raw"
    legacy_attrs = attrs.pop("return_attrs", None)
    if legacy_attrs is not None:
      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
      if legacy_attrs is True:
        return_response = "attrs"
    with_default_template = attrs.pop("use_default_prompt_template", False)
    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=with_default_template, **attrs)
    request = openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))
    output = openllm_core.GenerationOutput(**self.call("generate", request.model_dump()))
    if return_response == "attrs":
      return output
    if return_response == "raw":
      return bentoml_cattr.unstructure(output)
    return self.config.postprocess_generate(prompt, output.responses, **postprocess_kwargs)
class BaseAsyncClient(_AsyncClient):
  """Asynchronous client exposing the ``embed``, ``query`` and ``predict`` operations."""

  async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
    """Not implemented."""
    raise NotImplementedError

  async def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput:
    """Await the ``embeddings`` API with one prompt or a sequence of prompts."""
    # A lone string is wrapped so that the API always receives a list.
    prompts = [prompt] if isinstance(prompt, str) else list(prompt)
    payload = await self.call("embeddings", prompts)
    return openllm_core.EmbeddingsOutput(**payload)

  async def predict(self, prompt: str, **attrs: t.Any) -> t.Any:
    """Alias of ``query``."""
    return await self.query(prompt, **attrs)

  async def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> t.Any:
    """Await the ``generate`` API for ``prompt``.

    ``return_response`` selects the result form: ``"attrs"`` gives the ``GenerationOutput``,
    ``"raw"`` its unstructured form, anything else the config's post-processed responses.
    The deprecated ``return_raw_response`` / ``return_attrs`` keywords are still honoured
    (with a warning) and, when ``True``, override ``return_response``.
    """
    legacy_raw = attrs.pop("return_raw_response", None)
    if legacy_raw is not None:
      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
      if legacy_raw is True:
        return_response = "raw"
    legacy_attrs = attrs.pop("return_attrs", None)
    if legacy_attrs is not None:
      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
      if legacy_attrs is True:
        return_response = "attrs"
    with_default_template = attrs.pop("use_default_prompt_template", False)
    prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, use_default_prompt_template=with_default_template, **attrs)
    request = openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))
    payload = await self.call("generate", request.model_dump())
    output = openllm_core.GenerationOutput(**payload)
    if return_response == "attrs":
      return output
    if return_response == "raw":
      return bentoml_cattr.unstructure(output)
    return self.config.postprocess_generate(prompt, output.responses, **postprocess_kwargs)

View File

@@ -0,0 +1,87 @@
"""This holds a simple client implementation, somewhat similar to `bentoml.client`.
This module is subject to change and is intended to be merged upstream into BentoML.
```python
import openllm_client
client = openllm_client.benmin.Client.from_url("http://localhost:3000")
```
The client implementation won't include a dynamic assignment of the service endpoints, rather this should be called
via `client.call` or `await client.call`.
"""
from __future__ import annotations
import typing as t, bentoml, attr, httpx
from abc import abstractmethod
if t.TYPE_CHECKING: from bentoml._internal.service.inference_api import InferenceAPI
__all__ = ["Client", "AsyncClient"]
@attr.define(init=False)
class Client:
  """Base of the synchronous clients: holds the server URL, the reflected service and a timeout.

  ``_call`` and ``health`` only raise ``NotImplementedError`` here; ``from_url`` and
  ``wait_until_server_ready`` try the HTTP implementation first and switch to gRPC when
  ``httpx.RemoteProtocolError`` is raised.
  """
  server_url: str
  endpoints: t.List[str]
  svc: bentoml.Service
  timeout: int = attr.field(default=30)

  def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None:
    """Store the URL and service; every extra keyword argument is set as an attribute on the instance.

    Raises:
      bentoml.exceptions.BentoMLException: if ``svc`` has no APIs.
    """
    api_count = len(svc.apis)
    if api_count == 0:
      raise bentoml.exceptions.BentoMLException("No APIs was found while constructing clients.")
    self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc)
    for attr_name, attr_value in kwargs.items():
      object.__setattr__(self, attr_name, attr_value)

  def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any:
    """Look up ``bentoml_api_name`` among the service APIs and dispatch to ``_call``."""
    target_api = self.svc.apis[bentoml_api_name]
    return self._call(data, _inference_api=target_api, **kwargs)

  @abstractmethod
  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
    """Transport-specific invocation of ``_inference_api``; not implemented here."""
    raise NotImplementedError

  @abstractmethod
  def health(self) -> t.Any:
    """Transport-specific health check; not implemented here."""
    raise NotImplementedError

  @classmethod
  def from_url(cls, url: str, **kwargs: t.Any) -> Client:
    """Create a client for ``url``: HTTP first, gRPC on ``httpx.RemoteProtocolError``.

    Raises:
      bentoml.exceptions.BentoMLException: wrapping any other exception raised by the HTTP attempt.
    """
    try:
      from ._http import HttpClient
      return HttpClient.from_url(url, **kwargs)
    except httpx.RemoteProtocolError:
      from ._grpc import GrpcClient
      return GrpcClient.from_url(url, **kwargs)
    except Exception as err:
      raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err

  @staticmethod
  def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None:
    """Wait for the server at ``host:port``: HTTP first, gRPC on ``httpx.RemoteProtocolError``.

    Raises:
      bentoml.exceptions.BentoMLException: wrapping any other exception raised by the HTTP attempt.
    """
    try:
      from ._http import HttpClient
      return HttpClient.wait_until_server_ready(host, port, timeout, **kwargs)
    except httpx.RemoteProtocolError:
      from ._grpc import GrpcClient
      return GrpcClient.wait_until_server_ready(host, port, timeout, **kwargs)
    except Exception as err:
      raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err
@attr.define(init=False)
class AsyncClient:
  """Base of the asynchronous clients: holds the server URL, the reflected service and a timeout.

  ``_call`` and ``health`` only raise ``NotImplementedError`` here; ``from_url`` and
  ``wait_until_server_ready`` try the HTTP implementation first and switch to gRPC when
  ``httpx.RemoteProtocolError`` is raised.
  """
  server_url: str
  endpoints: t.List[str]
  svc: bentoml.Service
  timeout: int = attr.field(default=30)

  def __init__(self, server_url: str, svc: bentoml.Service, **kwargs: t.Any) -> None:
    """Store the URL and service; every extra keyword argument is set as an attribute on the instance.

    Raises:
      bentoml.exceptions.BentoMLException: if ``svc`` has no APIs.
    """
    api_count = len(svc.apis)
    if api_count == 0:
      raise bentoml.exceptions.BentoMLException("No APIs was found while constructing clients.")
    self.__attrs_init__(server_url=server_url, endpoints=list(svc.apis), svc=svc)
    for attr_name, attr_value in kwargs.items():
      object.__setattr__(self, attr_name, attr_value)

  async def call(self, bentoml_api_name: str, data: t.Any = None, **kwargs: t.Any) -> t.Any:
    """Look up ``bentoml_api_name`` among the service APIs and await ``_call``."""
    target_api = self.svc.apis[bentoml_api_name]
    return await self._call(data, _inference_api=target_api, **kwargs)

  @abstractmethod
  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
    """Transport-specific invocation of ``_inference_api``; not implemented here."""
    raise NotImplementedError

  @abstractmethod
  async def health(self) -> t.Any:
    """Transport-specific health check; not implemented here."""
    raise NotImplementedError

  @classmethod
  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncClient:
    """Create an async client for ``url``: HTTP first, gRPC on ``httpx.RemoteProtocolError``.

    Raises:
      bentoml.exceptions.BentoMLException: wrapping any other exception raised by the HTTP attempt.
    """
    try:
      from ._http import AsyncHttpClient
      return await AsyncHttpClient.from_url(url, **kwargs)
    except httpx.RemoteProtocolError:
      from ._grpc import AsyncGrpcClient
      return await AsyncGrpcClient.from_url(url, **kwargs)
    except Exception as err:
      raise bentoml.exceptions.BentoMLException("Failed to create client from url: %s" % url) from err

  @staticmethod
  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None:
    """Wait for the server at ``host:port``: HTTP first, gRPC on ``httpx.RemoteProtocolError``.

    Raises:
      bentoml.exceptions.BentoMLException: wrapping any other exception raised by the HTTP attempt.
    """
    try:
      from ._http import AsyncHttpClient
      await AsyncHttpClient.wait_until_server_ready(host, port, timeout, **kwargs)
    except httpx.RemoteProtocolError:
      from ._grpc import AsyncGrpcClient
      await AsyncGrpcClient.wait_until_server_ready(host, port, timeout, **kwargs)
    except Exception as err:
      raise bentoml.exceptions.BentoMLException("Failed to wait until server ready: %s:%d" % (host, port)) from err

View File

@@ -0,0 +1,187 @@
# mypy: disable-error-code="no-redef"
from __future__ import annotations
import typing as t, logging, time, functools, bentoml
from bentoml._internal.service.inference_api import InferenceAPI
from openllm_client.benmin import Client, AsyncClient
from openllm_core.utils import is_grpc_available, is_grpc_health_available, ensure_exec_coro
from openllm_core._typing_compat import NotRequired, overload
from bentoml.grpc.utils import load_from_file, import_generated_stubs
if not is_grpc_available() or not is_grpc_health_available(): raise ImportError("gRPC is required to use gRPC client. Install with 'pip install \"openllm-client[grpc]\"'.")
from grpc import aio
from google.protobuf import json_format
import grpc, grpc_health.v1.health_pb2 as pb_health, grpc_health.v1.health_pb2_grpc as services_health
pb, services = import_generated_stubs("v1")
if t.TYPE_CHECKING:
from bentoml.grpc.v1.service_pb2 import ServiceMetadataResponse
logger = logging.getLogger(__name__)
class ClientCredentials(t.TypedDict):
  # Optional credential entries for secure gRPC channels. In 'dispatch_channel' every entry is
  # forwarded as a keyword argument to 'grpc.ssl_channel_credentials': 'str' values are first
  # passed through 'load_from_file', any other value (e.g. 'bytes') is forwarded unchanged.
  root_certificates: NotRequired[t.Union[bytes, str]]
  private_key: NotRequired[t.Union[bytes, str]]
  certificate_chain: NotRequired[t.Union[bytes, str]]
@overload
def dispatch_channel(server_url: str, typ: t.Literal["async"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = ...) -> aio.Channel: ...
@overload
def dispatch_channel(server_url: str, typ: t.Literal["sync"], ssl: bool = ..., ssl_client_credentials: ClientCredentials | None = ..., options: t.Any | None = ..., compression: grpc.Compression | None = ..., interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> grpc.Channel: ...
def dispatch_channel(server_url: str, typ: t.Literal["async", "sync"] = "sync", ssl: bool = False, ssl_client_credentials: ClientCredentials | None = None, options: t.Any | None = None, compression: grpc.Compression | None = None, interceptors: t.Sequence[aio.ClientInterceptor] | None = None) -> aio.Channel | grpc.Channel:
  """Open a gRPC channel to ``server_url``.

  ``typ`` picks between the ``grpc.aio`` ("async") and the blocking ("sync") channel, ``ssl``
  between the secure and insecure variant. ``interceptors`` is only handed to the async channels.

  Raises:
    RuntimeError: if ``ssl`` is true but ``ssl_client_credentials`` is ``None``.
    ValueError: if ``typ`` is neither ``"async"`` nor ``"sync"``.
  """
  credentials = None
  if ssl:
    if ssl_client_credentials is None:
      raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
    # String entries are passed through 'load_from_file'; everything else is used as given.
    resolved = {}
    for field, material in ssl_client_credentials.items():
      resolved[field] = load_from_file(material) if isinstance(material, str) else material
    credentials = grpc.ssl_channel_credentials(**resolved)
  if typ == "async":
    if ssl:
      return aio.secure_channel(server_url, credentials=credentials, options=options, compression=compression, interceptors=interceptors)
    return aio.insecure_channel(server_url, options=options, compression=compression, interceptors=interceptors)
  if typ == "sync":
    if ssl:
      return grpc.secure_channel(server_url, credentials=credentials, options=options, compression=compression)
    return grpc.insecure_channel(server_url, options=options, compression=compression)
  raise ValueError(f"Unknown type: {typ}")
class GrpcClient(Client):
  """Synchronous gRPC implementation of the benmin ``Client``."""
  ssl: bool
  ssl_client_credentials: t.Optional[ClientCredentials]
  options: t.Any
  compression: t.Optional[grpc.Compression]

  def __init__(
      self,
      server_url: str,
      svc: bentoml.Service,
      # gRPC specific options
      ssl: bool = False,
      options: t.Any | None = None,
      compression: grpc.Compression | None = None,
      ssl_client_credentials: ClientCredentials | None = None,
      **kwargs: t.Any) -> None:
    """Record the channel options, then delegate the remaining setup to the base initialiser."""
    self.ssl, self.ssl_client_credentials, self.options, self.compression = ssl, ssl_client_credentials, options, compression
    super().__init__(server_url, svc, **kwargs)

  @functools.cached_property
  def inner(self) -> grpc.Channel:
    """The channel to ``server_url``, created once; secure when ``ssl`` is set.

    Raises:
      RuntimeError: if ``ssl`` is true but no client credentials were given.
    """
    if self.ssl:
      if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
      credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
      return grpc.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression)
    return grpc.insecure_channel(self.server_url, options=self.options, compression=self.compression)

  @staticmethod
  def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
    """Poll the gRPC health service until it reports SERVING or ``timeout`` seconds have passed.

    Channel options (``options``, ``compression``, ``ssl``, ``ssl_client_credentials``) are read
    from ``kwargs``. After the polling loop one final check is made.

    Raises:
      TimeoutError: if the final check does not report SERVING.
      grpc.RpcError: if the final check itself fails (logged, then re-raised).
    """
    with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel:
      req = pb_health.HealthCheckRequest()
      req.service = "bentoml.grpc.v1.BentoService"
      health_stub = services_health.HealthStub(channel)
      start_time = time.time()
      while time.time() - start_time < timeout:
        try:
          resp = health_stub.Check(req)
          if resp.status == pb_health.HealthCheckResponse.SERVING: break
          else: time.sleep(check_interval)
        except grpc.RpcError:
          logger.debug("Waiting for server to be ready...")
          time.sleep(check_interval)
      try:
        resp = health_stub.Check(req)
        if resp.status != pb_health.HealthCheckResponse.SERVING: raise TimeoutError(f"Timed out waiting {timeout} seconds for server at '{host}:{port}' to be ready.")
      except grpc.RpcError as err:
        logger.error("Caught RpcError while connecting to %s:%s:\n", host, port)
        logger.error(err)
        raise

  @classmethod
  def from_url(cls, url: str, **kwargs: t.Any) -> GrpcClient:
    """Fetch the service metadata from ``url`` and build a client around the reflected APIs.

    An API whose descriptors cannot be reconstructed is logged and left out.
    """
    with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="sync", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel:
      metadata = t.cast("ServiceMetadataResponse", channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest()))
    reflection = bentoml.Service(metadata.name)
    for api in metadata.apis:
      try:
        reflection.apis[api.name] = InferenceAPI[t.Any](None,
                                                        bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}),
                                                        bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}),
                                                        name=api.name, doc=api.docs)
      except Exception as e:
        # Two placeholders for two arguments; with a single '%s' logging cannot format the record.
        logger.error("Failed to instantiate client for API %s: %s", api.name, e)
    return cls(url, reflection, **kwargs)

  def health(self) -> t.Any:
    """Return the health service's ``Check`` response for the empty service name."""
    return services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service=""))

  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
    """Serialise the input to protobuf, invoke ``Call`` on the service stub and decode the reply.

    Keyword arguments prefixed with ``_grpc_channel_`` (timeout, metadata, credentials,
    wait_for_ready, compression) are removed from ``kwargs`` and passed to the stub call.

    Raises:
      ValueError: if a multi-input API is given positional ``data``.
    """
    channel_kwargs = {k: kwargs.pop(f"_grpc_channel_{k}", None) for k in {"timeout", "metadata", "credentials", "wait_for_ready", "compression"}}
    if _inference_api.multi_input:
      if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
      fake_resp = ensure_exec_coro(_inference_api.input.to_proto(kwargs))
    else: fake_resp = ensure_exec_coro(_inference_api.input.to_proto(data))
    # Reverse map from API object to its registered name.
    api_fn = {v: k for k, v in self.svc.apis.items()}
    stubs = services.BentoServiceStub(self.inner)
    proto = stubs.Call(pb.Request(**{"api_name": api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
    return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof("content"))))
class AsyncGrpcClient(AsyncClient):
  """Asynchronous (``grpc.aio``) implementation of the benmin ``AsyncClient``."""
  ssl: bool
  ssl_client_credentials: t.Optional[ClientCredentials]
  options: aio.ChannelArgumentType
  interceptors: t.Optional[t.Sequence[aio.ClientInterceptor]]
  compression: t.Optional[grpc.Compression]

  def __init__(
      self,
      server_url: str,
      svc: bentoml.Service,
      # gRPC specific options
      ssl: bool = False,
      options: aio.ChannelArgumentType | None = None,
      interceptors: t.Sequence[aio.ClientInterceptor] | None = None,
      compression: grpc.Compression | None = None,
      ssl_client_credentials: ClientCredentials | None = None,
      **kwargs: t.Any) -> None:
    """Record the channel options, then delegate the remaining setup to the base initialiser."""
    self.ssl, self.ssl_client_credentials, self.options, self.interceptors, self.compression = ssl, ssl_client_credentials, options, interceptors, compression
    super().__init__(server_url, svc, **kwargs)

  @functools.cached_property
  def inner(self) -> aio.Channel:
    """The aio channel to ``server_url``, created once; secure when ``ssl`` is set.

    Raises:
      RuntimeError: if ``ssl`` is true but no client credentials were given.
    """
    if self.ssl:
      if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'")
      credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()})
      return aio.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression, interceptors=self.interceptors)
    return aio.insecure_channel(self.server_url, options=self.options, compression=self.compression, interceptors=self.interceptors)

  @staticmethod
  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
    """Poll the gRPC health service until it reports SERVING or ``timeout`` seconds have passed.

    Channel options (``options``, ``compression``, ``ssl``, ``ssl_client_credentials``) are read
    from ``kwargs``. After the polling loop one final check is made.

    Raises:
      TimeoutError: if the final check does not report SERVING.
      grpc.RpcError: if the final check itself fails (logged, then re-raised).
    """
    # Imported here because this module does not import asyncio at the top level.
    import asyncio
    async with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None)) as channel:
      req = pb_health.HealthCheckRequest()
      req.service = "bentoml.grpc.v1.BentoService"
      health_stub = services_health.HealthStub(channel)
      start_time = time.time()
      while time.time() - start_time < timeout:
        try:
          # On an aio channel the stub call returns an awaitable; the response only exists after 'await'.
          resp = await health_stub.Check(req)
          if resp.status == pb_health.HealthCheckResponse.SERVING: break
          # 'asyncio.sleep' yields to the event loop, whereas 'time.sleep' would block it.
          else: await asyncio.sleep(check_interval)
        except grpc.RpcError:
          logger.debug("Waiting for server to be ready...")
          await asyncio.sleep(check_interval)
      try:
        resp = await health_stub.Check(req)
        if resp.status != pb_health.HealthCheckResponse.SERVING: raise TimeoutError(f"Timed out waiting {timeout} seconds for server at '{host}:{port}' to be ready.")
      except grpc.RpcError as err:
        logger.error("Caught RpcError while connecting to %s:%s:\n", host, port)
        logger.error(err)
        raise

  @classmethod
  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncGrpcClient:
    """Fetch the service metadata from ``url`` and build a client around the reflected APIs.

    An API whose descriptors cannot be reconstructed is logged and left out.
    """
    async with dispatch_channel(url.replace(r"localhost", "0.0.0.0"), typ="async", options=kwargs.get("options", None), compression=kwargs.get("compression", None), ssl=kwargs.get("ssl", False), ssl_client_credentials=kwargs.get("ssl_client_credentials", None), interceptors=kwargs.get("interceptors", None)) as channel:
      # The aio multi-callable returns an awaitable call object, so the response must be awaited.
      metadata = t.cast("ServiceMetadataResponse", await channel.unary_unary("/bentoml.grpc.v1.BentoService/ServiceMetadata", request_serializer=pb.ServiceMetadataRequest.SerializeToString, response_deserializer=pb.ServiceMetadataResponse.FromString)(pb.ServiceMetadataRequest()))
    reflection = bentoml.Service(metadata.name)
    for api in metadata.apis:
      try:
        reflection.apis[api.name] = InferenceAPI[t.Any](None,
                                                        bentoml.io.from_spec({"id": api.input.descriptor_id, "args": json_format.MessageToDict(api.input.attributes).get("args", None)}),
                                                        bentoml.io.from_spec({"id": api.output.descriptor_id, "args": json_format.MessageToDict(api.output.attributes).get("args", None)}),
                                                        name=api.name, doc=api.docs)
      except Exception as e:
        # Two placeholders for two arguments; with a single '%s' logging cannot format the record.
        logger.error("Failed to instantiate client for API %s: %s", api.name, e)
    return cls(url, reflection, **kwargs)

  async def health(self) -> t.Any:
    """Await the health service's ``Check`` response for the empty service name."""
    return await services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service=""))

  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
    """Serialise the input to protobuf, await ``Call`` on the service stub and decode the reply.

    Keyword arguments prefixed with ``_grpc_channel_`` (timeout, metadata, credentials,
    wait_for_ready, compression) are removed from ``kwargs`` and passed to the stub call.

    Raises:
      ValueError: if a multi-input API is given positional ``data``.
    """
    channel_kwargs = {k: kwargs.pop(f"_grpc_channel_{k}", None) for k in {"timeout", "metadata", "credentials", "wait_for_ready", "compression"}}
    state = self.inner.get_state(try_to_connect=True)
    if state != grpc.ChannelConnectivity.READY: await self.inner.channel_ready()
    if _inference_api.multi_input:
      if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
      fake_resp = await _inference_api.input.to_proto(kwargs)
    else: fake_resp = await _inference_api.input.to_proto(data)
    # Reverse map from API object to its registered name.
    api_fn = {v: k for k, v in self.svc.apis.items()}
    # 'inner' is a cached channel shared by every call; entering it with 'async with' would close
    # it on exit and leave later calls (and 'health') with a closed channel, so it is used directly.
    stubs = services.BentoServiceStub(self.inner)
    proto = await stubs.Call(pb.Request(**{"api_name": api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs)
    return await _inference_api.output.from_proto(getattr(proto, proto.WhichOneof("content")))

View File

@@ -0,0 +1,137 @@
from __future__ import annotations
import functools, httpx, time, logging, urllib.error, typing as t, orjson, bentoml, starlette.requests, starlette.datastructures, starlette.responses, asyncio
from bentoml._internal.service.inference_api import InferenceAPI
from urllib.parse import urlparse
from openllm_client.benmin import Client, AsyncClient
from openllm_core.utils import ensure_exec_coro
logger = logging.getLogger(__name__)
class HttpClient(Client):
  """Synchronous HTTP implementation of the benmin ``Client``, backed by ``httpx``."""

  @functools.cached_property
  def inner(self) -> httpx.Client:
    """The ``httpx.Client`` bound to ``server_url``, created once.

    Raises:
      ValueError: if ``server_url`` has no network location.
    """
    if not urlparse(self.server_url).netloc: raise ValueError(f"Invalid server url: {self.server_url}")
    return httpx.Client(base_url=self.server_url)

  @staticmethod
  def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
    """Poll ``/readyz`` until it answers 200 or ``timeout`` seconds have passed.

    ``http://`` is prepended when ``host`` carries no scheme. After the polling loop one final
    request is made.

    Raises:
      httpx.HTTPStatusError: if the final request returns an error status (logged, then re-raised).
    """
    host = host if "://" in host else "http://" + host
    logger.debug("Waiting for server @ `%s:%d` to be ready...", host, port)
    start = time.time()
    while time.time() - start < timeout:
      try:
        status = httpx.get(f"{host}:{port}/readyz").status_code
        if status == 200: break
        else: time.sleep(check_interval)
      except (httpx.ConnectError, urllib.error.URLError, ConnectionError):
        logger.debug("Server is not ready yet, retrying in %d seconds...", check_interval)
        time.sleep(check_interval)
    # Try once more and raise for exception
    try: httpx.get(f"{host}:{port}/readyz").raise_for_status()
    except httpx.HTTPStatusError as err:
      logger.error("Failed to wait until server ready: %s:%d", host, port)
      logger.error(err)
      raise

  def health(self) -> httpx.Response:
    """Return the response of ``GET /readyz``."""
    return self.inner.get("/readyz")

  @classmethod
  def from_url(cls, url: str, **kwargs: t.Any) -> HttpClient:
    """Read ``<url>/docs.json`` and build a client around the APIs tagged ``Service APIs``.

    ``http://`` is prepended when ``url`` carries no scheme. Extra keyword arguments are forwarded
    to the constructor, which stores each of them as an attribute on the client. An API whose
    descriptors cannot be reconstructed is logged and left out.

    Raises:
      ValueError: if the schema cannot be fetched, or a service API entry lacks the BentoML
        descriptor or name fields.
    """
    url = url if "://" in url else "http://" + url
    resp = httpx.get(f"{url}/docs.json")
    if resp.status_code != 200: raise ValueError(f"Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{resp.content.decode()}")
    _spec = orjson.loads(resp.content)
    reflection = bentoml.Service(_spec["info"]["title"])
    for route, spec in _spec["paths"].items():
      for meth_spec in spec.values():
        if "tags" in meth_spec and "Service APIs" in meth_spec["tags"]:
          if "x-bentoml-io-descriptor" not in meth_spec["requestBody"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
          if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
          if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
          try:
            reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](None, bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]), bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]), name=meth_spec["x-bentoml-name"], doc=meth_spec["description"], route=route.lstrip("/"))
          except Exception as e:
            # Two placeholders for two arguments; with a single '%s' logging cannot format the record.
            logger.error("Failed to instantiate client for API %s: %s", meth_spec["x-bentoml-name"], e)
    # Forward the caller's options (e.g. 'timeout') instead of silently discarding them.
    return cls(url, reflection, **kwargs)

  def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
    """POST the serialised input to the API's route and decode the response with its output descriptor.

    Raises:
      ValueError: if a multi-input API is given positional ``data``, or the server answers with a
        status other than 200.
    """
    # All gRPC kwargs should be popped out.
    kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_grpc_")}
    if _inference_api.multi_input:
      if data is not None: raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
      fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(kwargs, None))
    else: fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(data, None))
    # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this.
    if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
    else: body = fake_resp.body
    route = _inference_api.route if _inference_api.route.startswith("/") else "/" + _inference_api.route
    # Raw body bytes go through 'content='; httpx deprecates passing them via 'data='.
    resp = self.inner.post(route, content=body, headers={"content-type": fake_resp.headers["content-type"]}, timeout=self.timeout)
    if resp.status_code != 200: raise ValueError(f"Error while making request: {resp.status_code}: {resp.content!s}")
    fake_req = starlette.requests.Request(scope={"type": "http"})
    headers = starlette.datastructures.Headers(headers=resp.headers)
    fake_req._body = resp.content
    # Request.headers sets a _headers variable. We will need to set this value to our fake request object.
    fake_req._headers = headers
    return ensure_exec_coro(_inference_api.output.from_http_request(fake_req))
class AsyncHttpClient(AsyncClient):
  """Asynchronous HTTP client built on ``httpx.AsyncClient``.

  The callable APIs are discovered from the server's OpenAPI document (see ``from_url``); every call
  is encoded and decoded with the IO descriptors attached to the reflected ``InferenceAPI``.
  """

  @functools.cached_property
  def inner(self) -> httpx.AsyncClient:
    """``httpx.AsyncClient`` bound to ``self.server_url``, created on first access and then cached.

    Raises:
      ValueError: if ``self.server_url`` has no network location.
    """
    if not urlparse(self.server_url).netloc:
      raise ValueError(f"Invalid server url: {self.server_url}")
    return httpx.AsyncClient(base_url=self.server_url)

  @staticmethod
  async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None:
    """Poll ``GET /readyz`` until the server answers 200 or ``timeout`` seconds have elapsed.

    Args:
      host: Server host; ``http://`` is prepended when no scheme is present.
      port: Server port.
      timeout: Maximum time, in seconds, spent polling.
      check_interval: Seconds to sleep between attempts.
      **kwargs: Accepted but not used by this implementation.

    Raises:
      TimeoutError: if the final attempt after the polling loop does not return 200.
    """
    host = host if "://" in host else "http://" + host
    logger.debug("Waiting for server @ `%s:%d` to be ready...", host, port)
    start = time.time()
    while time.time() - start < timeout:
      try:
        async with httpx.AsyncClient(base_url=f"{host}:{port}") as sess:
          resp = await sess.get("/readyz")
        if resp.status_code == 200: break
        await asyncio.sleep(check_interval)
      except (httpx.ConnectError, urllib.error.URLError, ConnectionError):
        # The server may simply not be listening yet; keep polling.
        logger.debug("Server is not ready yet, retrying in %d seconds...", check_interval)
        await asyncio.sleep(check_interval)
    # Try once more and raise for exception: connection errors are no longer swallowed here.
    async with httpx.AsyncClient(base_url=f"{host}:{port}") as sess:
      resp = await sess.get("/readyz")
      if resp.status_code != 200:
        raise TimeoutError(f"Timeout while waiting for server @ `{host}:{port}` to be ready: {resp.status_code}: {resp.content!s}")

  async def health(self) -> httpx.Response:
    """Return the raw response of ``GET /readyz``."""
    return await self.inner.get("/readyz")

  @classmethod
  async def from_url(cls, url: str, **kwargs: t.Any) -> AsyncHttpClient:
    """Build a client by reflecting the service described at ``<url>/docs.json``.

    Args:
      url: Server URL; ``http://`` is prepended when no scheme is present.
      **kwargs: Accepted but not used by this implementation.

    Raises:
      ValueError: if the schema cannot be fetched, or a service API entry lacks the
        ``x-bentoml-*`` fields required to rebuild it.
    """
    url = url if "://" in url else "http://" + url
    async with httpx.AsyncClient(base_url=url) as session:
      resp = await session.get("/docs.json")
      if resp.status_code != 200:
        raise ValueError(f"Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}")
      _spec = orjson.loads(await resp.aread())
    reflection = bentoml.Service(_spec["info"]["title"])
    for route, spec in _spec["paths"].items():
      for meth_spec in spec.values():
        # Only operations tagged as service APIs are reflected.
        if "tags" not in meth_spec or "Service APIs" not in meth_spec["tags"]: continue
        if "x-bentoml-io-descriptor" not in meth_spec["requestBody"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
        if "x-bentoml-io-descriptor" not in meth_spec["responses"]["200"]: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
        if "x-bentoml-name" not in meth_spec: raise ValueError(f"Malformed BentoML spec received from BentoML server {url}")
        try:
          reflection.apis[meth_spec["x-bentoml-name"]] = InferenceAPI[t.Any](
              None,
              bentoml.io.from_spec(meth_spec["requestBody"]["x-bentoml-io-descriptor"]),
              bentoml.io.from_spec(meth_spec["responses"]["200"]["x-bentoml-io-descriptor"]),
              name=meth_spec["x-bentoml-name"],
              doc=meth_spec["description"],
              route=route.lstrip("/"),
          )
        except ValueError as e:
          # One placeholder per argument: the API name and the error that prevented reflection.
          logger.error("Failed to instantiate client for API %s: %s", meth_spec["x-bentoml-name"], e)
    return cls(url, reflection)

  async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any:
    """Invoke ``_inference_api`` on the remote server over HTTP and return the decoded result.

    Args:
      data: Positional payload. Must be ``None`` when the API is multi-input.
      _inference_api: The reflected API whose route and IO descriptors are used.
      **kwargs: Payload for multi-input APIs. Keys starting with ``_grpc_`` are dropped.

    Raises:
      ValueError: if ``data`` is given to a multi-input API, or the server does not answer 200.
    """
    # All gRPC kwargs should be popped out.
    kwargs = {k: v for k, v in kwargs.items() if not k.startswith("_grpc_")}
    if _inference_api.multi_input:
      if data is not None:
        raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.")
      fake_resp = await _inference_api.input.to_http_response(kwargs, None)
    else:
      fake_resp = await _inference_api.input.to_http_response(data, None)
    # XXX: hack around StreamingResponse, since now we only have Text, for metadata so it is fine to do this.
    if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None
    else: body = t.cast(t.Any, fake_resp.body)
    resp = await self.inner.post(
        "/" + _inference_api.route if not _inference_api.route.startswith("/") else _inference_api.route,
        data=body,
        headers={"content-type": fake_resp.headers["content-type"]},
        timeout=self.timeout,
    )
    if resp.status_code != 200:
      raise ValueError(f"Error making request: {resp.status_code}: {(await resp.aread())!s}")
    fake_req = starlette.requests.Request(scope={"type": "http"})
    headers = starlette.datastructures.Headers(headers=resp.headers)
    fake_req._body = resp.content
    # Request.headers sets a _headers variable. We will need to set this value to our fake request object.
    fake_req._headers = headers
    return await _inference_api.output.from_http_request(fake_req)

View File

@@ -0,0 +1,33 @@
from __future__ import annotations
import logging
from urllib.parse import urlparse
from ._base import BaseClient, BaseAsyncClient
logger = logging.getLogger(__name__)
def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None:
  """Parse ``address`` and store its host and port on ``self`` as ``_host`` and ``_port``.

  ``http://`` is assumed when ``address`` carries no scheme. When the address has no port,
  ``_port`` defaults to ``"80"`` for ``http`` and ``"443"`` for any other scheme.
  Both attributes are stored as strings.

  The host and port come from ``urlparse``'s ``hostname``/``port`` accessors rather than from
  splitting the authority on ``":"``, so credentials (``user:pw@host``) are not mistaken for
  the host and bracketed IPv6 literals are not cut apart. ``hostname`` is lower-cased by
  ``urlparse``.

  Raises:
    ValueError: if the port in ``address`` is not a valid port number.
  """
  address = address if "://" in address else "http://" + address
  parsed = urlparse(address)
  host = parsed.hostname or ""
  # ``hostname`` strips the brackets of an IPv6 literal; restore them so that
  # "<host>:<port>" remains unambiguous, exactly as it appears in the URL authority.
  if ":" in host: host = f"[{host}]"
  port = parsed.port
  self._host = host
  if port is None: self._port = "80" if parsed.scheme == "http" else "443"
  else: self._port = str(port)
class HTTPClient(BaseClient):
  """``BaseClient`` addressed by an HTTP(S) URL or a bare ``host[:port]`` string."""

  def __init__(self, address: str, timeout: int = 30):
    """Record the host and port parsed from ``address``, then defer to ``BaseClient``.

    Args:
      address: Server address; parsed by ``process_http_address``.
      timeout: Forwarded unchanged to the base initialiser.
    """
    # ``_host`` and ``_port`` are assigned before the base initialiser runs.
    process_http_address(self, address)
    super().__init__(address, timeout)
class AsyncHTTPClient(BaseAsyncClient):
  """``BaseAsyncClient`` addressed by an HTTP(S) URL or a bare ``host[:port]`` string."""

  def __init__(self, address: str, timeout: int = 30):
    """Record the host and port parsed from ``address``, then defer to ``BaseAsyncClient``.

    Args:
      address: Server address; parsed by ``process_http_address``.
      timeout: Forwarded unchanged to the base initialiser.
    """
    # ``_host`` and ``_port`` are assigned before the base initialiser runs.
    process_http_address(self, address)
    super().__init__(address, timeout)
class GrpcClient(BaseClient):
  """``BaseClient`` addressed by a ``host:port`` string."""

  def __init__(self, address: str, timeout: int = 30):
    """Split ``address`` into ``_host`` and ``_port`` (both strings), then defer to ``BaseClient``.

    The split happens on the last ``":"`` only, so a host that itself contains colons
    (for example the bracketed IPv6 literal in ``[::1]:3000``) is kept intact.

    Args:
      address: Server address in ``host:port`` form.
      timeout: Forwarded unchanged to the base initialiser.

    Raises:
      ValueError: if ``address`` contains no ``":"``.
    """
    host, sep, port = address.rpartition(":")
    if not sep: raise ValueError(f"Expected address in 'host:port' form, got {address!r}")
    self._host, self._port = host, port
    super().__init__(address, timeout)
class AsyncGrpcClient(BaseAsyncClient):
  """``BaseAsyncClient`` addressed by a ``host:port`` string."""

  def __init__(self, address: str, timeout: int = 30):
    """Split ``address`` into ``_host`` and ``_port`` (both strings), then defer to ``BaseAsyncClient``.

    The split happens on the last ``":"`` only, so a host that itself contains colons
    (for example the bracketed IPv6 literal in ``[::1]:3000``) is kept intact.

    Args:
      address: Server address in ``host:port`` form.
      timeout: Forwarded unchanged to the base initialiser.

    Raises:
      ValueError: if ``address`` contains no ``":"``.
    """
    host, sep, port = address.rpartition(":")
    if not sep: raise ValueError(f"Expected address in 'host:port' form, got {address!r}")
    self._host, self._port = host, port
    super().__init__(address, timeout)

View File

@@ -0,0 +1 @@
../.git_archival.txt

View File

@@ -0,0 +1 @@
../.python-version-default

194
openllm-core/LICENSE.md Normal file
View File

@@ -0,0 +1,194 @@
Apache License
==============
_Version 2.0, January 2004_
_&lt;<http://www.apache.org/licenses/>&gt;_
### Terms and Conditions for use, reproduction, and distribution
#### 1. Definitions
“License” shall mean the terms and conditions for use, reproduction, and
distribution as defined by Sections 1 through 9 of this document.
“Licensor” shall mean the copyright owner or entity authorized by the copyright
owner that is granting the License.
“Legal Entity” shall mean the union of the acting entity and all other entities
that control, are controlled by, or are under common control with that entity.
For the purposes of this definition, “control” means **(i)** the power, direct or
indirect, to cause the direction or management of such entity, whether by
contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the
outstanding shares, or **(iii)** beneficial ownership of such entity.
“You” (or “Your”) shall mean an individual or Legal Entity exercising
permissions granted by this License.
“Source” form shall mean the preferred form for making modifications, including
but not limited to software source code, documentation source, and configuration
files.
“Object” form shall mean any form resulting from mechanical transformation or
translation of a Source form, including but not limited to compiled object code,
generated documentation, and conversions to other media types.
“Work” shall mean the work of authorship, whether in Source or Object form, made
available under the License, as indicated by a copyright notice that is included
in or attached to the work (an example is provided in the Appendix below).
“Derivative Works” shall mean any work, whether in Source or Object form, that
is based on (or derived from) the Work and for which the editorial revisions,
annotations, elaborations, or other modifications represent, as a whole, an
original work of authorship. For the purposes of this License, Derivative Works
shall not include works that remain separable from, or merely link (or bind by
name) to the interfaces of, the Work and Derivative Works thereof.
“Contribution” shall mean any work of authorship, including the original version
of the Work and any modifications or additions to that Work or Derivative Works
thereof, that is intentionally submitted to Licensor for inclusion in the Work
by the copyright owner or by an individual or Legal Entity authorized to submit
on behalf of the copyright owner. For the purposes of this definition,
“submitted” means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems, and
issue tracking systems that are managed by, or on behalf of, the Licensor for
the purpose of discussing and improving the Work, but excluding communication
that is conspicuously marked or otherwise designated in writing by the copyright
owner as “Not a Contribution.”
“Contributor” shall mean Licensor and any individual or Legal Entity on behalf
of whom a Contribution has been received by Licensor and subsequently
incorporated within the Work.
#### 2. Grant of Copyright License
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the Work and such
Derivative Works in Source or Object form.
#### 3. Grant of Patent License
Subject to the terms and conditions of this License, each Contributor hereby
grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
irrevocable (except as stated in this section) patent license to make, have
made, use, offer to sell, sell, import, and otherwise transfer the Work, where
such license applies only to those patent claims licensable by such Contributor
that are necessarily infringed by their Contribution(s) alone or by combination
of their Contribution(s) with the Work to which such Contribution(s) was
submitted. If You institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work or a
Contribution incorporated within the Work constitutes direct or contributory
patent infringement, then any patent licenses granted to You under this License
for that Work shall terminate as of the date such litigation is filed.
#### 4. Redistribution
You may reproduce and distribute copies of the Work or Derivative Works thereof
in any medium, with or without modifications, and in Source or Object form,
provided that You meet the following conditions:
* **(a)** You must give any other recipients of the Work or Derivative Works a copy of
this License; and
* **(b)** You must cause any modified files to carry prominent notices stating that You
changed the files; and
* **(c)** You must retain, in the Source form of any Derivative Works that You distribute,
all copyright, patent, trademark, and attribution notices from the Source form
of the Work, excluding those notices that do not pertain to any part of the
Derivative Works; and
* **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any
Derivative Works that You distribute must include a readable copy of the
attribution notices contained within such NOTICE file, excluding those notices
that do not pertain to any part of the Derivative Works, in at least one of the
following places: within a NOTICE text file distributed as part of the
Derivative Works; within the Source form or documentation, if provided along
with the Derivative Works; or, within a display generated by the Derivative
Works, if and wherever such third-party notices normally appear. The contents of
the NOTICE file are for informational purposes only and do not modify the
License. You may add Your own attribution notices within Derivative Works that
You distribute, alongside or as an addendum to the NOTICE text from the Work,
provided that such additional attribution notices cannot be construed as
modifying the License.
You may add Your own copyright statement to Your modifications and may provide
additional or different license terms and conditions for use, reproduction, or
distribution of Your modifications, or for any such Derivative Works as a whole,
provided Your use, reproduction, and distribution of the Work otherwise complies
with the conditions stated in this License.
#### 5. Submission of Contributions
Unless You explicitly state otherwise, any Contribution intentionally submitted
for inclusion in the Work by You to the Licensor shall be under the terms and
conditions of this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify the terms of
any separate license agreement you may have executed with Licensor regarding
such Contributions.
#### 6. Trademarks
This License does not grant permission to use the trade names, trademarks,
service marks, or product names of the Licensor, except as required for
reasonable and customary use in describing the origin of the Work and
reproducing the content of the NOTICE file.
#### 7. Disclaimer of Warranty
Unless required by applicable law or agreed to in writing, Licensor provides the
Work (and each Contributor provides its Contributions) on an “AS IS” BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
including, without limitation, any warranties or conditions of TITLE,
NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
solely responsible for determining the appropriateness of using or
redistributing the Work and assume any risks associated with Your exercise of
permissions under this License.
#### 8. Limitation of Liability
In no event and under no legal theory, whether in tort (including negligence),
contract, or otherwise, unless required by applicable law (such as deliberate
and grossly negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special, incidental,
or consequential damages of any character arising as a result of this License or
out of the use or inability to use the Work (including but not limited to
damages for loss of goodwill, work stoppage, computer failure or malfunction, or
any and all other commercial damages or losses), even if such Contributor has
been advised of the possibility of such damages.
#### 9. Accepting Warranty or Additional Liability
While redistributing the Work or Derivative Works thereof, You may choose to
offer, and charge a fee for, acceptance of support, warranty, indemnity, or
other liability obligations and/or rights consistent with this License. However,
in accepting such obligations, You may act only on Your own behalf and on Your
sole responsibility, not on behalf of any other Contributor, and only if You
agree to indemnify, defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason of your
accepting any such warranty or additional liability.
_END OF TERMS AND CONDITIONS_
### APPENDIX: How to apply the Apache License to your work
To apply the Apache License to your work, attach the following boilerplate
notice, with the fields enclosed by brackets `[]` replaced with your own
identifying information. (Don't include the brackets!) The text should be
enclosed in the appropriate comment syntax for the file format. We also
recommend that a file or class name and description of purpose be included on
the same “printed page” as the copyright notice for easier identification within
third-party archives.
Copyright 2023 Atalaya Tech Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

75
openllm-core/README.md Normal file
View File

@@ -0,0 +1,75 @@
![Banner for OpenLLM](/.github/assets/main-banner.png)
<!-- hatch-fancy-pypi-readme intro start -->
<div align="center">
<h1 align="center">🦑 OpenLLM Core</h1>
<a href="https://pypi.org/project/openllm-core">
<img src="https://img.shields.io/pypi/v/openllm-core.svg?logo=pypi&label=PyPI&logoColor=gold" alt="pypi_status" />
</a><a href="https://test.pypi.org/project/openllm-core/">
<img src="https://img.shields.io/badge/Nightly-PyPI?logo=pypi&label=PyPI&color=gray&link=https%3A%2F%2Ftest.pypi.org%2Fproject%2Fopenllm-core%2F" alt="test_pypi_status" />
</a><a href="https://twitter.com/bentomlai">
<img src="https://badgen.net/badge/icon/@bentomlai/1DA1F2?icon=twitter&label=Follow%20Us" alt="Twitter" />
</a><a href="https://l.bentoml.com/join-openllm-discord">
<img src="https://badgen.net/badge/icon/OpenLLM/7289da?icon=discord&label=Join%20Us" alt="Discord" />
</a><a href="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml">
<img src="https://github.com/bentoml/OpenLLM/actions/workflows/ci.yml/badge.svg?branch=main" alt="ci" />
</a><a href="https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main">
<img src="https://results.pre-commit.ci/badge/github/bentoml/OpenLLM/main.svg" alt="pre-commit.ci status" />
</a><br>
<a href="https://pypi.org/project/openllm-core">
<img src="https://img.shields.io/pypi/pyversions/openllm-core.svg?logo=python&label=Python&logoColor=gold" alt="python_version" />
</a><a href="https://github.com/pypa/hatch">
<img src="https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg" alt="Hatch" />
</a><a href="https://github.com/bentoml/OpenLLM/blob/main/STYLE.md">
<img src="https://img.shields.io/badge/code%20style-experimental-000000.svg" alt="code style" />
</a><a href="https://github.com/astral-sh/ruff">
<img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/charliermarsh/ruff/main/assets/badge/v2.json" alt="Ruff" />
</a><a href="https://github.com/python/mypy">
<img src="https://img.shields.io/badge/types-mypy-blue.svg" alt="types - mypy" />
</a><a href="https://github.com/microsoft/pyright">
<img src="https://img.shields.io/badge/types-pyright-yellow.svg" alt="types - pyright" />
</a><br>
<p>OpenLLM Core: Core components for OpenLLM.<br/></p>
<i></i>
</div>
## 📖 Introduction
With OpenLLM, you can run inference with any open-source large language model,
deploy to the cloud or on-premises, build powerful AI apps, and more.
To learn more about OpenLLM, please visit <a href="https://github.com/bentoml/OpenLLM">OpenLLM's README.md</a>.
This package holds the core components of OpenLLM and is considered internal.
Components include:
- Configuration generation.
- Utilities for interacting with OpenLLM server.
- Schema and generation utilities for OpenLLM server.
<!-- hatch-fancy-pypi-readme intro stop -->
![Gif showing OpenLLM Intro](/.github/assets/output.gif)
<br/>
<!-- hatch-fancy-pypi-readme interim start -->
## 📔 Citation
If you use OpenLLM in your research, we provide a [citation](../CITATION.cff) to use:
```bibtex
@software{Pham_OpenLLM_Operating_LLMs_2023,
author = {Pham, Aaron and Yang, Chaoyu and Sheng, Sean and Zhao, Shenyang and Lee, Sauyon and Jiang, Bo and Dong, Fog and Guan, Xipeng and Ming, Frost},
license = {Apache-2.0},
month = jun,
title = {{OpenLLM: Operating LLMs in production}},
url = {https://github.com/bentoml/OpenLLM},
year = {2023}
}
```
<!-- hatch-fancy-pypi-readme interim stop -->

195
openllm-core/pyproject.toml Normal file
View File

@@ -0,0 +1,195 @@
[build-system]
build-backend = "hatchling.build"
requires = [
"hatchling==1.18.0",
"hatch-vcs==0.3.0",
"hatch-fancy-pypi-readme==23.1.0",
]
[project]
authors = [
{ name = "Aaron Pham", email = "aarnphm@bentoml.com" },
{ name = "BentoML Team", email = "contact@bentoml.com" },
]
dynamic = ['readme', 'version']
classifiers = [
"Development Status :: 5 - Production/Stable",
"Environment :: GPU :: NVIDIA CUDA",
"Environment :: GPU :: NVIDIA CUDA :: 12",
"Environment :: GPU :: NVIDIA CUDA :: 11.8",
"Environment :: GPU :: NVIDIA CUDA :: 11.7",
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Software Development :: Libraries",
"Operating System :: OS Independent",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"Intended Audience :: System Administrators",
"Typing :: Typed",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
description = "OpenLLM Core: Core components for OpenLLM."
keywords = [
"MLOps",
"AI",
"BentoML",
"Model Serving",
"Model Deployment",
"LLMOps",
"Falcon",
"Vicuna",
"Llama 2",
"Fine tuning",
"Serverless",
"Large Language Model",
"Generative AI",
"StableLM",
"Alpaca",
"PyTorch",
"Transformers",
]
dependencies = [
"bentoml>=1.1.2",
"attrs>=23.1.0",
"cattrs>=23.1.0",
"orjson",
"inflection",
"typing_extensions",
"mypy_extensions",
]
license = "Apache-2.0"
name = "openllm-core"
requires-python = ">=3.8"
[project.urls]
Blog = "https://modelserving.com"
Chat = "https://discord.gg/openllm"
Documentation = "https://github.com/bentoml/OpenLLM/blob/main/openllm-core/README.md"
GitHub = "https://github.com/bentoml/OpenLLM/blob/main/openllm-core"
History = "https://github.com/bentoml/OpenLLM/blob/main/CHANGELOG.md"
Homepage = "https://bentoml.com"
Tracker = "https://github.com/bentoml/OpenLLM/issues"
Twitter = "https://twitter.com/bentomlai"
[project.optional-dependencies]
full = ["openllm-core[vllm,fine-tune]"]
vllm = ["vllm", "ray"]
fine-tune = ["transformers[torch,tokenizers,accelerate]>=4.29.0", "peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
[tool.hatch.version]
fallback-version = "0.0.0"
source = "vcs"
[tool.hatch.build.hooks.vcs]
version-file = "src/openllm_core/_version.py"
[tool.hatch.version.raw-options]
git_describe_command = [
"git",
"describe",
"--dirty",
"--tags",
"--long",
"--first-parent",
]
local_scheme = "no-local-version"
root = ".."
[tool.hatch.metadata]
allow-direct-references = true
[tool.hatch.build.targets.wheel]
only-include = ["src/openllm_core"]
sources = ["src"]
[tool.hatch.build.targets.sdist]
exclude = [
"/.git_archival.txt",
"tests",
"/.python-version-default",
"ADDING_NEW_MODEL.md",
]
[tool.hatch.build.targets.wheel.hooks.mypyc]
dependencies = [
"hatch-mypyc==0.16.0",
"mypy==1.5.1",
# avoid https://github.com/pallets/click/issues/2558
"click==8.1.3",
"bentoml==1.1.2",
"transformers>=4.31.0",
"pandas-stubs",
"types-psutil",
"types-tabulate",
"types-PyYAML",
"types-protobuf",
]
enable-by-default = false
exclude = ["src/openllm_core/_typing_compat.py"]
include = [
"src/openllm_core/utils/__init__.py",
"src/openllm_core/__init__.py",
"src/openllm_core/_prompt.py",
"src/openllm_core/_schema.py",
"src/openllm_core/_strategies.py",
"src/openllm_core/exceptions.py",
]
# NOTE: This is consistent with pyproject.toml
mypy-args = [
"--strict",
# needed because transitive dependencies don't ship type information
"--follow-imports=skip",
"--allow-subclassing-any",
"--check-untyped-defs",
"--ignore-missing-imports",
"--no-warn-return-any",
"--warn-unreachable",
"--no-warn-no-return",
"--no-warn-unused-ignores",
"--exclude='/src\\/openllm_core\\/_typing_compat\\.py$'",
]
options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
require-runtime-dependencies = true
[tool.hatch.metadata.hooks.fancy-pypi-readme]
content-type = "text/markdown"
# PyPI doesn't support the <picture> tag.
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
<p align="center">
<a href="https://github.com/bentoml/openllm">
<img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/main-banner.png" alt="Banner for OpenLLM" />
</a>
</p>
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
end-before = "\n<!-- hatch-fancy-pypi-readme intro stop -->"
path = "README.md"
start-after = "<!-- hatch-fancy-pypi-readme intro start -->\n"
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
<p align="center">
<img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/output.gif" alt="Gif showing OpenLLM Intro" />
</p>
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
<p align="center">
<img src="https://raw.githubusercontent.com/bentoml/openllm/main/.github/assets/agent.gif" alt="Gif showing Agent integration" />
</p>
"""
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
end-before = "\n<!-- hatch-fancy-pypi-readme interim stop -->"
path = "README.md"
start-after = "<!-- hatch-fancy-pypi-readme interim start -->\n"
[[tool.hatch.metadata.hooks.fancy-pypi-readme.fragments]]
text = """
---
[Click me for full changelog](https://github.com/bentoml/openllm/blob/main/CHANGELOG.md)
"""

View File

@@ -0,0 +1,7 @@
from __future__ import annotations
from . import utils as utils
from . import exceptions as exceptions
from ._configuration import LLMConfig as LLMConfig, GenerationConfig as GenerationConfig, SamplingParams as SamplingParams
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource, available_resource_spec as available_resource_spec, LiteralResourceSpec as LiteralResourceSpec, NvidiaGpuResource as NvidiaGpuResource, AmdGpuResource as AmdGpuResource
from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
from .config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, BaichuanConfig as BaichuanConfig, START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING, ChatGLMConfig as ChatGLMConfig, START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING, DollyV2Config as DollyV2Config, START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING, FalconConfig as FalconConfig, START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING, FlanT5Config as FlanT5Config, START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING, GPTNeoXConfig as GPTNeoXConfig, START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING, LlamaConfig as LlamaConfig, START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING, MPTConfig as MPTConfig, START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING, OPTConfig as OPTConfig, START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING, StableLMConfig as StableLMConfig, START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING, StarCoderConfig as StarCoderConfig, START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING

View File

@@ -34,8 +34,7 @@ dynamically during serve, ahead-of-serve or per requests.
Refer to ``openllm.LLMConfig`` docstring for more information.
"""
from __future__ import annotations
import copy, enum, logging, os, sys, types, typing as t
import attr, click_option_group as cog, inflection, orjson, openllm
import copy, enum, logging, os, sys, types, typing as t, attr, click_option_group as cog, inflection, orjson, openllm_core
from cattr.gen import make_dict_structure_fn, make_dict_unstructure_fn, override
from deepmerge.merger import Merger
from ._strategies import LiteralResourceSpec, available_resource_spec, resource_spec
@@ -51,9 +50,10 @@ from .utils import (
field_env_key,
first_not_none,
lenient_issubclass,
LazyLoader
)
from .utils.import_utils import BACKENDS_MAPPING
# NOTE: Using internal API from attr here, since we are actually allowing subclass of openllm.LLMConfig to become 'attrs'-ish
# NOTE: Using internal API from attr here, since we are actually allowing subclass of openllm_core.LLMConfig to become 'attrs'-ish
from attr._compat import set_closure_cell
from attr._make import _CountingAttr, _make_init, _transform_attrs
from ._typing_compat import AnyCallable, At, Self, ListStr, DictStrAny
@@ -63,11 +63,11 @@ if t.TYPE_CHECKING:
from transformers.generation.beam_constraints import Constraint
else:
Constraint = t.Any
vllm = openllm.utils.LazyLoader("vllm", globals(), "vllm")
transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
peft = openllm.utils.LazyLoader("peft", globals(), "peft")
vllm = LazyLoader("vllm", globals(), "vllm")
transformers = LazyLoader("transformers", globals(), "transformers")
peft = LazyLoader("peft", globals(), "peft")
__all__ = ["LLMConfig", "GenerationConfig", "SamplingParams"]
__all__ = ["LLMConfig", "GenerationConfig", "SamplingParams", "field_env_key"]
logger = logging.getLogger(__name__)
config_merger = Merger([(dict, "merge")], ["override"], ["override"])
@@ -225,15 +225,12 @@ class GenerationConfig(ReprMixin):
if t.TYPE_CHECKING and not MYPY:
# stubs this for pyright as mypy already has a attr plugin builtin
def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: ...
def __init__(self, *, _internal: bool = False, **attrs: t.Any):
if not _internal: raise RuntimeError("GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config")
self.__attrs_init__(**attrs)
def __getitem__(self, item: str) -> t.Any:
if hasattr(self, item): return getattr(self, item)
raise KeyError(f"'{self.__class__.__name__}' has no attribute {item}.")
@property
def __repr_keys__(self) -> set[str]: return {i.name for i in attr.fields(self.__class__)}
@@ -334,7 +331,7 @@ class ModelSettings(t.TypedDict, total=False):
name_type: NotRequired[t.Optional[t.Literal["dasherize", "lowercase"]]]
model_name: NotRequired[str]
start_name: NotRequired[str]
env: NotRequired[openllm.utils.EnvVarMixin]
env: NotRequired[openllm_core.utils.EnvVarMixin]
# serving configuration
timeout: int
workers_per_resource: t.Union[int, float]
@@ -376,7 +373,7 @@ class _ModelSettingsAttr:
name_type: t.Optional[t.Literal["dasherize", "lowercase"]]
model_name: str
start_name: str
env: openllm.utils.EnvVarMixin
env: openllm_core.utils.EnvVarMixin
timeout: int
workers_per_resource: t.Union[int, float]
fine_tune_strategies: t.Dict[AdapterType, FineTuneConfig]
@@ -414,7 +411,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _
if not BACKENDS_MAPPING[library_stub][0](): default_implementation[rs] = "pt"
_final_value_dct["default_implementation"] = default_implementation
env = openllm.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id, bettertransformer=_settings_attr.bettertransformer)
env = openllm_core.utils.EnvVarMixin(model_name, get_default_implementation(default_implementation), model_id=_settings_attr.default_id, bettertransformer=_settings_attr.bettertransformer)
_final_value_dct["env"] = env
# bettertransformer support
@@ -459,7 +456,8 @@ _reserved_namespace = {"__config__", "GenerationConfig", "SamplingParams"}
@attr.define(slots=True)
class _ConfigAttr:
Field = dantic.Field
@staticmethod
def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: return dantic.Field(default, **attrs)
"""Field is a alias to the internal dantic utilities to easily create
attrs.fields with pydantic-compatible interface. For example:
@@ -513,11 +511,11 @@ class _ConfigAttr:
"""The accepted keys for this LLMConfig."""
__openllm_extras__: DictStrAny = Field(None, init=False)
"""Extra metadata for this LLMConfig."""
__openllm_generation_class__: type[openllm._configuration.GenerationConfig] = Field(None)
__openllm_generation_class__: type[openllm_core._configuration.GenerationConfig] = Field(None)
"""The result generated GenerationConfig class for this LLMConfig. This will be used
to create the generation_config argument that can be used throughout the lifecycle.
This class will also be managed internally by OpenLLM."""
__openllm_sampling_class__: type[openllm._configuration.SamplingParams] = Field(None)
__openllm_sampling_class__: type[openllm_core._configuration.SamplingParams] = Field(None)
"""The result generated SamplingParams class for this LLMConfig. This will be used
to create arguments for vLLM LLMEngine that can be used throughout the lifecycle.
This class will also be managed internally by OpenLLM."""
@@ -587,7 +585,7 @@ class _ConfigAttr:
"""The normalized version of __openllm_start_name__, determined by __openllm_name_type__"""
__openllm_start_name__: str = Field(None)
"""Default name to be used with `openllm start`"""
__openllm_env__: openllm.utils.EnvVarMixin = Field(None)
__openllm_env__: openllm_core.utils.EnvVarMixin = Field(None)
"""A EnvVarMixin instance for this LLMConfig."""
__openllm_timeout__: int = Field(None)
"""The default timeout to be set for this given LLM."""
@@ -697,7 +695,7 @@ class _ConfigBuilder:
if not closure_cells: continue # Catch None or the empty list.
for cell in closure_cells:
try: match = cell.cell_contents is self._cls
except ValueError: pass # noqa: PERF203 # ValueError: Cell is empty
except ValueError: pass # ValueError: Cell is empty
else:
if match: set_closure_cell(cell, cls)
return cls
@@ -705,7 +703,6 @@ class _ConfigBuilder:
def add_attrs_init(self) -> Self:
self._cls_dict["__attrs_init__"] = codegen.add_method_dunders(self._cls, _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True))
return self
def add_repr(self) -> Self:
for key, fn in ReprMixin.__dict__.items():
if key in ("__repr__", "__str__", "__repr_name__", "__repr_str__", "__repr_args__"): self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn)
@@ -849,7 +846,7 @@ class LLMConfig(_ConfigAttr):
unannotated = ca_names - annotated_names
if len(unannotated) > 0:
missing_annotated = sorted(unannotated, key=lambda n: t.cast("_CountingAttr", cd.get(n)).counter)
raise openllm.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}")
raise openllm_core.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}")
# We need to set the accepted key before generation_config
# as generation_config is a special field that users shouldn't pass.
cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__)} | {a.name for a in attr.fields(cls.__openllm_sampling_class__)}
@@ -925,7 +922,7 @@ class LLMConfig(_ConfigAttr):
@overload
def __getitem__(self, item: t.Literal["start_name"]) -> str: ...
@overload
def __getitem__(self, item: t.Literal["env"]) -> openllm.utils.EnvVarMixin: ...
def __getitem__(self, item: t.Literal["env"]) -> openllm_core.utils.EnvVarMixin: ...
@overload
def __getitem__(self, item: t.Literal["timeout"]) -> int: ...
@overload
@@ -936,9 +933,9 @@ class LLMConfig(_ConfigAttr):
def __getitem__(self, item: t.Literal["tokenizer_class"]) -> t.Optional[str]: ...
# NOTE: generation_class, sampling_class and extras arguments
@overload
def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm.GenerationConfig]: ...
def __getitem__(self, item: t.Literal["generation_class"]) -> t.Type[openllm_core.GenerationConfig]: ...
@overload
def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm.SamplingParams]: ...
def __getitem__(self, item: t.Literal["sampling_class"]) -> t.Type[openllm_core.SamplingParams]: ...
@overload
def __getitem__(self, item: t.Literal["extras"]) -> t.Dict[str, t.Any]: ...
# NOTE: GenerationConfig arguments
@@ -1120,7 +1117,6 @@ class LLMConfig(_ConfigAttr):
try: new_cls.__module__ = sys._getframe(1).f_globals.get("__name__", "__main__")
except (AttributeError, ValueError): pass
return new_cls(**attrs)
def model_dump(self, flatten: bool = False, **_: t.Any) -> DictStrAny:
dumped = bentoml_cattr.unstructure(self)
generation_config = bentoml_cattr.unstructure(self.generation_config)
@@ -1129,15 +1125,13 @@ class LLMConfig(_ConfigAttr):
else: dumped["generation_config"] = generation_config
dumped.update(sampling_config)
return dumped
def model_dump_json(self, **kwargs: t.Any) -> bytes: return orjson.dumps(self.model_dump(**kwargs))
@classmethod
def model_construct_json(cls, json_str: str | bytes) -> Self:
try: attrs = orjson.loads(json_str)
except orjson.JSONDecodeError as err: raise openllm.exceptions.ValidationError(f"Failed to load JSON: {err}") from None
except orjson.JSONDecodeError as err: raise openllm_core.exceptions.ValidationError(f"Failed to load JSON: {err}") from None
return bentoml_cattr.structure(attrs, cls)
@classmethod
def model_construct_env(cls, **attrs: t.Any) -> Self:
"""A helpers that respect configuration values environment variables."""
@@ -1186,7 +1180,6 @@ class LLMConfig(_ConfigAttr):
config = transformers.GenerationConfig(**bentoml_cattr.unstructure(self.generation_config))
return config.to_dict() if return_as_dict else config
def to_sampling_config(self) -> vllm.SamplingParams: return self.sampling_config.to_vllm()
@classmethod
def to_click_options(cls, f: AnyCallable) -> click.Command:
"""Convert current configuration to click options.
@@ -1228,6 +1221,32 @@ class LLMConfig(_ConfigAttr):
@classmethod
def default_implementation(cls) -> LiteralRuntime: return first_not_none(cls.__openllm_env__["framework_value"], default=get_default_implementation(cls.__openllm_default_implementation__))
def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
"""This handler will sanitize all attrs and setup prompt text.
It takes a prompt that is given by the user, attrs that can be parsed with the prompt.
Returns a tuple of three items:
- The prompt text. This default implementation returns the given prompt unchanged.
- The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig
- The attributes dictionary that will be passed into `self.postprocess_generate`.
This default implementation returns the very same `attrs` dictionary object for both dictionary items.
`openllm.LLM` also has a sanitize_parameters that will just call this method.
"""
return prompt, attrs, attrs
def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
"""This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decides to do so).
You can customize how the output of the LLM looks with this hook. By default, it is a simple echo:
`generation_result` is returned unchanged, and `prompt` and `attrs` are not used.
> [!NOTE]
> This will be used from the client side.
`openllm.LLM` also has a postprocess_generate that will just call this method.
"""
return generation_result
bentoml_cattr.register_unstructure_hook_factory(lambda cls: lenient_issubclass(cls, LLMConfig), lambda cls: make_dict_unstructure_fn(cls, bentoml_cattr, _cattrs_omit_if_default=False, _cattrs_use_linecache=True))
def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
@@ -1253,7 +1272,4 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig:
return cls(generation_config=generation_config, __openllm_extras__=data, **cls_attrs)
bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config)
openllm_home = os.path.expanduser(os.environ.get("OPENLLM_HOME", os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")), "openllm")))
__all__ = ["LLMConfig", "field_env_key"]

View File

@@ -1,8 +1,8 @@
"""Schema definition for OpenLLM. This can be use for client interaction."""
from __future__ import annotations
import functools, typing as t
import attr, inflection, openllm
from ._configuration import GenerationConfig, LLMConfig
import attr, inflection
from openllm_core._configuration import GenerationConfig, LLMConfig
from .utils import bentoml_cattr
if t.TYPE_CHECKING: import vllm
@@ -19,9 +19,11 @@ class GenerationInput:
if cls is None: raise ValueError("'cls' must pass if given data is a dictionary.")
return cls(**data)
@classmethod
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]: return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs))
def for_model(cls, model_name: str, **attrs: t.Any) -> type[GenerationInput]:
import openllm
return cls.from_llm_config(openllm.AutoConfig.for_model(model_name, **attrs))
@classmethod
def from_llm_config(cls, llm_config: openllm.LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]: return attr.make_class(inflection.camelize(llm_config["model_name"]) + "GenerationInput", attrs={"prompt": attr.field(type=str), "llm_config": attr.field(type=llm_config.__class__, default=llm_config, converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), "adapter_name": attr.field(default=None, type=str)})
@attr.frozen(slots=True)
class GenerationOutput:
responses: t.List[t.Any]

View File

@@ -1,10 +1,10 @@
# mypy: disable-error-code="no-redef"
from __future__ import annotations
import functools, inspect, logging, math, os, sys, types, typing as t, warnings, psutil, bentoml
from bentoml._internal.resource import get_resource, system_resources
from bentoml._internal.runner.strategy import THREAD_ENVS
from .utils import DEBUG, ReprMixin
if sys.version_info[:2] >= (3, 11): from typing import overload
else: from typing_extensions import overload
from ._typing_compat import overload
class DynResource(t.Protocol):
resource_id: t.ClassVar[str]
@@ -176,10 +176,7 @@ def _validate(cls: type[DynResource], val: list[t.Any]) -> None:
except (ImportError, RuntimeError):
pass
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]:
return types.new_class(
name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"}),
)
def _make_resource_class(name: str, resource_kind: str, docstring: str) -> type[DynResource]: return types.new_class(name, (bentoml.Resource[t.List[str]], ReprMixin), {"resource_id": resource_kind}, lambda ns: ns.update({"resource_id": resource_kind, "from_spec": classmethod(_from_spec), "from_system": classmethod(_from_system), "validate": classmethod(_validate), "__repr_keys__": property(lambda _: {"resource_id"}), "__doc__": inspect.cleandoc(docstring), "__module__": "openllm._strategies"}))
# NOTE: we need to hint these t.Literal since mypy is too dumb to infer this as literal :facepalm:
_TPU_RESOURCE: t.Literal["cloud-tpus.google.com/v2"] = "cloud-tpus.google.com/v2"

View File

@@ -1,13 +1,14 @@
# mypy: disable-error-code="type-arg,valid-type"
from __future__ import annotations
import sys, typing as t, bentoml, attr, abc
from bentoml._internal.types import ModelSignatureDict as ModelSignatureDict
if t.TYPE_CHECKING:
import openllm, peft, transformers, auto_gptq as autogptq, vllm
from bentoml._internal.runner.runnable import RunnableMethod
from bentoml._internal.runner.runner import RunnerMethod
from bentoml._internal.runner.strategy import Strategy
from .bundle.oci import LiteralContainerVersionStrategy
from .utils.lazy import VersionInfo
M = t.TypeVar("M", bound="t.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, vllm.AsyncLLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]")
@@ -23,6 +24,10 @@ At = t.TypeVar("At", bound=attr.AttrsInstance)
LiteralRuntime = t.Literal["pt", "tf", "flax", "vllm"]
AdapterType = t.Literal["lora", "adalora", "adaption_prompt", "prefix_tuning", "p_tuning", "prompt_tuning", "ia3"]
# TODO: support quay
LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"]
LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"]
if sys.version_info[:2] >= (3,11):
from typing import LiteralString as LiteralString, Self as Self, overload as overload
from typing import NotRequired as NotRequired, Required as Required, dataclass_transform as dataclass_transform
@@ -35,12 +40,7 @@ if sys.version_info[:2] >= (3,10):
else:
from typing_extensions import TypeAlias as TypeAlias, ParamSpec as ParamSpec, Concatenate as Concatenate
if sys.version_info[:2] >= (3,9):
from typing import TypedDict as TypedDict
else:
from typing_extensions import TypedDict as TypedDict
class PeftAdapterOutput(TypedDict):
class PeftAdapterOutput(t.TypedDict):
success: bool
result: t.Dict[str, peft.PeftConfig]
error_msg: str

View File

@@ -0,0 +1,13 @@
from __future__ import annotations
from .configuration_auto import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
from .configuration_baichuan import BaichuanConfig as BaichuanConfig, START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING
from .configuration_chatglm import ChatGLMConfig as ChatGLMConfig, START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING
from .configuration_dolly_v2 import DollyV2Config as DollyV2Config, START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING
from .configuration_falcon import FalconConfig as FalconConfig, START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING
from .configuration_flan_t5 import FlanT5Config as FlanT5Config, START_FLAN_T5_COMMAND_DOCSTRING as START_FLAN_T5_COMMAND_DOCSTRING
from .configuration_gpt_neox import GPTNeoXConfig as GPTNeoXConfig, START_GPT_NEOX_COMMAND_DOCSTRING as START_GPT_NEOX_COMMAND_DOCSTRING
from .configuration_llama import LlamaConfig as LlamaConfig, START_LLAMA_COMMAND_DOCSTRING as START_LLAMA_COMMAND_DOCSTRING
from .configuration_mpt import MPTConfig as MPTConfig, START_MPT_COMMAND_DOCSTRING as START_MPT_COMMAND_DOCSTRING
from .configuration_opt import OPTConfig as OPTConfig, START_OPT_COMMAND_DOCSTRING as START_OPT_COMMAND_DOCSTRING
from .configuration_stablelm import StableLMConfig as StableLMConfig, START_STABLELM_COMMAND_DOCSTRING as START_STABLELM_COMMAND_DOCSTRING
from .configuration_starcoder import StarCoderConfig as StarCoderConfig, START_STARCODER_COMMAND_DOCSTRING as START_STARCODER_COMMAND_DOCSTRING

View File

@@ -1,18 +1,16 @@
# mypy: disable-error-code="type-arg"
from __future__ import annotations
import typing as t
import inflection, openllm_core, importlib, typing as t
from collections import OrderedDict
import inflection, openllm
from openllm.utils import ReprMixin
from openllm_core.utils import ReprMixin
if t.TYPE_CHECKING:
import types
from openllm._typing_compat import LiteralString
from openllm_core._typing_compat import LiteralString
from collections import _odict_items, _odict_keys, _odict_values
ConfigKeysView = _odict_keys[str, type[openllm.LLMConfig]]
ConfigValuesView = _odict_values[str, type[openllm.LLMConfig]]
ConfigItemsView = _odict_items[str, type[openllm.LLMConfig]]
ConfigKeysView = _odict_keys[str, type[openllm_core.LLMConfig]]
ConfigValuesView = _odict_values[str, type[openllm_core.LLMConfig]]
ConfigItemsView = _odict_items[str, type[openllm_core.LLMConfig]]
# NOTE: This is the entrypoint when adding new model config
CONFIG_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLMConfig"), ("dolly_v2", "DollyV2Config"), ("falcon", "FalconConfig"), ("flan_t5", "FlanT5Config"), ("gpt_neox", "GPTNeoXConfig"), ("llama", "LlamaConfig"), ("mpt", "MPTConfig"), ("opt", "OPTConfig"), ("stablelm", "StableLMConfig"), ("starcoder", "StarCoderConfig"), ("baichuan", "BaichuanConfig")])
@@ -28,10 +26,10 @@ class _LazyConfigMapping(OrderedDict, ReprMixin):
if inflection.underscore(key) in self._mapping: return self.__getitem__(inflection.underscore(key))
raise KeyError(key)
value, module_name = self._mapping[key], inflection.underscore(key)
if module_name not in self._modules: self._modules[module_name] = openllm.utils.EnvVarMixin(module_name).module
if module_name not in self._modules: self._modules[module_name] = openllm_core.utils.EnvVarMixin(module_name).module
if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value)
# Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the object at the top level.
return getattr(openllm, value)
return getattr(importlib.import_module("openllm"), value)
@property
def __repr_keys__(self) -> set[str]: return set(self._mapping.keys())
def __repr__(self) -> str: return ReprMixin.__repr__(self)
@@ -45,19 +43,19 @@ class _LazyConfigMapping(OrderedDict, ReprMixin):
if key in self._mapping.keys(): raise ValueError(f"'{key}' is already used by a OpenLLM config, pick another name.")
self._extra_content[key] = value
CONFIG_MAPPING: dict[str, type[openllm.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
CONFIG_MAPPING: dict[str, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES)
# The below handle special alias when we call underscore to the name directly without processing camelcase first.
CONFIG_NAME_ALIASES: dict[str, str] = {"chat_glm": "chatglm", "stable_lm": "stablelm", "star_coder": "starcoder", "gpt_neo_x": "gpt_neox",}
class AutoConfig:
def __init__(self, *_: t.Any, **__: t.Any): raise EnvironmentError("Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.")
@classmethod
def for_model(cls, model_name: str, **attrs: t.Any) -> openllm.LLMConfig:
def for_model(cls, model_name: str, **attrs: t.Any) -> openllm_core.LLMConfig:
model_name = inflection.underscore(model_name)
if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name].model_construct_env(**attrs)
raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.")
@classmethod
def infer_class_from_name(cls, name: str) -> type[openllm.LLMConfig]:
def infer_class_from_name(cls, name: str) -> type[openllm_core.LLMConfig]:
model_name = inflection.underscore(name)
if model_name in CONFIG_NAME_ALIASES: model_name = CONFIG_NAME_ALIASES[model_name]
if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name]

View File

@@ -1,22 +1,6 @@
from __future__ import annotations
import openllm
class BaichuanConfig(openllm.LLMConfig):
"""Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
Baichuan-7B is based on Transformer architecture,
which contains 7 billion parameters and trained on approximately 1.2 trillion tokens.
It supports both Chinese and English languages with a context window length of 4096.
It has achieved the best performance among models of the same size on standard Chinese
and English benchmarks (C-Eval, MMLU, etc).
Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
"""
__config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/baichuan-inc/Baichuan-7B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "BaiChuanForCausalLM",
"default_id": "baichuan-inc/baichuan-7b", "model_ids": ["baichuan-inc/baichuan-7b", "baichuan-inc/baichuan-13b-base", "baichuan-inc/baichuan-13b-chat", "fireballoon/baichuan-vicuna-chinese-7b", "fireballoon/baichuan-vicuna-7b", "hiyouga/baichuan-7b-sft"]}
class GenerationConfig:
max_new_tokens: int = 2048
top_p: float = 0.7
temperature: float = 0.95
import openllm_core, typing as t
from openllm_core._prompt import process_prompt
START_BAICHUAN_COMMAND_DOCSTRING = """\
Run a LLMServer for Baichuan model.
@@ -38,3 +22,24 @@ or provide `--model-id` flag when running ``openllm start baichuan``:
$ openllm start baichuan --model-id='fireballoon/baichuan-vicuna-chinese-7b'
"""
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
class BaichuanConfig(openllm_core.LLMConfig):
  """Baichuan-7B is an open-source, large-scale pre-trained language model developed by Baichuan Intelligent Technology.
  Baichuan-7B is based on Transformer architecture,
  which contains 7 billion parameters and trained on approximately 1.2 trillion tokens.
  It supports both Chinese and English languages with a context window length of 4096.
  It has achieved the best performance among models of the same size on standard Chinese
  and English benchmarks (C-Eval, MMLU, etc).
  Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information.
  """
  # Model-level settings, one key per line to keep diffs small. Key order is unchanged.
  __config__ = {
      "name_type": "lowercase",
      "trust_remote_code": True,
      "timeout": 3600000,
      "requires_gpu": True,
      "url": "https://github.com/baichuan-inc/Baichuan-7B",
      "requirements": ["cpm-kernels", "sentencepiece"],
      "architecture": "BaiChuanForCausalLM",
      "default_id": "baichuan-inc/baichuan-7b",
      "model_ids": [
          "baichuan-inc/baichuan-7b",
          "baichuan-inc/baichuan-13b-base",
          "baichuan-inc/baichuan-13b-chat",
          "fireballoon/baichuan-vicuna-chinese-7b",
          "fireballoon/baichuan-vicuna-7b",
          "hiyouga/baichuan-7b-sft",
      ],
  }

  class GenerationConfig:
    # Default generation values for this model.
    max_new_tokens: int = 2048
    top_p: float = 0.7
    temperature: float = 0.95

  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and split the keyword arguments for generation.

    Returns a 3-tuple of: the prompt produced by `process_prompt` with `DEFAULT_PROMPT_TEMPLATE`,
    the generation keyword arguments (the named sampling parameters plus any extra `attrs`),
    and an empty dictionary of postprocess keyword arguments.
    """
    rendered_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
    generation_kwargs = {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}
    return rendered_prompt, generation_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
    """Return the first entry of `generation_result`; `prompt` and extra keyword arguments are ignored."""
    first_result = generation_result[0]
    return first_result

View File

@@ -1,29 +1,6 @@
from __future__ import annotations
import openllm
class ChatGLMConfig(openllm.LLMConfig):
"""ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
With the quantization technique, users can deploy locally on consumer-grade graphics cards
(only 6GB of GPU memory is required at the INT4 quantization level).
ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue.
The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning,
feedback bootstrap, and reinforcement learning wit human feedback.
With only about 6.2 billion parameters, the model is able to generate answers that are in line
with human preference.
Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
"""
__config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration",
"default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]}
retain_history: bool = openllm.LLMConfig.Field(False, description="Whether to retain history given to the model. If set to True, then the model will retain given history.")
use_half_precision: bool = openllm.LLMConfig.Field(True, description="Whether to use half precision for model.")
class GenerationConfig:
max_new_tokens: int = 2048
num_beams: int = 1
top_p: float = 0.7
temperature: float = 0.95
import openllm_core, typing as t
from openllm_core.utils import dantic
START_CHATGLM_COMMAND_DOCSTRING = """\
Run a LLMServer for ChatGLM model.
@@ -45,3 +22,42 @@ or provide `--model-id` flag when running ``openllm start chatglm``:
$ openllm start chatglm --model-id='thudm/chatglm-6b-int8'
"""
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
class ChatGLMConfig(openllm_core.LLMConfig):
  """ChatGLM is an open bilingual language model based on [General Language Model (GLM)](https://github.com/THUDM/GLM) framework.
  With the quantization technique, users can deploy locally on consumer-grade graphics cards
  (only 6GB of GPU memory is required at the INT4 quantization level).
  ChatGLM-6B uses technology similar to ChatGPT, optimized for Chinese QA and dialogue.
  The model is trained for about 1T tokens of Chinese and English corpus, supplemented by supervised fine-tuning,
  feedback bootstrap, and reinforcement learning with human feedback.
  With only about 6.2 billion parameters, the model is able to generate answers that are in line
  with human preference.
  Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information.
  """
  __config__ = {"name_type": "lowercase", "trust_remote_code": True, "timeout": 3600000, "requires_gpu": True, "url": "https://github.com/THUDM/ChatGLM-6B", "requirements": ["cpm-kernels", "sentencepiece"], "architecture": "ChatGLMForConditionalGeneration",
                "default_id": "thudm/chatglm-6b", "model_ids": ["thudm/chatglm-6b", "thudm/chatglm-6b-int8", "thudm/chatglm-6b-int4", "thudm/chatglm2-6b", "thudm/chatglm2-6b-int4"]}
  retain_history: bool = dantic.Field(False, description="Whether to retain history given to the model. If set to True, then the model will retain given history.")
  use_half_precision: bool = dantic.Field(True, description="Whether to use half precision for model.")

  class GenerationConfig:
    max_new_tokens: int = 2048
    num_beams: int = 1
    top_p: float = 0.7
    temperature: float = 0.95

  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt text and split keyword arguments for generation and postprocessing.

    When `use_default_prompt_template` is true and `chat_history` is given, every
    (query, response) pair is rendered as a numbered round and the new prompt is
    appended as the final, unanswered round. Otherwise the prompt is used as-is.

    Returns a 3-tuple of: the prompt text, the generation keyword arguments
    (named sampling parameters plus any extra `attrs`), and the postprocess
    keyword arguments (`chat_history`, which may be None).
    """
    if use_default_prompt_template and chat_history is not None:
      rounds = [f"[Round {i}]\n问:{old_query}\n答:{response}\n" for i, (old_query, response) in enumerate(chat_history)]
      rounds.append(f"[Round {len(chat_history)}]\n问:{prompt}\n答:")
      prompt_text = "".join(rounds)
    else:
      prompt_text = prompt
    generate_kwargs = {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}
    # `chat_history` is forwarded unchanged (possibly None) so postprocess_generate can extend it in place.
    return prompt_text, generate_kwargs, {"chat_history": chat_history}

  def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str:
    """Return the generated text, optionally recording the returned history.

    `generation_result` is unpacked as `(generated, history)`. When the
    `retain_history` field is enabled, `history` is appended to the caller's
    `chat_history` list in place.

    Raises:
      ValueError: if `retain_history` is enabled but no `chat_history` was passed.
    """
    generated, history = generation_result
    # This hook is a method of the config itself, so the field is read directly from `self`.
    # The previous `self.config.retain_history` relied on a `config` attribute that this class does not define.
    # NOTE(review): confirm the LLMConfig base exposes no `config` attribute either.
    if self.retain_history:
      if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.")
      chat_history.extend(history)
    return generated

View File

@@ -1,29 +1,9 @@
from __future__ import annotations
import typing as t, openllm
import typing as t, openllm_core
from openllm_core._prompt import process_prompt
from openllm_core.utils import dantic
if t.TYPE_CHECKING: import transformers
class DollyV2Config(openllm.LLMConfig):
"""Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k
generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming,
classification, closed QA, generation, information extraction, open QA and summarization.
dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction
following behavior not characteristic of the foundation model on which it is based.
Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
"""
__config__ = {"timeout": 3600000, "url": "https://github.com/databrickslabs/dolly", "architecture": "GPTNeoXForCausalLM",
"default_id": "databricks/dolly-v2-3b", "model_ids": ["databricks/dolly-v2-3b", "databricks/dolly-v2-7b", "databricks/dolly-v2-12b"]}
return_full_text: bool = openllm.LLMConfig.Field(False, description="Whether to return the full prompt to the users.")
class GenerationConfig:
temperature: float = 0.9
top_p: float = 0.92
top_k: int = 5
max_new_tokens: int = 256
eos_token_id: int = 50277 # NOTE: from get_special_token_id(self.tokenizer, END_KEY)
START_DOLLY_V2_COMMAND_DOCSTRING = """\
Run a LLMServer for dolly-v2 model.
@@ -74,3 +54,28 @@ def get_special_token_id(tokenizer: transformers.PreTrainedTokenizer, key: str)
token_ids = tokenizer.encode(key)
if len(token_ids) > 1: raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
return token_ids[0]
class DollyV2Config(openllm_core.LLMConfig):
  """Databricks` Dolly is an instruction-following large language model trained on the Databricks machine learning platform that is licensed for commercial use.
  Based on pythia-12b, Dolly is trained on ~15k instruction/response fine tuning records databricks-dolly-15k
  generated by Databricks employees in capability domains from the InstructGPT paper, including brainstorming,
  classification, closed QA, generation, information extraction, open QA and summarization.
  dolly-v2-12b is not a state-of-the-art model, but does exhibit surprisingly high quality instruction
  following behavior not characteristic of the foundation model on which it is based.
  Refer to [Databricks's Dolly page](https://github.com/databrickslabs/dolly) for more information.
  """
  # Model-level settings, one key per line. Key order is unchanged.
  __config__ = {
      "timeout": 3600000,
      "url": "https://github.com/databrickslabs/dolly",
      "architecture": "GPTNeoXForCausalLM",
      "default_id": "databricks/dolly-v2-3b",
      "model_ids": [
          "databricks/dolly-v2-3b",
          "databricks/dolly-v2-7b",
          "databricks/dolly-v2-12b",
      ],
  }
  return_full_text: bool = dantic.Field(False, description="Whether to return the full prompt to the users.")

  class GenerationConfig:
    # Default generation values for this model.
    temperature: float = 0.9
    top_p: float = 0.92
    top_k: int = 5
    max_new_tokens: int = 256
    eos_token_id: int = 50277  # NOTE: from get_special_token_id(self.tokenizer, END_KEY)

  def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and split the keyword arguments for generation.

    Returns a 3-tuple of: the prompt produced by `process_prompt` with `DEFAULT_PROMPT_TEMPLATE`,
    the generation keyword arguments (the named sampling parameters plus any extra `attrs`),
    and an empty dictionary of postprocess keyword arguments.
    """
    rendered_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
    generation_kwargs = {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}
    return rendered_prompt, generation_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str:
    """Return the `generated_text` value of the first entry in `generation_result`."""
    first_entry = generation_result[0]
    return first_entry["generated_text"]

View File

@@ -1,22 +1,6 @@
from __future__ import annotations
import openllm
class FalconConfig(openllm.LLMConfig):
"""Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
It is made available under the TII Falcon LLM License.
Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
"""
__config__ = {"name_type": "lowercase", "trust_remote_code": True, "requires_gpu": True, "timeout": int(36e6), "url": "https://falconllm.tii.ae/", "requirements": ["einops", "xformers"], "architecture": "FalconForCausalLM",
"default_id": "tiiuae/falcon-7b", "model_ids": ["tiiuae/falcon-7b", "tiiuae/falcon-40b", "tiiuae/falcon-7b-instruct", "tiiuae/falcon-40b-instruct"],
"fine_tune_strategies": ({"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none", "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]},)}
class GenerationConfig:
max_new_tokens: int = 200
top_k: int = 10
num_return_sequences: int = 1
num_beams: int = 4
early_stopping: bool = True
import openllm_core, typing as t
from openllm_core._prompt import process_prompt
START_FALCON_COMMAND_DOCSTRING = """\
Run a LLMServer for FalconLM model.
@@ -43,3 +27,24 @@ DEFAULT_PROMPT_TEMPLATE = """{context}
{user_name}: {instruction}
{agent}:
"""
class FalconConfig(openllm_core.LLMConfig):
  """Falcon-7B is a 7B parameters causal decoder-only model built by TII and trained on 1,500B tokens
  of [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb) enhanced with curated corpora.
  It is made available under the TII Falcon LLM License.

  Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "trust_remote_code": True,
    "requires_gpu": True,
    "timeout": int(36e6),
    "url": "https://falconllm.tii.ae/",
    "requirements": ["einops", "xformers"],
    "architecture": "FalconForCausalLM",
    "default_id": "tiiuae/falcon-7b",
    "model_ids": [
      "tiiuae/falcon-7b",
      "tiiuae/falcon-40b",
      "tiiuae/falcon-7b-instruct",
      "tiiuae/falcon-40b-instruct",
    ],
    "fine_tune_strategies": (
      {
        "adapter_type": "lora",
        "r": 64,
        "lora_alpha": 16,
        "lora_dropout": 0.1,
        "bias": "none",
        "target_modules": ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],
      },
    ),
  }

  class GenerationConfig:
    # Default generation settings for Falcon.
    max_new_tokens: int = 200
    top_k: int = 10
    num_return_sequences: int = 1
    num_beams: int = 4
    early_stopping: bool = True

  def sanitize_parameters(
    self,
    prompt: str,
    max_new_tokens: int | None = None,
    top_k: int | None = None,
    num_return_sequences: int | None = None,
    eos_token_id: int | None = None,
    use_default_prompt_template: bool = False,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and the generation keyword arguments for a request.

    Extra keywords in ``attrs`` go both to ``process_prompt`` and into the generation kwargs.

    Returns:
      A 3-tuple of the value returned by ``process_prompt``, the generation kwargs, and an empty dict
      (presumably post-processing kwargs; confirm against the caller).
    """
    formatted_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
    generate_kwargs: dict[str, t.Any] = {
      "max_new_tokens": max_new_tokens,
      "top_k": top_k,
      "num_return_sequences": num_return_sequences,
      "eos_token_id": eos_token_id,
    }
    # Extra keywords come last, mirroring a trailing ``**attrs`` in a dict literal.
    generate_kwargs.update(attrs)
    return formatted_prompt, generate_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
    """Return the first generated sequence; ``prompt`` and extra keywords are unused."""
    first_sequence = generation_result[0]
    return first_sequence

View File

@@ -1,21 +1,6 @@
from __future__ import annotations
import openllm
class FlanT5Config(openllm.LLMConfig):
  """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
  It is an enhanced version of T5 that has been finetuned in a mixture of tasks.

  Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
    "architecture": "T5ForConditionalGeneration",
    "model_type": "seq2seq_lm",
    "default_id": "google/flan-t5-large",
    "model_ids": [
      "google/flan-t5-small",
      "google/flan-t5-base",
      "google/flan-t5-large",
      "google/flan-t5-xl",
      "google/flan-t5-xxl",
    ],
  }

  class GenerationConfig:
    # Default generation settings for FLAN-T5.
    temperature: float = 0.9
    max_new_tokens: int = 2048
    top_k: int = 50
    top_p: float = 0.4
    # NOTE(review): unlike its siblings this field carries no annotation; confirm that is intentional.
    repetition_penalty = 1.0
import openllm_core, typing as t
from openllm_core._prompt import process_prompt
START_FLAN_T5_COMMAND_DOCSTRING = """\
Run a LLMServer for FLAN-T5 model.
@@ -43,3 +28,23 @@ or provide `--model-id` flag when running ``openllm start flan-t5``:
$ openllm start flan-t5 --model-id google/flan-t5-xxl
"""
# Question/answer prompt with a single ``{instruction}`` placeholder; FlanT5Config.sanitize_parameters passes it to process_prompt.
DEFAULT_PROMPT_TEMPLATE = """Answer the following question:\nQuestion: {instruction}\nAnswer:"""
class FlanT5Config(openllm_core.LLMConfig):
  """FLAN-T5 was released in the paper [Scaling Instruction-Finetuned Language Models](https://arxiv.org/pdf/2210.11416.pdf).
  It is an enhanced version of T5 that has been finetuned in a mixture of tasks.

  Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "url": "https://huggingface.co/docs/transformers/model_doc/flan-t5",
    "architecture": "T5ForConditionalGeneration",
    "model_type": "seq2seq_lm",
    "default_id": "google/flan-t5-large",
    "model_ids": [
      "google/flan-t5-small",
      "google/flan-t5-base",
      "google/flan-t5-large",
      "google/flan-t5-xl",
      "google/flan-t5-xxl",
    ],
  }

  class GenerationConfig:
    # Default generation settings for FLAN-T5.
    temperature: float = 0.9
    max_new_tokens: int = 2048
    top_k: int = 50
    top_p: float = 0.4
    # NOTE(review): unlike its siblings this field carries no annotation; confirm that is intentional.
    repetition_penalty = 1.0

  def sanitize_parameters(
    self,
    prompt: str,
    max_new_tokens: int | None = None,
    temperature: float | None = None,
    top_k: int | None = None,
    top_p: float | None = None,
    repetition_penalty: float | None = None,
    use_default_prompt_template: bool = True,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and the generation keyword arguments for a request.

    ``attrs`` is forwarded to ``process_prompt`` only; it is not merged into the generation kwargs.

    Returns:
      A 3-tuple of the value returned by ``process_prompt``, the generation kwargs, and an empty dict
      (presumably post-processing kwargs; confirm against the caller).
    """
    formatted_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
    generate_kwargs: dict[str, t.Any] = {
      "max_new_tokens": max_new_tokens,
      "temperature": temperature,
      "top_k": top_k,
      "top_p": top_p,
      "repetition_penalty": repetition_penalty,
    }
    return formatted_prompt, generate_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
    """Return the first generated sequence; ``prompt`` and extra keywords are unused."""
    first_sequence = generation_result[0]
    return first_sequence

View File

@@ -1,28 +1,7 @@
from __future__ import annotations
import openllm
class GPTNeoXConfig(openllm.LLMConfig):
  """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely
  and openly available to the public through a permissive license.
  It is, to the best of our knowledge, the largest dense autoregressive model
  that has publicly available weights at the time of submission. The training and evaluation code,
  as well as the model weights, can be found at https://github.com/EleutherAI/gpt-neox.

  GPTNeoX has been used to fine-tune on various models, such as Dolly, StableLM, and Pythia.

  Note that OpenLLM provides first-class support for all of the aforementioned models. Users can
  also use `openllm start gpt-neox` to run all of the GPTNeoX variants.

  Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
  for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "model_name": "gpt_neox",
    "start_name": "gpt-neox",
    "requires_gpu": True,
    "architecture": "GPTNeoXForCausalLM",
    "url": "https://github.com/EleutherAI/gpt-neox",
    "default_id": "eleutherai/gpt-neox-20b",
    "model_ids": ["eleutherai/gpt-neox-20b"],
  }
  use_half_precision: bool = openllm.LLMConfig.Field(
    True,
    description="Whether to use half precision for model.",
  )

  class GenerationConfig:
    # Default generation settings for GPTNeoX.
    temperature: float = 0.9
    max_new_tokens: int = 100
import openllm_core, typing as t
from openllm_core._prompt import process_prompt
from openllm_core.utils import dantic
START_GPT_NEOX_COMMAND_DOCSTRING = """\
Run a LLMServer for GPTNeoX model.
@@ -44,3 +23,29 @@ or provide `--model-id` flag when running ``openllm start gpt-neox``:
$ openllm start gpt-neox --model-id 'stabilityai/stablelm-tuned-alpha-3b'
"""
# Template consisting solely of the ``{instruction}`` placeholder; GPTNeoXConfig.sanitize_parameters passes it to process_prompt.
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
class GPTNeoXConfig(openllm_core.LLMConfig):
  """GPTNeoX is an autoregressive language model trained on the Pile, whose weights will be made freely
  and openly available to the public through a permissive license.
  It is, to the best of our knowledge, the largest dense autoregressive model
  that has publicly available weights at the time of submission. The training and evaluation code,
  as well as the model weights, can be found at https://github.com/EleutherAI/gpt-neox.

  GPTNeoX has been used to fine-tune on various models, such as Dolly, StableLM, and Pythia.

  Note that OpenLLM provides first-class support for all of the aforementioned models. Users can
  also use `openllm start gpt-neox` to run all of the GPTNeoX variants.

  Refer to [GPTNeoX's model card](https://huggingface.co/docs/transformers/model_doc/gpt_neox)
  for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "model_name": "gpt_neox",
    "start_name": "gpt-neox",
    "requires_gpu": True,
    "architecture": "GPTNeoXForCausalLM",
    "url": "https://github.com/EleutherAI/gpt-neox",
    "default_id": "eleutherai/gpt-neox-20b",
    "model_ids": ["eleutherai/gpt-neox-20b"],
  }
  use_half_precision: bool = dantic.Field(
    True,
    description="Whether to use half precision for model.",
  )

  class GenerationConfig:
    # Default generation settings for GPTNeoX.
    temperature: float = 0.9
    max_new_tokens: int = 100

  def sanitize_parameters(
    self,
    prompt: str,
    temperature: float | None = None,
    max_new_tokens: int | None = None,
    use_default_prompt_template: bool = True,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and the generation keyword arguments for a request.

    ``attrs`` is forwarded to ``process_prompt`` only; it is not merged into the generation kwargs.

    Returns:
      A 3-tuple of the value returned by ``process_prompt``, the generation kwargs, and an empty dict
      (presumably post-processing kwargs; confirm against the caller).
    """
    formatted_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
    generate_kwargs: dict[str, t.Any] = {"max_new_tokens": max_new_tokens, "temperature": temperature}
    return formatted_prompt, generate_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
    """Return the first generated sequence; ``prompt`` and extra keywords are unused."""
    first_sequence = generation_result[0]
    return first_sequence

View File

@@ -1,32 +1,7 @@
from __future__ import annotations
import typing as t, openllm
class LlamaConfig(openllm.LLMConfig):
  """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
  It is a collection of foundation language models ranging from 7B to 65B parameters.

  Llama also includes support for the recently proposed [Llama-2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)

  Note that all variants of Llama including fine-tuning, quantisation format are all supported with ``openllm.Llama``.

  Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
  for more information.
  """
  use_llama2_prompt: bool = openllm.LLMConfig.Field(
    False,
    description="Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.",
  )
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "url": "https://github.com/facebookresearch/llama",
    "default_implementation": {"cpu": "pt", "nvidia.com/gpu": "pt"},
    "architecture": "LlamaForCausalLM",
    "requirements": ["fairscale", "sentencepiece"],
    "tokenizer_class": "LlamaTokenizerFast",
    "default_id": "NousResearch/llama-2-7b-hf",
    "model_ids": [
      "meta-llama/Llama-2-70b-chat-hf",
      "meta-llama/Llama-2-13b-chat-hf",
      "meta-llama/Llama-2-7b-chat-hf",
      "meta-llama/Llama-2-70b-hf",
      "meta-llama/Llama-2-13b-hf",
      "meta-llama/Llama-2-7b-hf",
      "NousResearch/llama-2-70b-chat-hf",
      "NousResearch/llama-2-13b-chat-hf",
      "NousResearch/llama-2-7b-chat-hf",
      "NousResearch/llama-2-70b-hf",
      "NousResearch/llama-2-13b-hf",
      "NousResearch/llama-2-7b-hf",
      "openlm-research/open_llama_7b_v2",
      "openlm-research/open_llama_3b_v2",
      "openlm-research/open_llama_13b",
      "huggyllama/llama-65b",
      "huggyllama/llama-30b",
      "huggyllama/llama-13b",
      "huggyllama/llama-7b",
    ],
    "fine_tune_strategies": (
      {"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"},
    ),
  }

  class GenerationConfig:
    # Default generation settings for Llama.
    max_new_tokens: int = 128
    temperature: float = 0.6
    top_p: float = 0.9
    top_k: int = 12

  class SamplingParams:
    # Default sampling settings for Llama.
    best_of: int = 1
    presence_penalty: float = 0.5
import typing as t, openllm_core
from openllm_core._prompt import process_prompt
from openllm_core.utils import dantic
START_LLAMA_COMMAND_DOCSTRING = """\
Run a LLMServer for Llama model.
@@ -68,3 +43,33 @@ _v1_prompt, _v2_prompt = """{instruction}""", """{start_key} {sys_key}\n{system_
# Llama prompt templates keyed by prompt version.
PROMPT_MAPPING = {
  "v1": _v1_prompt,
  "v2": _v2_prompt,
}

def _get_prompt(model_type: t.Literal["v1", "v2"]) -> str:
  """Look up the prompt template registered for ``model_type``.

  Raises:
    KeyError: ``model_type`` is not a key of ``PROMPT_MAPPING``.
  """
  return PROMPT_MAPPING[model_type]

# For Llama the default template is this resolver function, not a string: callers invoke it with the prompt version.
DEFAULT_PROMPT_TEMPLATE = _get_prompt
class LlamaConfig(openllm_core.LLMConfig):
  """LLaMA model was proposed in [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
  It is a collection of foundation language models ranging from 7B to 65B parameters.

  Llama also includes support for the recently proposed [Llama-2](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/)

  Note that all variants of Llama including fine-tuning, quantisation format are all supported with ``openllm.Llama``.

  Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama)
  for more information.
  """
  use_llama2_prompt: bool = dantic.Field(
    False,
    description="Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.",
  )
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "url": "https://github.com/facebookresearch/llama",
    "default_implementation": {"cpu": "pt", "nvidia.com/gpu": "pt"},
    "architecture": "LlamaForCausalLM",
    "requirements": ["fairscale", "sentencepiece"],
    "tokenizer_class": "LlamaTokenizerFast",
    "default_id": "NousResearch/llama-2-7b-hf",
    "model_ids": [
      "meta-llama/Llama-2-70b-chat-hf",
      "meta-llama/Llama-2-13b-chat-hf",
      "meta-llama/Llama-2-7b-chat-hf",
      "meta-llama/Llama-2-70b-hf",
      "meta-llama/Llama-2-13b-hf",
      "meta-llama/Llama-2-7b-hf",
      "NousResearch/llama-2-70b-chat-hf",
      "NousResearch/llama-2-13b-chat-hf",
      "NousResearch/llama-2-7b-chat-hf",
      "NousResearch/llama-2-70b-hf",
      "NousResearch/llama-2-13b-hf",
      "NousResearch/llama-2-7b-hf",
      "openlm-research/open_llama_7b_v2",
      "openlm-research/open_llama_3b_v2",
      "openlm-research/open_llama_13b",
      "huggyllama/llama-65b",
      "huggyllama/llama-30b",
      "huggyllama/llama-13b",
      "huggyllama/llama-7b",
    ],
    "fine_tune_strategies": (
      {"adapter_type": "lora", "r": 64, "lora_alpha": 16, "lora_dropout": 0.1, "bias": "none"},
    ),
  }

  class GenerationConfig:
    # Default generation settings for Llama.
    max_new_tokens: int = 128
    temperature: float = 0.6
    top_p: float = 0.9
    top_k: int = 12

  class SamplingParams:
    # Default sampling settings for Llama.
    best_of: int = 1
    presence_penalty: float = 0.5

  def sanitize_parameters(
    self,
    prompt: str,
    top_k: int | None = None,
    top_p: float | None = None,
    temperature: float | None = None,
    max_new_tokens: int | None = None,
    use_default_prompt_template: bool = False,
    use_llama2_prompt: bool = True,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and the generation keyword arguments for a request.

    When ``use_default_prompt_template`` is set, the template is resolved through ``DEFAULT_PROMPT_TEMPLATE``
    with ``"v2"`` (``use_llama2_prompt`` truthy) or ``"v1"``; otherwise ``None`` is passed as the template.
    ``attrs`` is forwarded to ``process_prompt`` only.

    Returns:
      A 3-tuple of the value returned by ``process_prompt``, the generation kwargs, and an empty dict
      (presumably post-processing kwargs; confirm against the caller).
    """
    # NOTE(review): this parameter defaults to True while the ``use_llama2_prompt`` field defaults to False; confirm which is intended.
    template = None
    if use_default_prompt_template:
      prompt_version = "v2" if use_llama2_prompt else "v1"
      template = DEFAULT_PROMPT_TEMPLATE(prompt_version)
    formatted_prompt = process_prompt(prompt, template, use_default_prompt_template, **attrs)
    generate_kwargs: dict[str, t.Any] = {
      "max_new_tokens": max_new_tokens,
      "temperature": temperature,
      "top_p": top_p,
      "top_k": top_k,
    }
    return formatted_prompt, generate_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
    """Return the first generated sequence; ``prompt`` and extra keywords are unused."""
    first_sequence = generation_result[0]
    return first_sequence

View File

@@ -1,28 +1,9 @@
from __future__ import annotations
import sys, typing as t
import typing as t, openllm_core
from openllm_core.utils import dantic
from openllm_core._prompt import process_prompt
import openllm
# Only type checkers see the precise Literal of prompt-type names; at runtime the alias is plain ``str``.
if t.TYPE_CHECKING: MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
else: MPTPromptType = str
class MPTConfig(openllm.LLMConfig):
  """MPT is a decoder-style transformer pretrained from scratch on English text and code.

  This model was trained by [MosaicML](https://www.mosaicml.com/).

  ``openllm.MPT`` encapsulates a family of MPT variants that are publicly available
  on HuggingFace. Refer to [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
  for more details on specific models.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "trust_remote_code": True,
    "url": "https://huggingface.co/mosaicml",
    "timeout": int(36e6),
    "requirements": ["triton", "einops"],
    "architecture": "MPTForCausalLM",
    "default_id": "mosaicml/mpt-7b-instruct",
    "model_ids": [
      "mosaicml/mpt-7b",
      "mosaicml/mpt-7b-instruct",
      "mosaicml/mpt-7b-chat",
      "mosaicml/mpt-7b-storywriter",
      "mosaicml/mpt-30b",
      "mosaicml/mpt-30b-instruct",
      "mosaicml/mpt-30b-chat",
    ],
  }
  # NOTE(review): the default value is the text ``"default"`` including the double quotes; confirm that is intentional.
  prompt_type: MPTPromptType = openllm.LLMConfig.Field(
    '"default"',
    description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.",
  )
  max_sequence_length: int = openllm.LLMConfig.Field(
    2048,
    description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)",
  )

  class GenerationConfig:
    # Default generation settings for MPT.
    max_new_tokens: int = 128
    temperature: float = 0
    top_p: float = 0.8
# Names of the prompt styles MPT accepts; the same names are the keys of PROMPT_MAPPING.
MPTPromptType = t.Literal["default", "instruct", "chat", "storywriter"]
START_MPT_COMMAND_DOCSTRING = """\
Run a LLMServer for MPT model.
@@ -63,3 +44,32 @@ _chat_prompt, _default_prompt, _instruct_prompt = """{instruction}""", """{instr
# MPT prompt templates keyed by prompt type; "storywriter" shares the default template.
PROMPT_MAPPING = {
  "default": _default_prompt,
  "instruct": _instruct_prompt,
  "storywriter": _default_prompt,
  "chat": _chat_prompt,
}

def _get_prompt(model_type: str) -> str:
  """Look up the prompt template registered for ``model_type``.

  Raises:
    KeyError: ``model_type`` is not a key of ``PROMPT_MAPPING``.
  """
  return PROMPT_MAPPING[model_type]

# For MPT the default template is this resolver function, not a string: callers invoke it with the prompt type.
DEFAULT_PROMPT_TEMPLATE = _get_prompt
class MPTConfig(openllm_core.LLMConfig):
  """MPT is a decoder-style transformer pretrained from scratch on English text and code.

  This model was trained by [MosaicML](https://www.mosaicml.com/).

  ``openllm.MPT`` encapsulates a family of MPT variants that are publicly available
  on HuggingFace. Refer to [HuggingFace's MosaicML page](https://huggingface.co/mosaicml)
  for more details on specific models.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "trust_remote_code": True,
    "url": "https://huggingface.co/mosaicml",
    "timeout": int(36e6),
    "requirements": ["triton", "einops"],
    "architecture": "MPTForCausalLM",
    "default_id": "mosaicml/mpt-7b-instruct",
    "model_ids": [
      "mosaicml/mpt-7b",
      "mosaicml/mpt-7b-instruct",
      "mosaicml/mpt-7b-chat",
      "mosaicml/mpt-7b-storywriter",
      "mosaicml/mpt-30b",
      "mosaicml/mpt-30b-instruct",
      "mosaicml/mpt-30b-chat",
    ],
  }
  # NOTE(review): the default value is the text ``"default"`` including the double quotes; confirm that is intentional.
  prompt_type: MPTPromptType = dantic.Field(
    '"default"',
    description="Given prompt type for running MPT. Default will be inferred from model name if pretrained.",
  )
  max_sequence_length: int = dantic.Field(
    2048,
    description="Max sequence length to run MPT with. Note that MPT is trained ith sequence length of 2048, but with [ALiBi](https://arxiv.org/abs/2108.12409) it can set up to 4096 (for 7b models) and 16384 (for 30b models)",
  )

  class GenerationConfig:
    # Default generation settings for MPT.
    max_new_tokens: int = 128
    temperature: float = 0
    top_p: float = 0.8

  def sanitize_parameters(
    self,
    prompt: str,
    max_new_tokens: int | None = None,
    temperature: float | None = None,
    top_p: float | None = None,
    prompt_type: MPTPromptType | None = None,
    use_default_prompt_template: bool = True,
    **attrs: t.Any,
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and the generation keyword arguments for a request.

    With ``use_default_prompt_template`` set and no explicit ``prompt_type``, the type is inferred from
    ``self.model_id``: the first of "instruct", "storywriter", "chat" found in it, else "default".
    ``attrs`` is accepted but not forwarded anywhere.

    Returns:
      A 3-tuple of the value returned by ``process_prompt``, the generation kwargs, and an empty dict
      (presumably post-processing kwargs; confirm against the caller).
    """
    template = None
    if use_default_prompt_template:
      if prompt_type is None:
        # NOTE(review): ``model_id`` is not declared on this class in the visible code; confirm the base class provides it.
        known_kinds = ("instruct", "storywriter", "chat")
        prompt_type = next((kind for kind in known_kinds if kind in self.model_id), "default")
      template = DEFAULT_PROMPT_TEMPLATE(prompt_type)
    formatted_prompt = process_prompt(prompt, template, use_default_prompt_template)
    generate_kwargs: dict[str, t.Any] = {"max_new_tokens": max_new_tokens, "temperature": temperature, "top_p": top_p}
    return formatted_prompt, generate_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
    """Return the first generated sequence; ``prompt`` and extra keywords are unused."""
    first_sequence = generation_result[0]
    return first_sequence

View File

@@ -1,27 +1,7 @@
from __future__ import annotations
import openllm
class OPTConfig(openllm.LLMConfig):
  """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.

  OPT was predominantly pretrained with English text, but a small amount of non-English data is still present
  within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM)
  objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using
  the self-supervised causal language modeling objective.

  Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "trust_remote_code": False,
    "url": "https://huggingface.co/docs/transformers/model_doc/opt",
    "default_id": "facebook/opt-1.3b",
    "architecture": "OPTForCausalLM",
    "model_ids": [
      "facebook/opt-125m",
      "facebook/opt-350m",
      "facebook/opt-1.3b",
      "facebook/opt-2.7b",
      "facebook/opt-6.7b",
      "facebook/opt-66b",
    ],
    "fine_tune_strategies": (
      {
        "adapter_type": "lora",
        "r": 16,
        "lora_alpha": 32,
        "target_modules": ["q_proj", "v_proj"],
        "lora_dropout": 0.05,
        "bias": "none",
      },
    ),
  }
  format_outputs: bool = openllm.LLMConfig.Field(
    False,
    description="Whether to format the outputs. This can be used when num_return_sequences > 1.",
  )

  class GenerationConfig:
    # Default generation settings for OPT.
    top_k: int = 15
    temperature: float = 0.75
    max_new_tokens: int = 1024
    num_return_sequences: int = 1
import openllm_core, typing as t
from openllm_core.utils import dantic
from openllm_core._prompt import process_prompt
START_OPT_COMMAND_DOCSTRING = """\
Run a LLMServer for OPT model.
@@ -49,3 +29,30 @@ or provide `--model-id` flag when running ``openllm start opt``:
$ openllm start opt --model-id facebook/opt-6.7b
"""
# Template consisting solely of the ``{instruction}`` placeholder; OPTConfig.sanitize_parameters passes it to process_prompt.
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
class OPTConfig(openllm_core.LLMConfig):
  """OPT was first introduced in [Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) and first released in [metaseq's repository](https://github.com/facebookresearch/metaseq) on May 3rd 2022 by Meta AI.

  OPT was predominantly pretrained with English text, but a small amount of non-English data is still present
  within the training corpus via CommonCrawl. The model was pretrained using a causal language modeling (CLM)
  objective. OPT belongs to the same family of decoder-only models like GPT-3. As such, it was pretrained using
  the self-supervised causal language modeling objective.

  Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "trust_remote_code": False,
    "url": "https://huggingface.co/docs/transformers/model_doc/opt",
    "default_id": "facebook/opt-1.3b",
    "architecture": "OPTForCausalLM",
    "model_ids": [
      "facebook/opt-125m",
      "facebook/opt-350m",
      "facebook/opt-1.3b",
      "facebook/opt-2.7b",
      "facebook/opt-6.7b",
      "facebook/opt-66b",
    ],
    "fine_tune_strategies": (
      {
        "adapter_type": "lora",
        "r": 16,
        "lora_alpha": 32,
        "target_modules": ["q_proj", "v_proj"],
        "lora_dropout": 0.05,
        "bias": "none",
      },
    ),
  }
  format_outputs: bool = dantic.Field(
    False,
    description="Whether to format the outputs. This can be used when num_return_sequences > 1.",
  )

  class GenerationConfig:
    # Default generation settings for OPT.
    top_k: int = 15
    temperature: float = 0.75
    max_new_tokens: int = 1024
    num_return_sequences: int = 1

  def sanitize_parameters(
    self,
    prompt: str,
    max_new_tokens: int | None = None,
    temperature: float | None = None,
    top_k: int | None = None,
    num_return_sequences: int | None = None,
    use_default_prompt_template: bool = False,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and the generation keyword arguments for a request.

    ``attrs`` is forwarded to ``process_prompt`` only; it is not merged into the generation kwargs.

    Returns:
      A 3-tuple of the value returned by ``process_prompt``, the generation kwargs, and an empty dict
      (presumably post-processing kwargs; confirm against the caller).
    """
    formatted_prompt = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs)
    generate_kwargs: dict[str, t.Any] = {
      "max_new_tokens": max_new_tokens,
      "temperature": temperature,
      "top_k": top_k,
      "num_return_sequences": num_return_sequences,
    }
    return formatted_prompt, generate_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **attrs: t.Any) -> str:
    """Collapse the generated sequences into a single string.

    A single sequence is returned as-is. Several sequences are joined with newlines, or rendered as a
    "Generated result:" listing when ``format_outputs`` is enabled. ``prompt`` and ``attrs`` are unused.
    """
    if len(generation_result) == 1: return generation_result[0]
    # ``format_outputs`` is declared on this config class, so ``self`` is the natural owner. A ``config``
    # attribute is still honoured when one exists, which keeps the previous lookup working unchanged.
    owner = getattr(self, "config", self)
    if owner.format_outputs: return "Generated result:\n" + "\n -".join(generation_result)
    return "\n".join(generation_result)

View File

@@ -1,27 +1,6 @@
from __future__ import annotations
import openllm
class StableLMConfig(openllm.LLMConfig):
  """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.

  It is pre-trained on a diverse collection of English datasets with a sequence
  length of 4096 to push beyond the context window limitations of existing open-source language models.

  StableLM-Tuned-Alpha is a suite of 3B and 7B parameter decoder-only language models
  built on top of the StableLM-Base-Alpha models and further fine-tuned on various chat and
  instruction-following datasets.

  Refer to [StableLM-tuned's model card](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)
  and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
  for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "url": "https://github.com/Stability-AI/StableLM",
    "architecture": "GPTNeoXForCausalLM",
    "default_id": "stabilityai/stablelm-tuned-alpha-3b",
    "model_ids": [
      "stabilityai/stablelm-tuned-alpha-3b",
      "stabilityai/stablelm-tuned-alpha-7b",
      "stabilityai/stablelm-base-alpha-3b",
      "stabilityai/stablelm-base-alpha-7b",
    ],
  }

  class GenerationConfig:
    # Default generation settings for StableLM.
    temperature: float = 0.9
    max_new_tokens: int = 128
    top_k: int = 0
    top_p: float = 0.9
import openllm_core, typing as t
from openllm_core._prompt import process_prompt
START_STABLELM_COMMAND_DOCSTRING = """\
Run a LLMServer for StableLM model.
@@ -49,3 +28,32 @@ SYSTEM_PROMPT = """<|SYSTEM|># StableLM Tuned (Alpha version)
- StableLM will refuse to participate in anything that could harm a human.
"""
# Chat-style template with ``{system_prompt}`` and ``{instruction}`` placeholders around the <|USER|> / <|ASSISTANT|> markers.
DEFAULT_PROMPT_TEMPLATE = """{system_prompt}<|USER|>{instruction}<|ASSISTANT|>"""
class StableLMConfig(openllm_core.LLMConfig):
  """StableLM-Base-Alpha is a suite of 3B and 7B parameter decoder-only language models.

  It is pre-trained on a diverse collection of English datasets with a sequence
  length of 4096 to push beyond the context window limitations of existing open-source language models.

  StableLM-Tuned-Alpha is a suite of 3B and 7B parameter decoder-only language models
  built on top of the StableLM-Base-Alpha models and further fine-tuned on various chat and
  instruction-following datasets.

  Refer to [StableLM-tuned's model card](https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b)
  and [StableLM-base's model card](https://huggingface.co/stabilityai/stablelm-base-alpha-7b)
  for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "url": "https://github.com/Stability-AI/StableLM",
    "architecture": "GPTNeoXForCausalLM",
    "default_id": "stabilityai/stablelm-tuned-alpha-3b",
    "model_ids": [
      "stabilityai/stablelm-tuned-alpha-3b",
      "stabilityai/stablelm-tuned-alpha-7b",
      "stabilityai/stablelm-base-alpha-3b",
      "stabilityai/stablelm-base-alpha-7b",
    ],
  }

  class GenerationConfig:
    # Default generation settings for StableLM.
    temperature: float = 0.9
    max_new_tokens: int = 128
    top_k: int = 0
    top_p: float = 0.9

  def sanitize_parameters(
    self,
    prompt: str,
    temperature: float | None = None,
    max_new_tokens: int | None = None,
    top_k: int | None = None,
    top_p: float | None = None,
    use_default_prompt_template: bool = False,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and the generation keyword arguments for a request.

    The template is applied only when ``self._model_id`` contains "tuned" and
    ``use_default_prompt_template`` is set; otherwise the prompt is returned unchanged.
    A ``system_prompt`` keyword overrides ``SYSTEM_PROMPT``.

    Returns:
      A 3-tuple of the prompt text, the generation kwargs, and an empty dict
      (presumably post-processing kwargs; confirm against the caller).
    """
    # NOTE(review): ``_model_id`` is not declared on this class in the visible code; confirm the base class provides it.
    apply_template = "tuned" in self._model_id and use_default_prompt_template
    prompt_text = prompt
    if apply_template:
      system_prompt = attrs.pop("system_prompt", SYSTEM_PROMPT)
      prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs)
    generate_kwargs: dict[str, t.Any] = {
      "max_new_tokens": max_new_tokens,
      "temperature": temperature,
      "top_k": top_k,
      "top_p": top_p,
    }
    return prompt_text, generate_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str:
    """Return the first generated sequence; ``prompt`` and extra keywords are unused."""
    first_sequence = generation_result[0]
    return first_sequence

View File

@@ -1,26 +1,5 @@
from __future__ import annotations
import openllm
class StarCoderConfig(openllm.LLMConfig):
  """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.

  The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150),
  [a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the
  [Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens.

  Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "requires_gpu": True,
    "url": "https://github.com/bigcode-project/starcoder",
    "architecture": "GPTBigCodeForCausalLM",
    "requirements": ["bitsandbytes"],
    "workers_per_resource": 0.5,
    "default_id": "bigcode/starcoder",
    "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"],
  }

  class GenerationConfig:
    # Default generation settings for StarCoder.
    temperature: float = 0.2
    max_new_tokens: int = 256
    min_new_tokens: int = 32
    # NOTE(review): annotated ``float`` although the default is an int; confirm the intended type.
    top_k: float = 50
    top_p: float = 0.95
    pad_token_id: int = 49152
    repetition_penalty: float = 1.2
import openllm_core, typing as t
START_STARCODER_COMMAND_DOCSTRING = """\
Run a LLMServer for StarCoder model.
@@ -43,3 +22,33 @@ $ openllm start starcoder --model-id 'bigcode/starcoder'
"""
# Template consisting solely of the ``{instruction}`` placeholder.
DEFAULT_PROMPT_TEMPLATE = """{instruction}"""
# Fill-in-the-middle marker strings. StarCoderConfig.sanitize_parameters splits the prompt on FIM_INDICATOR
# and wraps the two halves with FIM_PREFIX / FIM_SUFFIX / FIM_MIDDLE.
FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD, EOD, FIM_INDICATOR = "<fim-prefix>", "<fim-middle>", "<fim-suffix>", "<fim-pad>", "<|endoftext|>", "<FILL_HERE>"
class StarCoderConfig(openllm_core.LLMConfig):
  """The StarCoder models are 15.5B parameter models trained on 80+ programming languages from [The Stack (v1.2)](https://huggingface.co/datasets/bigcode/the-stack), with opt-out requests excluded.

  The model uses [Multi Query Attention](https://arxiv.org/abs/1911.02150),
  [a context window of 8192 tokens](https://arxiv.org/abs/2205.14135), and was trained using the
  [Fill-in-the-Middle](https://arxiv.org/abs/2207.14255) objective on 1 trillion tokens.

  Refer to [StarCoder's model card](https://huggingface.co/bigcode/starcoder) for more information.
  """
  # Declarative model metadata; the keys are interpreted by the LLMConfig base class.
  __config__ = {
    "name_type": "lowercase",
    "requires_gpu": True,
    "url": "https://github.com/bigcode-project/starcoder",
    "architecture": "GPTBigCodeForCausalLM",
    "requirements": ["bitsandbytes"],
    "workers_per_resource": 0.5,
    "default_id": "bigcode/starcoder",
    "model_ids": ["bigcode/starcoder", "bigcode/starcoderbase"],
  }

  class GenerationConfig:
    # Default generation settings for StarCoder.
    temperature: float = 0.2
    max_new_tokens: int = 256
    min_new_tokens: int = 32
    # NOTE(review): annotated ``float`` although the default is an int; confirm the intended type.
    top_k: float = 50
    top_p: float = 0.95
    pad_token_id: int = 49152
    repetition_penalty: float = 1.2

  def sanitize_parameters(
    self,
    prompt: str,
    temperature: float | None = None,
    top_p: float | None = None,
    max_new_tokens: int | None = None,
    repetition_penalty: float | None = None,
    **attrs: t.Any
  ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
    """Build the prompt and the generation keyword arguments for a request.

    A prompt containing ``FIM_INDICATOR`` is split on it and rewritten as
    ``FIM_PREFIX + prefix + FIM_SUFFIX + suffix + FIM_MIDDLE``; any other prompt is used unchanged.
    ``attrs`` is merged into the generation kwargs.

    Returns:
      A 3-tuple of the prompt text, the generation kwargs, and an empty dict
      (presumably post-processing kwargs; confirm against the caller).

    Raises:
      ValueError: The split on ``FIM_INDICATOR`` does not yield exactly two parts.
    """
    if FIM_INDICATOR not in prompt:
      prompt_text = prompt
    else:
      try:
        prefix, suffix = prompt.split(FIM_INDICATOR)
      except Exception as err:
        raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt") from err
      prompt_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
    # XXX: This value for pad_token_id is currently a hack, need more investigate why the default starcoder doesn't include the same value as santacoder EOD
    generate_kwargs: dict[str, t.Any] = {
      "temperature": temperature,
      "top_p": top_p,
      "max_new_tokens": max_new_tokens,
      "repetition_penalty": repetition_penalty,
      "pad_token_id": 49152,
    }
    # Extra keywords come last, so a caller-supplied ``pad_token_id`` replaces the hard-coded one.
    generate_kwargs.update(attrs)
    return prompt_text, generate_kwargs, {}

  def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str:
    """Return the first generated sequence; ``prompt`` and extra keywords are unused."""
    first_sequence = generation_result[0]
    return first_sequence

View File

@@ -0,0 +1,19 @@
"""Base exceptions for OpenLLM. This extends BentoML exceptions."""
from __future__ import annotations
import bentoml
class OpenLLMException(bentoml.exceptions.BentoMLException):
  """Root of the OpenLLM exception hierarchy; derives from ``BentoMLException``."""
class GpuNotAvailableError(OpenLLMException):
  """Signals that the current system has no GPU available."""
class ValidationError(OpenLLMException):
  """Signals that a validation check did not pass."""
class ForbiddenAttributeError(OpenLLMException):
  """Signals use of an ``_internal`` field."""
class MissingAnnotationAttributeError(OpenLLMException):
  """Signals that a field declared under ``openllm.LLMConfig`` lacks a type annotation."""
class MissingDependencyError(BaseException):
  """Signals that a required dependency is not installed.

  NOTE(review): this derives from ``BaseException`` rather than ``Exception``, so ``except Exception``
  handlers will not catch it. Confirm that is intentional.
  """
class Error(BaseException):
  """Generic error to raise in place of a bare ``raise``.

  NOTE(review): this derives from ``BaseException`` rather than ``Exception``, so ``except Exception``
  handlers will not catch it. Confirm that is intentional.
  """
class FineTuneStrategyNotSupportedError(OpenLLMException):
  """Signals that the requested fine-tune strategy is not supported by the given LLM."""

View File

@@ -0,0 +1,304 @@
"""Utilities function for OpenLLM.
User can import these function for convenience, but we won't ensure backward compatibility for these functions. So use with caution.
"""
from __future__ import annotations
import contextlib, functools, hashlib, logging, logging.config, os, sys, types, typing as t, openllm_core, asyncio
from pathlib import Path
from circus.exc import ConflictError
from bentoml._internal.configuration import (
DEBUG_ENV_VAR as DEBUG_ENV_VAR,
GRPC_DEBUG_ENV_VAR as _GRPC_DEBUG_ENV_VAR,
QUIET_ENV_VAR as QUIET_ENV_VAR,
get_debug_mode as _get_debug_mode,
get_quiet_mode as _get_quiet_mode,
set_quiet_mode as set_quiet_mode,
)
from bentoml._internal.models.model import ModelContext as _ModelContext
from bentoml._internal.types import LazyType as LazyType
from bentoml._internal.utils import (
LazyLoader as LazyLoader,
bentoml_cattr as bentoml_cattr,
calc_dir_size as calc_dir_size,
first_not_none as first_not_none,
pkg as pkg,
reserve_free_port as reserve_free_port,
resolve_user_filepath as resolve_user_filepath,
)
from openllm_core.utils.lazy import (
LazyModule as LazyModule,
VersionInfo as VersionInfo,
)
if t.TYPE_CHECKING:
from openllm_core._typing_compat import AnyCallable
logger = logging.getLogger(__name__)
# Try to pick up GenericAlias through ``typing``; when that import fails, fall back to an empty tuple
# so that an isinstance() check against it simply matches nothing.
try: from typing import GenericAlias as _TypingGenericAlias # type: ignore
except ImportError: _TypingGenericAlias = () # type: ignore # python < 3.9 does not have GenericAlias (list[int], tuple[str, ...] and so on)
# Tuple of the runtime types used for parameterised generics (plus types.UnionType on 3.10+).
# lenient_issubclass consults it to decide whether a TypeError from issubclass() should be swallowed.
if sys.version_info < (3, 10): _WithArgsTypes = (_TypingGenericAlias,)
else: _WithArgsTypes: t.Any = (t._GenericAlias, types.GenericAlias, types.UnionType) # type: ignore # _GenericAlias is the actual GenericAlias implementation
DEV_DEBUG_VAR = "OPENLLMDEVDEBUG"
def set_debug_mode(enabled: bool, level: int = 1) -> None:
  """Toggle debug mode by writing the relevant environment variables.

  This stands in for ``bentoml._internal.configuration.set_debug_mode`` so that unused logs are not emitted.

  Args:
    enabled: Whether debug mode should be switched on.
    level: Value stored (as a string) under ``DEV_DEBUG_VAR``; only written when ``enabled`` is true.
  """
  grpc_verbosity = "DEBUG" if enabled else "ERROR"
  if enabled:
    # The developer debug level is only ever set here, never cleared.
    os.environ[DEV_DEBUG_VAR] = str(level)
  os.environ[DEBUG_ENV_VAR] = str(enabled)
  os.environ[_GRPC_DEBUG_ENV_VAR] = grpc_verbosity
def lenient_issubclass(cls: t.Any, class_or_tuple: type[t.Any] | tuple[type[t.Any], ...] | None) -> bool:
  """``issubclass`` that tolerates non-class inputs.

  Args:
    cls: Any object; anything that is not a class yields ``False``.
    class_or_tuple: The class (or tuple of classes) to test against.

  Returns:
    ``True`` when ``cls`` is a class and a subclass of ``class_or_tuple``, ``False`` otherwise.

  Raises:
    TypeError: Re-raised from ``issubclass`` unless ``cls`` is one of the generic-alias types in ``_WithArgsTypes``.
  """
  if not isinstance(cls, type):
    return False
  try:
    return issubclass(cls, class_or_tuple) # type: ignore[arg-type]
  except TypeError:
    # Parameterised generics can pass the isinstance(type) check and still make issubclass() raise.
    if isinstance(cls, _WithArgsTypes):
      return False
    raise
def ensure_exec_coro(coro: t.Coroutine[t.Any, t.Any, t.Any]) -> t.Any:
  """Run ``coro`` to completion from synchronous code and return its result.

  Args:
    coro: The coroutine object to execute.

  Returns:
    Whatever ``coro`` returns.
  """
  loop = asyncio.get_event_loop()
  # If the loop is already running, submit the coroutine to it and block on the resulting future.
  # NOTE(review): blocking on .result() looks like it would deadlock when this is called from the
  # thread that is itself running `loop` -- confirm callers only hit this branch from another thread.
  if loop.is_running(): return asyncio.run_coroutine_threadsafe(coro, loop).result()
  # Otherwise drive the loop ourselves until the coroutine finishes.
  else: return loop.run_until_complete(coro)
def available_devices() -> tuple[str, ...]:
  """Return available GPU under system. Currently only supports NVIDIA GPUs.

  Returns:
    A tuple built from whatever ``NvidiaGpuResource.from_system()`` yields.
  """
  # Imported lazily, at call time rather than at module import.
  from openllm_core._strategies import NvidiaGpuResource
  detected = NvidiaGpuResource.from_system()
  return tuple(detected)
@functools.lru_cache(maxsize=128)
def generate_hash_from_file(f: str, algorithm: t.Literal["md5", "sha1"] = "sha1") -> str:
  """Derive a hex digest from a file's modification time (not from its contents).

  Results are memoised per ``(f, algorithm)`` pair by ``functools.lru_cache``, so a later change to the
  file's mtime is not reflected until the cache entry is evicted or cleared.

  Args:
    f: The file to generate the hash from.
    algorithm: Name of the ``hashlib`` constructor to use. Defaults to 'sha1' (similar to how Git generate its commit hash.)

  Returns:
    The hexadecimal digest of the stringified modification time.
  """
  hasher_factory = getattr(hashlib, algorithm)
  modified_at = os.path.getmtime(resolve_filepath(f))
  digest = hasher_factory(str(modified_at).encode())
  return digest.hexdigest()
@functools.lru_cache(maxsize=1)
def device_count() -> int:
  """Return how many devices ``available_devices()`` reports; the answer is cached after the first call."""
  devices = available_devices()
  return len(devices)
# Module-level alias of object.__setattr__, saving one attribute lookup per assignment.
_object_setattr = object.__setattr__
def non_intrusive_setattr(obj: t.Any, name: str, value: t.Any) -> None:
  """Set ``obj.name = value`` only when ``obj`` does not already expose ``name``.

  Args:
    obj: Target object; classes are assigned through ``setattr``, instances through ``object.__setattr__``.
    name: Attribute name.
    value: Value to assign when the attribute is absent.
  """
  if hasattr(obj, name):
    # Never overwrite an attribute that already exists.
    return
  if isinstance(obj, type):
    setattr(obj, name, value)
  else:
    _object_setattr(obj, name, value)
def field_env_key(model_name: str, key: str, suffix: str | None = None) -> str:
  """Build the upper-cased, underscore-joined environment variable name for a config field.

  Args:
    model_name: Model name segment.
    key: Field name segment.
    suffix: Optional middle segment; leading and trailing underscores are stripped.

  Returns:
    ``OPENLLM_<MODEL_NAME>[_<SUFFIX>]_<KEY>`` with empty segments omitted.
  """
  middle = suffix.strip("_") if suffix else ""
  segments = [str.upper(part) for part in ("OPENLLM", model_name, middle, key)]
  return "_".join(segment for segment in segments if segment)
# Special debug flag controlled via OPENLLMDEVDEBUG
DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and bool(os.environ.get(DEV_DEBUG_VAR)))
# MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins
MYPY = False
def _dev_debug_level() -> int:
  """Return the integer debug level stored in ``DEV_DEBUG_VAR``, or 0 when it is unset or not an integer."""
  # A value such as "true" still enables DEBUG above; it must not crash the import with a ValueError here.
  try: return int(os.environ.get(DEV_DEBUG_VAR, str(0)))
  except ValueError: return 0
# Generated code is only logged when DEBUG is on and the debug level is above 3.
SHOW_CODEGEN: bool = DEBUG and _dev_debug_level() > 3
def get_debug_mode() -> bool:
  """Return ``True`` when the module-level ``DEBUG`` flag is set, otherwise defer to BentoML's debug-mode getter."""
  if DEBUG:
    return True
  return _get_debug_mode()
def get_quiet_mode() -> bool:
  """Return ``False`` whenever ``DEBUG`` is set, otherwise defer to BentoML's quiet-mode getter."""
  if DEBUG:
    return False
  return _get_quiet_mode()
class ExceptionFilter(logging.Filter):
  """Logging filter that drops records carrying an exception from an exclusion list."""
  def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any):
    """A filter of all exception.

    Args:
      exclude_exceptions: Exception types whose log records should be dropped. ``ConflictError`` is always
        added. The list is copied, so the caller's object is never modified.
      **kwargs: Forwarded to ``logging.Filter.__init__``.
    """
    # Copy first: appending to the caller's list would leak ConflictError into their data.
    excluded = list(exclude_exceptions) if exclude_exceptions is not None else []
    if ConflictError not in excluded: excluded.append(ConflictError)
    super().__init__(**kwargs)
    self.EXCLUDE_EXCEPTIONS = excluded
  def filter(self, record: logging.LogRecord) -> bool:
    """Return ``False`` (drop the record) when its exception type is a subclass of an excluded type, else ``True``."""
    if record.exc_info:
      etype, _, _ = record.exc_info
      # issubclass accepts a tuple of classes, which replaces the manual loop.
      if etype is not None and issubclass(etype, tuple(self.EXCLUDE_EXCEPTIONS)): return False
    return True
class InfoFilter(logging.Filter):
  """Logging filter that only lets through records at INFO level or above but below WARNING."""
  def filter(self, record: logging.LogRecord) -> bool:
    """Return ``True`` when ``record.levelno`` lies in ``[logging.INFO, logging.WARNING)``."""
    level = record.levelno
    if level < logging.INFO:
      return False
    return level < logging.WARNING
# dictConfig-style logging configuration; configure_logging() adjusts the "level" entries in place before applying it.
_LOGGING_CONFIG: dict[str, t.Any] = {
  "version": 1, "disable_existing_loggers": True,
  # Filters are referenced by dotted path and instantiated by logging.config.
  "filters": {"excfilter": {"()": "openllm_core.utils.ExceptionFilter"}, "infofilter": {"()": "openllm_core.utils.InfoFilter"}},
  # "bentomlhandler" writes to stdout and passes records through both filters; "defaulthandler" takes WARNING and above.
  "handlers": {"bentomlhandler": {"class": "logging.StreamHandler", "filters": ["excfilter", "infofilter"], "stream": "ext://sys.stdout"}, "defaulthandler": {"class": "logging.StreamHandler", "level": logging.WARNING}},
  # Both named loggers use the same handler pair and do not propagate to the root logger.
  "loggers": {"bentoml": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}, "openllm": {"handlers": ["bentomlhandler", "defaulthandler"], "level": logging.INFO, "propagate": False}},
  "root": {"level": logging.WARNING},
}
def configure_logging() -> None:
  """Configure logging for OpenLLM.

  Behaves similar to how BentoML loggers are being configured: one level is chosen (ERROR in quiet mode,
  DEBUG in debug mode, INFO otherwise), written to the ``openllm``, ``bentoml`` and root entries of
  ``_LOGGING_CONFIG``, and the result is applied with ``logging.config.dictConfig``.
  """
  # Quiet mode is checked first, then debug mode.
  if get_quiet_mode():
    level = logging.ERROR
  elif get_debug_mode() or DEBUG:
    level = logging.DEBUG
  else:
    level = logging.INFO
  for logger_name in ("openllm", "bentoml"):
    _LOGGING_CONFIG["loggers"][logger_name]["level"] = level
  _LOGGING_CONFIG["root"]["level"] = level
  logging.config.dictConfig(_LOGGING_CONFIG)
@functools.lru_cache(maxsize=1)
def in_notebook() -> bool:
  """Report whether the current IPython shell's config contains ``IPKernelApp``.

  Returns ``False`` when IPython cannot be imported or when the shell object has no usable ``config``
  attribute. The result is cached after the first call.
  """
  try:
    from IPython.core.getipython import get_ipython
    if t.TYPE_CHECKING:
      from IPython.core.interactiveshell import InteractiveShell
    shell = t.cast(t.Callable[[], "InteractiveShell"], get_ipython)()
    shell_config = t.cast("dict[str, t.Any]", shell.config)
    return "IPKernelApp" in shell_config
  except (ImportError, AttributeError):
    return False
# Filesystem markers inspected by in_docker(): the /.dockerenv file and this process's cgroup listing.
_dockerenv, _cgroup = Path("/.dockerenv"), Path("/proc/self/cgroup")
class suppress(contextlib.suppress, contextlib.ContextDecorator):
  """A version of contextlib.suppress with decorator support.

  Combining ``contextlib.suppress`` with ``contextlib.ContextDecorator`` lets the same object be used
  either in a ``with`` statement or as a function decorator. A decorated function whose body raises a
  suppressed exception returns ``None``.

  >>> @suppress(KeyError)
  ... def key_error():
  ...     {}['']
  >>> key_error()
  """
def compose(*funcs: AnyCallable) -> AnyCallable:
  """Compose any number of unary functions into a single unary function.

  Functions are applied right to left: ``compose(f, g)(x)`` is ``f(g(x))``.

  >>> import textwrap
  >>> expected = str.strip(textwrap.dedent(compose.__doc__))
  >>> strip_and_dedent = compose(str.strip, textwrap.dedent)
  >>> strip_and_dedent(compose.__doc__) == expected
  True

  Compose also allows the innermost function to take arbitrary arguments.

  >>> round_three = lambda x: round(x, ndigits=3)
  >>> f = compose(round_three, int.__truediv__)
  >>> [f(3*x, x+1) for x in range(1,10)]
  [1.5, 2.0, 2.25, 2.4, 2.5, 2.571, 2.625, 2.667, 2.7]
  """
  def _chain(outer: AnyCallable, inner: AnyCallable) -> AnyCallable:
    # Only the innermost callable sees the caller's raw arguments; each outer one receives a single value.
    return lambda *args, **kwargs: outer(inner(*args, **kwargs))
  return functools.reduce(_chain, funcs)
def apply(transform: AnyCallable) -> t.Callable[[AnyCallable], AnyCallable]:
  """Build a decorator that pipes the decorated function's return value through ``transform``.

  The wrapper keeps the decorated function's metadata (name, docstring) via ``functools.wraps``.

  ```python
  @apply(reversed)
  def get_numbers(start):
      "doc for get_numbers"
      return range(start, start+3)
  list(get_numbers(4))
  # [6, 5, 4]
  ```
  ```python
  get_numbers.__doc__
  # 'doc for get_numbers'
  ```
  """
  def _decorator(func: AnyCallable) -> AnyCallable:
    piped = compose(transform, func)
    return functools.wraps(func)(piped)
  return _decorator
@apply(bool)
@suppress(FileNotFoundError)
def _text_in_file(text: str, filename: Path) -> bool:
  """Report whether any line of ``filename`` contains ``text``.

  A missing file is suppressed by the ``suppress`` decorator, which makes the function return ``None``;
  ``apply(bool)`` then turns that into ``False``.

  Args:
    text: Substring to look for.
    filename: Path of the file to scan line by line.
  """
  # Open through a context manager so the handle is closed deterministically, not left to the garbage collector.
  with filename.open() as handle:
    return any(text in line for line in handle)
def in_docker() -> bool:
  """Is this current environment running in docker?

  Returns ``True`` when ``/.dockerenv`` exists, or when the text "docker" appears in ``/proc/self/cgroup``.

  ```python
  type(in_docker())
  ```
  """
  if _dockerenv.exists():
    return True
  return _text_in_file("docker", _cgroup)
T, K = t.TypeVar("T"), t.TypeVar("K")
def resolve_filepath(path: str, ctx: str | None = None) -> str:
  """Resolve a file path via ``resolve_user_filepath``, falling back to the input when the file is missing.

  Args:
    path: The path to resolve.
    ctx: Optional context passed straight through to ``resolve_user_filepath``.

  Returns:
    The resolved path, or ``path`` unchanged when resolution raises ``FileNotFoundError``.
  """
  try:
    resolved = resolve_user_filepath(path, ctx)
  except FileNotFoundError:
    return path
  return resolved
def validate_is_path(maybe_path: str) -> bool:
  """Return whether the parent directory of the (resolved) ``maybe_path`` exists on disk."""
  resolved = resolve_filepath(maybe_path)
  parent_dir = os.path.dirname(resolved)
  return os.path.exists(parent_dir)
def generate_context(framework_name: str) -> _ModelContext:
  """Build a ``_ModelContext`` recording the versions of the installed ML frameworks.

  ``transformers`` is always recorded; ``torch``, ``tensorflow`` and the ``flax``/``jax``/``jaxlib`` trio are
  added only when the matching ``openllm_core.utils.is_*_available()`` check passes.

  Args:
    framework_name: Stored as the context's ``framework_name``.
  """
  versions = {"transformers": pkg.get_pkg_version("transformers")}
  if openllm_core.utils.is_torch_available():
    versions["torch"] = pkg.get_pkg_version("torch")
  if openllm_core.utils.is_tf_available():
    from bentoml._internal.frameworks.utils.tensorflow import get_tf_version
    versions["tensorflow"] = get_tf_version()
  if openllm_core.utils.is_flax_available():
    for package in ("flax", "jax", "jaxlib"):
      versions[package] = pkg.get_pkg_version(package)
  return _ModelContext(framework_name=framework_name, framework_versions=versions)
_TOKENIZER_PREFIX = "_tokenizer_"
def normalize_attrs_to_model_tokenizer_pair(**attrs: t.Any) -> tuple[dict[str, t.Any], dict[str, t.Any]]:
  """Split keyword arguments into model kwargs and tokenizer kwargs.

  Keys starting with ``_tokenizer_`` go to the tokenizer dict with that prefix removed; all other keys
  stay in the model dict.

  Returns:
    A ``(model_attrs, tokenizer_attrs)`` pair.
  """
  tokenizer_attrs: dict[str, t.Any] = {}
  prefix_length = len(_TOKENIZER_PREFIX)
  # Iterate over a snapshot of the keys because entries are popped during the loop.
  for key in tuple(attrs):
    if key.startswith(_TOKENIZER_PREFIX):
      tokenizer_attrs[key[prefix_length:]] = attrs.pop(key)
  return attrs, tokenizer_attrs
# NOTE: Names of imported modules that should still be exported through
# _extras, even though module objects are otherwise filtered out
# of the extra_objects map built below.
_whitelist_modules = {"pkg"}
# XXX: define all classes, functions import above this line
# since _extras will be the locals() import from this file.
# Snapshot of every name defined so far that is either whitelisted above, or is neither a module nor underscore-prefixed.
_extras: dict[str, t.Any] = {k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith("_"))}
# NOTE(review): presumably maps a legacy attribute name to its replacement for the lazy module -- confirm against LazyModule.
_extras["__openllm_migration__"] = {"ModelEnv": "EnvVarMixin"}
_import_structure: dict[str, list[str]] = {
"analytics": [], "codegen": [], "dantic": [], "representation": ["ReprMixin"], "lazy": ["LazyModule"],
"import_utils": ["OPTIONAL_DEPENDENCIES", "ENV_VARS_TRUE_VALUES", "DummyMetaclass", "EnvVarMixin", "require_backends",
"is_cpm_kernels_available", "is_einops_available", "is_flax_available", "is_tf_available", "is_vllm_available", "is_torch_available", "is_bitsandbytes_available", "is_peft_available", "is_datasets_available",
"is_transformers_supports_kbit", "is_transformers_supports_agent", "is_jupyter_available", "is_jupytext_available", "is_notebook_available", "is_triton_available", "is_autogptq_available", "is_sentencepiece_available",
"is_xformers_available", "is_fairscale_available", "is_grpc_available", "is_grpc_health_available", "is_transformers_available"]}
if t.TYPE_CHECKING:
# NOTE: The following exports useful utils from bentoml
from . import (
analytics as analytics,
codegen as codegen,
dantic as dantic,
)
from .import_utils import (
ENV_VARS_TRUE_VALUES as ENV_VARS_TRUE_VALUES,
OPTIONAL_DEPENDENCIES as OPTIONAL_DEPENDENCIES,
DummyMetaclass as DummyMetaclass,
EnvVarMixin as EnvVarMixin,
is_autogptq_available as is_autogptq_available,
is_bitsandbytes_available as is_bitsandbytes_available,
is_cpm_kernels_available as is_cpm_kernels_available,
is_datasets_available as is_datasets_available,
is_einops_available as is_einops_available,
is_fairscale_available as is_fairscale_available,
is_flax_available as is_flax_available,
is_jupyter_available as is_jupyter_available,
is_jupytext_available as is_jupytext_available,
is_notebook_available as is_notebook_available,
is_peft_available as is_peft_available,
is_sentencepiece_available as is_sentencepiece_available,
is_tf_available as is_tf_available,
is_torch_available as is_torch_available,
is_transformers_supports_agent as is_transformers_supports_agent,
is_transformers_supports_kbit as is_transformers_supports_kbit,
is_triton_available as is_triton_available,
is_vllm_available as is_vllm_available,
is_xformers_available as is_xformers_available,
is_grpc_available as is_grpc_available,
is_grpc_health_available as is_grpc_health_available,
is_transformers_available as is_transformers_available,
require_backends as require_backends,
)
from .representation import ReprMixin as ReprMixin
# Build a LazyModule for this package from the import structure and the eagerly defined extras,
# then delegate the module-level __all__, __dir__ and __getattr__ hooks to it.
__lazy = LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects=_extras)
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__

View File

@@ -3,10 +3,9 @@
Users can disable this with OPENLLM_DO_NOT_TRACK envvar.
"""
from __future__ import annotations
import contextlib, functools, logging, os, re, typing as t, importlib.metadata
import attr, openllm
import contextlib, functools, logging, os, re, typing as t, importlib.metadata, attr, openllm_core
from bentoml._internal.utils import analytics as _internal_analytics
from openllm._typing_compat import ParamSpec
from openllm_core._typing_compat import ParamSpec
P = ParamSpec("P")
T = t.TypeVar("T")
@@ -17,7 +16,7 @@ OPENLLM_DO_NOT_TRACK = "OPENLLM_DO_NOT_TRACK"
DO_NOT_TRACK = os.environ.get(OPENLLM_DO_NOT_TRACK, str(False)).upper()
@functools.lru_cache(maxsize=1)
def do_not_track() -> bool: return DO_NOT_TRACK in openllm.utils.ENV_VARS_TRUE_VALUES
def do_not_track() -> bool: return DO_NOT_TRACK in openllm_core.utils.ENV_VARS_TRUE_VALUES
@functools.lru_cache(maxsize=1)
def _usage_event_debugging() -> bool: return os.environ.get("__BENTOML_DEBUG_USAGE", str(False)).lower() == "true"
@@ -27,7 +26,7 @@ def silent(func: t.Callable[P, T]) -> t.Callable[P, T]:
try: return func(*args, **kwargs)
except Exception as err:
if _usage_event_debugging():
if openllm.utils.get_debug_mode(): logger.error("Tracking Error: %s", err, stack_info=True, stacklevel=3)
if openllm_core.utils.get_debug_mode(): logger.error("Tracking Error: %s", err, stack_info=True, stacklevel=3)
else: logger.info("Tracking Error: %s", err)
else: logger.debug("Tracking Error: %s", err)
return wrapper
@@ -73,8 +72,8 @@ class StartInitEvent(EventMeta):
model_name: str
llm_config: t.Dict[str, t.Any] = attr.field(default=None)
@staticmethod
def handler(llm_config: openllm.LLMConfig) -> StartInitEvent: return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump())
def handler(llm_config: openllm_core.LLMConfig) -> StartInitEvent: return StartInitEvent(model_name=llm_config["model_name"], llm_config=llm_config.model_dump())
def track_start_init(llm_config: openllm.LLMConfig) -> None:
def track_start_init(llm_config: openllm_core.LLMConfig) -> None:
if do_not_track(): return
track(StartInitEvent.handler(llm_config))

View File

@@ -1,49 +1,14 @@
from __future__ import annotations
import functools, inspect, linecache, os, logging, string, types, typing as t
import functools, inspect, linecache, logging, types, typing as t, orjson
from operator import itemgetter
from pathlib import Path
import orjson
if t.TYPE_CHECKING:
from fs.base import FS
import openllm
from openllm._typing_compat import LiteralString, AnyCallable, DictStrAny, ListStr
import openllm_core
from openllm_core._typing_compat import LiteralString, AnyCallable, DictStrAny, ListStr
PartialAny = functools.partial[t.Any]
_T = t.TypeVar("_T", bound=t.Callable[..., t.Any])
logger = logging.getLogger(__name__)
OPENLLM_MODEL_NAME = "# openllm: model name"
OPENLLM_MODEL_ADAPTER_MAP = "# openllm: model adapter map"
class ModelNameFormatter(string.Formatter):
model_keyword: LiteralString = "__model_name__"
def __init__(self, model_name: str):
"""The formatter that extends model_name to be formatted the 'service.py'."""
super().__init__()
self.model_name = model_name
def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any: return super().vformat(format_string, (), {self.model_keyword: self.model_name})
def can_format(self, value: str) -> bool:
try:
self.parse(value)
return True
except ValueError: return False
class ModelIdFormatter(ModelNameFormatter):
model_keyword: LiteralString = "__model_id__"
class ModelAdapterMapFormatter(ModelNameFormatter):
model_keyword: LiteralString = "__model_adapter_map__"
_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
from openllm.utils import DEBUG
model_name = llm.config["model_name"]
logger.debug("Generating service file for %s at %s (dir=%s)", model_name, llm.config["service_name"], llm_fs.getsyspath("/"))
with open(_service_file.__fspath__(), "r") as f: src_contents = f.readlines()
for it in src_contents:
if OPENLLM_MODEL_NAME in it: src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n")
elif OPENLLM_MODEL_ADAPTER_MAP in it: src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n")
script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents)
if DEBUG: logger.info("Generated script:\n%s", script)
llm_fs.writetext(llm.config["service_name"], script)
# sentinel object for unequivocal object() getattr
_sentinel = object()
@@ -72,7 +37,7 @@ def add_method_dunders(cls: type[t.Any], method_or_cls: _T, _overwrite_doc: str
try: method_or_cls.__doc__ = _overwrite_doc or "Generated by ``openllm.LLMConfig`` for class " f"{cls.__qualname__}."
except AttributeError: pass
return method_or_cls
def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = "") -> None: eval(compile(script, filename, "exec"), globs, locs) # noqa: S307
def _compile_and_eval(script: str, globs: DictStrAny, locs: t.Any = None, filename: str = "") -> None: eval(compile(script, filename, "exec"), globs, locs)
def _make_method(name: str, script: str, filename: str, globs: DictStrAny) -> AnyCallable:
locs: DictStrAny = {}
# In order of debuggers like PDB being able to step through the code, we add a fake linecache entry.
@@ -111,15 +76,15 @@ def make_attr_tuple_class(cls_name: str, attr_names: t.Sequence[str]) -> type[t.
def generate_unique_filename(cls: type[t.Any], func_name: str) -> str: return f"<{cls.__name__} generated {func_name} {cls.__module__}.{getattr(cls, '__qualname__', cls.__name__)}>"
def generate_function(typ: type[t.Any], func_name: str, lines: list[str] | None, args: tuple[str, ...] | None, globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None) -> AnyCallable:
from openllm.utils import SHOW_CODEGEN
from openllm_core.utils import SHOW_CODEGEN
script = "def %s(%s):\n %s\n" % (func_name, ", ".join(args) if args is not None else "", "\n ".join(lines) if lines else "pass")
meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs)
if annotations: meth.__annotations__ = annotations
if SHOW_CODEGEN: logger.info("Generated script for %s:\n\n%s", typ, script)
return meth
def make_env_transformer(cls: type[openllm.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,) -> AnyCallable:
from openllm.utils import dantic, field_env_key
def make_env_transformer(cls: type[openllm_core.LLMConfig], model_name: str, suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None,) -> AnyCallable:
from openllm_core.utils import dantic, field_env_key
def identity(_: str, x_value: t.Any) -> t.Any: return x_value
default_callback = identity if default_callback is None else default_callback
globs = {} if globs is None else globs
@@ -129,13 +94,13 @@ def make_env_transformer(cls: type[openllm.LLMConfig], model_name: str, suffix:
return generate_function(cls, "__auto_env", lines, args=("_", "fields"), globs=globs, annotations={"_": "type[LLMConfig]", "fields": fields_ann, "return": fields_ann})
def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T:
"""Enhance sdk with nice repr that plays well with your brain."""
from openllm.utils import ReprMixin
from openllm_core.utils import ReprMixin
if name is None: name = func.__name__.strip("_")
_signatures = inspect.signature(func).parameters
def _repr(self: ReprMixin) -> str: return f"<generated function {name} {orjson.dumps(dict(self.__repr_args__()), option=orjson.OPT_NON_STR_KEYS | orjson.OPT_INDENT_2).decode()}>"
def _repr_args(self: ReprMixin) -> t.Iterator[t.Tuple[str, t.Any]]: return ((k, _signatures[k].annotation) for k in self.__repr_keys__)
if func.__doc__ is None: doc = f"Generated SDK for {func.__name__}"
else: doc = func.__doc__
return t.cast(_T, functools.update_wrapper(types.new_class(name, (t.cast("PartialAny", functools.partial), ReprMixin), exec_body=lambda ns: ns.update({"__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), "__repr_args__": _repr_args, "__repr__": _repr, "__doc__": inspect.cleandoc(doc), "__module__": "openllm",}),)(func, **attrs), func,))
return t.cast(_T, functools.update_wrapper(types.new_class(name, (t.cast("PartialAny", functools.partial), ReprMixin), exec_body=lambda ns: ns.update({"__repr_keys__": property(lambda _: [i for i in _signatures.keys() if not i.startswith("_")]), "__repr_args__": _repr_args, "__repr__": _repr, "__doc__": inspect.cleandoc(doc), "__module__": "openllm"}),)(func, **attrs), func,))
__all__ = ["gen_sdk", "make_attr_tuple_class", "make_env_transformer", "generate_unique_filename", "generate_function", "OPENLLM_MODEL_NAME", "OPENLLM_MODEL_ADAPTER_MAP"]
__all__ = ["gen_sdk", "make_attr_tuple_class", "make_env_transformer", "generate_unique_filename", "generate_function"]

View File

@@ -335,7 +335,6 @@ class CudaValueType(ParamType):
name = "cuda"
envvar_list_splitter = ","
is_composite = True
typ = click_types.convert_type(str)
def split_envvar_value(self, rv: str) -> t.Sequence[str]:
var = tuple(i for i in rv.split(self.envvar_list_splitter))
@@ -352,10 +351,11 @@ class CudaValueType(ParamType):
param: The parameter that is requesting completion.
incomplete: Value being completed. May be empty.
"""
from openllm.utils import available_devices
from openllm_core.utils import available_devices
mapping = incomplete.split(self.envvar_list_splitter) if incomplete else available_devices()
return [sc.CompletionItem(str(i), help=f"CUDA device index {i}") for i in mapping]
def convert(self, value: t.Any, param: click.Parameter | None, ctx: click.Context | None) -> t.Any:
typ = click_types.convert_type(str)
if isinstance(value, bytes):
enc = _get_argv_encoding()
try: value = value.decode(enc)
@@ -365,7 +365,7 @@ class CudaValueType(ParamType):
try: value = value.decode(fs_enc)
except UnicodeError: value = value.decode("utf-8", "replace")
else: value = value.decode("utf-8", "replace")
return tuple(self.typ(x, param, ctx) for x in value.split(","))
return tuple(typ(x, param, ctx) for x in value.split(","))
def __repr__(self) -> str: return "STRING"

View File

@@ -1,19 +1,19 @@
"""Some imports utils are vendorred from transformers/utils/import_utils.py for performance reasons."""
from __future__ import annotations
import importlib, importlib.metadata, importlib.util, logging, os, abc, typing as t
import importlib, importlib.metadata, importlib.util, logging, os, abc, typing as t, openllm_core
from collections import OrderedDict
import inflection, packaging.version
from bentoml._internal.utils import LazyLoader, pkg
from openllm._typing_compat import overload, LiteralString
from openllm_core._typing_compat import overload, LiteralString
from .representation import ReprMixin
if t.TYPE_CHECKING:
BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]]
from openllm._typing_compat import LiteralRuntime
from openllm_core._typing_compat import LiteralRuntime
logger = logging.getLogger(__name__)
OPTIONAL_DEPENDENCIES = {"opt", "flan-t5", "vllm", "fine-tune", "ggml", "agents", "openai", "playground", "gptq",}
OPTIONAL_DEPENDENCIES = {"opt", "flan-t5", "vllm", "fine-tune", "ggml", "agents", "openai", "playground", "gptq"}
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
@@ -32,6 +32,9 @@ _torch_available = importlib.util.find_spec("torch") is not None
_tf_available = importlib.util.find_spec("tensorflow") is not None
_flax_available = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("flax") is not None
_vllm_available = importlib.util.find_spec("vllm") is not None
_transformers_available = _is_package_available("transformers")
_grpc_available = importlib.util.find_spec("grpc") is not None
_grpc_health_available = importlib.util.find_spec("grpc_health") is not None
_peft_available = _is_package_available("peft")
_einops_available = _is_package_available("einops")
_cpm_kernel_available = _is_package_available("cpm_kernels")
@@ -46,6 +49,9 @@ _sentencepiece_available = _is_package_available("sentencepiece")
_xformers_available = _is_package_available("xformers")
_fairscale_available = _is_package_available("fairscale")
def is_transformers_available() -> bool: return _transformers_available
def is_grpc_available() -> bool: return _grpc_available
def is_grpc_health_available() -> bool: return _grpc_health_available
def is_transformers_supports_kbit() -> bool: return pkg.pkg_version_info("transformers")[:2] >= (4, 30)
def is_transformers_supports_agent() -> bool: return pkg.pkg_version_info("transformers")[:2] >= (4, 29)
def is_jupyter_available() -> bool: return _jupyter_available
@@ -86,7 +92,7 @@ def is_tf_available() -> bool:
try:
_tf_version = importlib.metadata.version(_pkg)
break
except importlib.metadata.PackageNotFoundError: pass # noqa: PERF203 # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
except importlib.metadata.PackageNotFoundError: pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
_tf_available = _tf_version is not None
if _tf_available:
if _tf_version and packaging.version.parse(_tf_version) < packaging.version.parse("2"):
@@ -281,7 +287,7 @@ class EnvVarMixin(ReprMixin):
raise KeyError(f"Key {item} not found in {self}")
def __init__(self, model_name: str, implementation: LiteralRuntime = "pt", model_id: str | None = None, bettertransformer: bool | None = None, quantize: LiteralString | None = None, runtime: t.Literal["ggml", "transformers"] = "transformers") -> None:
"""EnvVarMixin is a mixin class that returns the value extracted from environment variables."""
from openllm._configuration import field_env_key
from openllm_core.utils import field_env_key
self.model_name = inflection.underscore(model_name)
self._implementation = implementation
self._model_id = model_id
@@ -307,6 +313,6 @@ class EnvVarMixin(ReprMixin):
@property
def __repr_keys__(self) -> set[str]: return {"config", "model_id", "quantize", "framework", "bettertransformer", "runtime"}
@property
def start_docstring(self) -> str: return getattr(self.module, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
def start_docstring(self) -> str: return getattr(openllm_core.config, f"START_{self.model_name.upper()}_COMMAND_DOCSTRING")
@property
def module(self) -> LazyLoader: return LazyLoader(self.model_name, globals(), f"openllm.models.{self.model_name}")

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import functools, importlib, importlib.machinery, importlib.metadata, importlib.util, itertools, os, time, types, warnings, typing as t
import attr, openllm
import functools, importlib, importlib.machinery, importlib.metadata, importlib.util, itertools, os, time, types, warnings, typing as t, attr, openllm_core
__all__ = ["VersionInfo", "LazyModule"]
# vendorred from attrs
@@ -75,7 +74,7 @@ class LazyModule(types.ModuleType):
It also contains a special case for all of the metadata information, such as __version__ and __version_info__.
"""
if name in _reserved_namespace: raise openllm.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.")
if name in _reserved_namespace: raise openllm_core.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.")
dunder_to_metadata = {"__title__": "Name", "__copyright__": "", "__version__": "version", "__version_info__": "version", "__description__": "summary", "__uri__": "", "__url__": "", "__author__": "", "__email__": "", "__license__": "license", "__homepage__": ""}
if name in dunder_to_metadata:
if name not in {"__version_info__", "__copyright__", "__version__"}: warnings.warn(f"Accessing '{self._name}.{name}' is deprecated. Please consider using 'importlib.metadata' directly to query for openllm packaging metadata.", DeprecationWarning, stacklevel=2)

View File

@@ -1,9 +1,8 @@
from __future__ import annotations
import typing as t
from abc import abstractmethod
import attr, orjson
from openllm import utils
if t.TYPE_CHECKING: from openllm._typing_compat import TypeAlias
import attr, orjson, typing as t
from openllm_core import utils
if t.TYPE_CHECKING: from openllm_core._typing_compat import TypeAlias
ReprArgs: TypeAlias = t.Generator[t.Tuple[t.Optional[str], t.Any], None, None]
class ReprMixin:

View File

@@ -8,21 +8,24 @@ environment by referring to our
## Procedure
All the relevant code for incorporating a new model resides within
[`src/openllm/models`](./src/openllm/models/) `model_name` in snake_case.
[`$GIT_ROOT/openllm-python/src/openllm/models`](./src/openllm/models/) `model_name` in snake_case.
Here's your roadmap:
- [ ] Generate model configuration file:
`src/openllm/models/{model_name}/configuration_{model_name}.py`
`$GIT_ROOT/openllm-core/src/openllm_core/config/configuration_{model_name}.py`
- [ ] Establish model implementation files:
`src/openllm/models/{model_name}/modeling_{runtime}_{model_name}.py`
`$GIT_ROOT/openllm-python/src/openllm/models/{model_name}/modeling_{runtime}_{model_name}.py`
- [ ] Create module's `__init__.py`:
`src/openllm/models/{model_name}/__init__.py`
- [ ] Adjust the entrypoints for files at `src/openllm/models/auto/*` If it is a
new runtime, then add it a `src/openllm/models/auto/modeling_{runtime}_auto.py`.
`$GIT_ROOT/openllm-python/src/openllm/models/{model_name}/__init__.py`
- [ ] Adjust the entrypoints for files at `$GIT_ROOT/openllm-python/src/openllm/models/auto/*`. If it is a
new runtime, then add it at `$GIT_ROOT/openllm-python/src/openllm/models/auto/modeling_{runtime}_auto.py`.
See the other auto runtime for example.
- [ ] Modify the main `__init__.py`: `src/openllm/models/__init__.py`
- [ ] Run the following script: `$GIT_ROOT/tools/update-models-import.py`
- [ ] Run the following to update stubs: `hatch run check-stubs`
> [!NOTE]
> `$GIT_ROOT` refers to `$(git rev-parse --show-toplevel)`
For a working example, check out any existing model.
### Model Configuration

View File

@@ -5,7 +5,6 @@ requires = [
"hatchling==1.18.0",
"hatch-vcs==0.3.0",
"hatch-fancy-pypi-readme==23.1.0",
"hatch-mypyc==0.16.0",
]
[project]
@@ -39,20 +38,14 @@ classifiers = [
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"bentoml[grpc,io]>=1.0.25",
"bentoml[grpc,io]>=1.1.2",
"transformers[torch,tokenizers,accelerate]>=4.29.0",
"openllm-client",
"safetensors",
"optimum",
"attrs>=23.1.0",
"cattrs>=23.1.0",
"orjson",
"inflection",
"tabulate[widechars]>=0.9.0",
"httpx",
"click>=8.1.3",
"typing_extensions",
"mypy_extensions",
"ghapi",
"tabulate[widechars]>=0.9.0",
"click>=8.1.3",
"cuda-python;platform_system!=\"Darwin\"",
"bitsandbytes<0.42",
]
@@ -102,34 +95,21 @@ Twitter = "https://twitter.com/bentomlai"
[project.optional-dependencies]
agents = ["transformers[agents]>=4.30", "diffusers", "soundfile"]
all = [
"openllm[agents]",
"openllm[baichuan]",
"openllm[chatglm]",
"openllm[falcon]",
"openllm[fine-tune]",
"openllm[flan-t5]",
"openllm[ggml]",
"openllm[gptq]",
"openllm[llama]",
"openllm[mpt]",
"openllm[openai]",
"openllm[opt]",
"openllm[playground]",
"openllm[starcoder]",
"openllm[vllm]",
]
all = ["openllm[full]"]
baichuan = ["cpm-kernels", "sentencepiece"]
chatglm = ["cpm-kernels", "sentencepiece"]
falcon = ["einops", "xformers"]
fine-tune = ["peft>=0.4.0", "bitsandbytes", "datasets", "accelerate", "trl"]
flan-t5 = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
flan-t5 = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
full = [
"openllm[agents,baichuan,chatglm,falcon,fine-tune,flan-t5,ggml,gptq,llama,mpt,openai,opt,playground,starcoder,vllm]",
]
ggml = ["ctransformers"]
gptq = ["auto-gptq[triton]"]
llama = ["fairscale", "sentencepiece"]
mpt = ["triton", "einops"]
openai = ["openai", "tiktoken"]
opt = ["flax", "jax", "jaxlib", "tensorflow", "keras"]
opt = ["flax>=0.7", "jax", "jaxlib", "tensorflow", "keras"]
playground = ["jupyter", "notebook", "ipython", "jupytext", "nbformat"]
starcoder = ["bitsandbytes"]
vllm = ["vllm", "ray"]
@@ -165,10 +145,10 @@ exclude = [
[tool.hatch.build.targets.wheel.hooks.mypyc]
dependencies = [
"hatch-mypyc==0.16.0",
"mypy==1.4.1",
"mypy==1.5.1",
# avoid https://github.com/pallets/click/issues/2558
"click==8.1.3",
"bentoml==1.1.1",
"bentoml==1.1.2",
"transformers>=4.31.0",
"pandas-stubs",
"types-psutil",
@@ -177,19 +157,14 @@ dependencies = [
"types-protobuf",
]
enable-by-default = false
exclude = ["src/openllm/_service.py", "src/openllm/_typing_compat.py"]
exclude = ["src/openllm/_service.py", "src/openllm/utils/__init__.py"]
include = [
"src/openllm/models/__init__.py",
"src/openllm/models/auto/__init__.py",
"src/openllm/utils/__init__.py",
"src/openllm/__init__.py",
"src/openllm/_prompt.py",
"src/openllm/_schema.py",
"src/openllm/_quantisation.py",
"src/openllm/_generation.py",
"src/openllm/_strategies.py",
"src/openllm/exceptions.py",
"src/openllm/testing.py",
"src/openllm/utils",
]
# NOTE: This is consistent with pyproject.toml
mypy-args = [
@@ -204,7 +179,6 @@ mypy-args = [
"--no-warn-no-return",
"--no-warn-unused-ignores",
"--exclude='/src\\/openllm\\/playground\\/**'",
"--exclude='/src\\/openllm\\/_typing_compat\\.py$'",
]
options = { verbose = true, strip_asserts = true, debug_level = "2", opt_level = "3", include_runtime_files = true }
require-runtime-dependencies = true

View File

@@ -9,13 +9,18 @@ deploy, and monitor any LLMs with ease.
* Native integration with BentoML and LangChain for custom LLM apps
"""
from __future__ import annotations
import logging as _logging, os as _os, typing as _t, warnings as _warnings
import logging as _logging, os as _os, typing as _t, warnings as _warnings, openllm_core
from pathlib import Path as _Path
from . import exceptions as exceptions, utils as utils
if utils.DEBUG:
utils.set_debug_mode(True)
utils.set_quiet_mode(False)
from openllm_core._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
from openllm_core._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
from openllm_core._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES, BaichuanConfig as BaichuanConfig, ChatGLMConfig as ChatGLMConfig, DollyV2Config as DollyV2Config, FalconConfig as FalconConfig, FlanT5Config as FlanT5Config, GPTNeoXConfig as GPTNeoXConfig, LlamaConfig as LlamaConfig, MPTConfig as MPTConfig, OPTConfig as OPTConfig, StableLMConfig as StableLMConfig, StarCoderConfig as StarCoderConfig
if openllm_core.utils.DEBUG:
openllm_core.utils.set_debug_mode(True)
openllm_core.utils.set_quiet_mode(False)
_logging.basicConfig(level=_logging.NOTSET)
else:
# configuration for bitsandbytes before import
@@ -28,40 +33,26 @@ else:
_warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated")
_import_structure: dict[str, list[str]] = {
"exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [], "utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"],
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_configuration": ["LLMConfig", "GenerationConfig", "SamplingParams"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
"_quantisation": ["infer_quantisation_config"], "_schema": ["GenerationInput", "GenerationOutput", "MetadataOutput", "EmbeddingsOutput", "unmarshal_vllm_outputs", "HfAgentInput"], "_embeddings": ["GenericEmbeddingRunnable"], "_strategies": ["CascadingResourceStrategy", "get_resource"],
"models.auto": ["AutoConfig", "CONFIG_MAPPING", "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": ["ChatGLMConfig"], "models.baichuan": ["BaichuanConfig"], "models.dolly_v2": ["DollyV2Config"], "models.falcon": ["FalconConfig"], "models.flan_t5": ["FlanT5Config"], "models.gpt_neox": ["GPTNeoXConfig"], "models.llama": ["LlamaConfig"], "models.mpt": ["MPTConfig"], "models.opt": ["OPTConfig"], "models.stablelm": ["StableLMConfig"], "models.starcoder": ["StarCoderConfig"]
"exceptions": [], "models": [], "client": [], "bundle": [], "playground": [], "testing": [],
"utils": ["infer_auto_class"], "serialisation": ["ggml", "transformers"], "cli._sdk": ["start", "start_grpc", "build", "import_model", "list_models"], "_quantisation": ["infer_quantisation_config"], "_embeddings": ["GenericEmbeddingRunnable"],
"_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "LLMEmbeddings"], "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"],
"models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": [], "models.baichuan": [], "models.dolly_v2": [], "models.falcon": [], "models.flan_t5": [], "models.gpt_neox": [], "models.llama": [], "models.mpt": [], "models.opt": [], "models.stablelm": [], "models.starcoder": []
}
COMPILED = _Path(__file__).suffix in (".pyd", ".so")
if _t.TYPE_CHECKING:
from . import bundle as bundle, cli as cli, client as client, models as models, playground as playground, serialisation as serialisation, testing as testing
from ._configuration import GenerationConfig as GenerationConfig, LLMConfig as LLMConfig, SamplingParams as SamplingParams
from ._generation import LogitsProcessorList as LogitsProcessorList, StopOnTokens as StopOnTokens, StoppingCriteriaList as StoppingCriteriaList, StopSequenceCriteria as StopSequenceCriteria, prepare_logits_processor as prepare_logits_processor
from ._llm import LLM as LLM, LLMEmbeddings as LLMEmbeddings, LLMRunnable as LLMRunnable, LLMRunner as LLMRunner, Runner as Runner
from ._quantisation import infer_quantisation_config as infer_quantisation_config
from ._schema import EmbeddingsOutput as EmbeddingsOutput, GenerationInput as GenerationInput, GenerationOutput as GenerationOutput, HfAgentInput as HfAgentInput, MetadataOutput as MetadataOutput, unmarshal_vllm_outputs as unmarshal_vllm_outputs
from ._embeddings import GenericEmbeddingRunnable as GenericEmbeddingRunnable
from ._strategies import CascadingResourceStrategy as CascadingResourceStrategy, get_resource as get_resource
from .cli._sdk import build as build, import_model as import_model, list_models as list_models, start as start, start_grpc as start_grpc
from .models.auto import CONFIG_MAPPING as CONFIG_MAPPING, MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES, AutoConfig as AutoConfig
from .models.baichuan import BaichuanConfig as BaichuanConfig
from .models.chatglm import ChatGLMConfig as ChatGLMConfig
from .models.dolly_v2 import DollyV2Config as DollyV2Config
from .models.falcon import FalconConfig as FalconConfig
from .models.flan_t5 import FlanT5Config as FlanT5Config
from .models.gpt_neox import GPTNeoXConfig as GPTNeoXConfig
from .models.llama import LlamaConfig as LlamaConfig
from .models.mpt import MPTConfig as MPTConfig
from .models.opt import OPTConfig as OPTConfig
from .models.stablelm import StableLMConfig as StableLMConfig
from .models.starcoder import StarCoderConfig as StarCoderConfig
from .models.auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES, MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES as MODEL_VLLM_MAPPING_NAMES
from .serialisation import ggml as ggml, transformers as transformers
from .utils import infer_auto_class as infer_auto_class
try:
if not (utils.is_torch_available() and utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_cpm_kernels_available()): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_pt_objects"] = ["ChatGLM", "Baichuan"]
else:
@@ -71,7 +62,7 @@ else:
from .models.baichuan import Baichuan as Baichuan
from .models.chatglm import ChatGLM as ChatGLM
try:
if not (utils.is_torch_available() and utils.is_triton_available()): raise exceptions.MissingDependencyError
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_triton_available()): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["MPT"])
else: _import_structure["utils.dummy_pt_objects"] = ["MPT"]
@@ -79,7 +70,7 @@ else:
_import_structure["models.mpt"].extend(["MPT"])
if _t.TYPE_CHECKING: from .models.mpt import MPT as MPT
try:
if not (utils.is_torch_available() and utils.is_einops_available()): raise exceptions.MissingDependencyError
if not (openllm_core.utils.is_torch_available() and openllm_core.utils.is_einops_available()): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
if "utils.dummy_pt_objects" in _import_structure: _import_structure["utils.dummy_pt_objects"].extend(["Falcon"])
else: _import_structure["utils.dummy_pt_objects"] = ["Falcon"]
@@ -88,7 +79,7 @@ else:
if _t.TYPE_CHECKING: from .models.falcon import Falcon as Falcon
try:
if not utils.is_torch_available(): raise exceptions.MissingDependencyError
if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_pt_objects"] = [name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations")]
else:
@@ -110,7 +101,7 @@ else:
from .models.stablelm import StableLM as StableLM
from .models.starcoder import StarCoder as StarCoder
try:
if not utils.is_vllm_available(): raise exceptions.MissingDependencyError
if not openllm_core.utils.is_vllm_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_vllm_objects"] = [name for name in dir(utils.dummy_vllm_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
@@ -136,7 +127,7 @@ else:
from .models.stablelm import VLLMStableLM as VLLMStableLM
from .models.starcoder import VLLMStarCoder as VLLMStarCoder
try:
if not utils.is_flax_available(): raise exceptions.MissingDependencyError
if not openllm_core.utils.is_flax_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_flax_objects"] = [name for name in dir(utils.dummy_flax_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
@@ -148,7 +139,7 @@ else:
from .models.flan_t5 import FlaxFlanT5 as FlaxFlanT5
from .models.opt import FlaxOPT as FlaxOPT
try:
if not utils.is_tf_available(): raise exceptions.MissingDependencyError
if not openllm_core.utils.is_tf_available(): raise exceptions.MissingDependencyError
except exceptions.MissingDependencyError:
_import_structure["utils.dummy_tf_objects"] = [name for name in dir(utils.dummy_tf_objects) if not name.startswith("_") and name not in ("annotations",)]
else:
@@ -161,7 +152,7 @@ else:
from .models.opt import TFOPT as TFOPT
# NOTE: update this to sys.modules[__name__] once mypy_extensions can recognize __spec__
__lazy = utils.LazyModule(__name__, _os.path.abspath("__file__"), _import_structure, extra_objects={"COMPILED": COMPILED})
__lazy = openllm_core.utils.LazyModule(__name__, globals()["__file__"], _import_structure, extra_objects={"COMPILED": COMPILED})
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__

View File

@@ -1,22 +1,21 @@
from __future__ import annotations
import functools, inspect, logging, os, re, traceback, types, typing as t, uuid
import functools, inspect, logging, os, re, traceback, types, typing as t, uuid, attr, fs.path, inflection, orjson, bentoml, openllm, openllm_core, gc
from abc import ABC, abstractmethod
from pathlib import Path
import attr, fs.path, inflection, orjson, bentoml, openllm, gc
from huggingface_hub import hf_hub_download
from bentoml._internal.models.model import ModelSignature
from ._configuration import (
from openllm_core._configuration import (
FineTuneConfig,
LLMConfig,
_object_getattribute,
_setattr_class,
)
from ._quantisation import infer_quantisation_config
from ._schema import unmarshal_vllm_outputs
from openllm_core._schema import unmarshal_vllm_outputs
from .exceptions import ForbiddenAttributeError, GpuNotAvailableError, OpenLLMException
from .models.auto import AutoConfig
from .utils import (
from openllm_core.utils import (
DEBUG,
ENV_VARS_TRUE_VALUES,
MYPY,
@@ -29,7 +28,6 @@ from .utils import (
device_count,
first_not_none,
generate_hash_from_file,
infer_auto_class,
is_peft_available,
is_torch_available,
non_intrusive_setattr,
@@ -37,8 +35,8 @@ from .utils import (
resolve_filepath,
validate_is_path,
)
from ._typing_compat import (
from .utils import infer_auto_class
from openllm_core._typing_compat import (
AdaptersMapping,
AdaptersTuple,
AnyCallable,
@@ -57,8 +55,8 @@ from ._typing_compat import (
if t.TYPE_CHECKING:
import auto_gptq as autogptq, peft, torch, transformers, vllm
from ._configuration import PeftType
from .utils.representation import ReprArgs
from openllm_core._configuration import PeftType
from openllm_core.utils.representation import ReprArgs
else:
autogptq = LazyLoader("autogptq", globals(), "auto_gptq")
vllm = LazyLoader("vllm", globals(), "vllm")
@@ -156,27 +154,6 @@ class LLMInterface(ABC, t.Generic[M, T]):
"""The iterator version of `generate` function."""
raise NotImplementedError("Currently generate_iterator requires SSE (Server-side events) support, which is not yet implemented.")
def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
"""This handler will sanitize all attrs and setup prompt text.
It takes a prompt that is given by the user, attrs that can be parsed with the prompt.
Returns a tuple of three items:
- The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig
- The attributes dictionary that will be passed into `self.postprocess_generate`.
"""
return prompt, attrs, attrs
def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
"""This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.).
You can customize how the output of the LLM looks with this hook. By default, it is a simple echo.
> [!NOTE]
> This will be used from the client side.
"""
return generation_result
def llm_post_init(self) -> None:
"""Optional hook: implement this if you need to initialize any additional variables that don't concern OpenLLM internals.

The default implementation does nothing.
"""
pass
@@ -380,9 +357,7 @@ def _make_assignment_script(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]
anns[key] = interface_anns.get(key)
return codegen.generate_function(cls, "__assign_llm_attr", lines, args=("cls", *args), globs=globs, annotations=anns)
def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
return generation_result[0]["outputs"][0]["text"]
def vllm_postprocess_generate(self: LLM["vllm.LLMEngine", T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str:
  """Pull the generated text out of a vLLM-style generation result.

  Only the first element of ``generation_result`` is inspected, and within it only the
  first entry of its ``"outputs"`` list; that entry's ``"text"`` value is returned.
  ``self``, ``prompt`` and any extra keyword arguments are accepted but not used.
  """
  first_request = generation_result[0]
  first_completion = first_request["outputs"][0]
  return first_completion["text"]
def vllm_generate(self: LLM["vllm.LLMEngine", T], prompt: str, **attrs: t.Any) -> list[dict[str, t.Any]]:
outputs: list[vllm.RequestOutput] = []
# TODO: support prompt_token_ids
@@ -430,8 +405,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
elif "config_class" not in cd: raise RuntimeError("Missing required key 'config_class'. Make sure to define it within the LLM subclass.")
_make_assignment_script(cls)(cls)
if "tokenizer_id" not in cd and cls.__llm_implementation__ == "vllm": cls.tokenizer_id = _DEFAULT_TOKENIZER
# fmt: off
@overload
def __getitem__(self, item: t.Literal["trust_remote_code"]) -> bool: ...
@overload
@@ -459,24 +432,14 @@ class LLM(LLMInterface[M, T], ReprMixin):
if hasattr(self, internal_attributes): return getattr(self, internal_attributes)
elif hasattr(self, item): return getattr(self, item)
else: raise KeyError(item)
@classmethod
@overload
def from_pretrained(
cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ...,
quantization_config: transformers.BitsAndBytesConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any
) -> LLM[M, T]: ...
@classmethod
def from_pretrained(cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["int8", "int4"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: transformers.BitsAndBytesConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLM[M, T]: ...
@overload
def from_pretrained(
cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["gptq"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ...,
quantization_config: autogptq.BaseQuantizeConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any
) -> LLM[M, T]: ...
# fmt: on
@classmethod
def from_pretrained(
cls, model_id: str | None = None, model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: str | bool | None = None, adapter_id: str | None = None, adapter_name: str | None = None,
adapter_map: dict[str, str | None] | None = None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = "safetensors", **attrs: t.Any,
) -> LLM[M, T]:
def from_pretrained(cls, model_id: str | None = ..., model_version: str | None = ..., llm_config: LLMConfig | None = ..., *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = ..., quantize: t.Literal["gptq"] = ..., bettertransformer: str | bool | None = ..., adapter_id: str | None = ..., adapter_name: str | None = ..., adapter_map: dict[str, str | None] | None = ..., quantization_config: autogptq.BaseQuantizeConfig | None = ..., serialisation: t.Literal["safetensors", "legacy"] = ..., **attrs: t.Any) -> LLM[M, T]: ...
@classmethod
def from_pretrained(cls, model_id: str | None = None, model_version: str | None = None, llm_config: LLMConfig | None = None, *args: t.Any, runtime: t.Literal["ggml", "transformers"] | None = None, quantize: t.Literal["int8", "int4", "gptq"] | None = None, bettertransformer: str | bool | None = None, adapter_id: str | None = None, adapter_name: str | None = None, adapter_map: dict[str, str | None] | None = None, quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None = None, serialisation: t.Literal["safetensors", "legacy"] = "safetensors", **attrs: t.Any) -> LLM[M, T]:
"""Instantiate a pretrained LLM.
``LLM.from_pretrained`` follows the same design principle as HuggingFace's `from_pretrained` method, plus the following:
@@ -708,7 +671,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
normalized_model_kwds, normalized_tokenizer_kwds = normalize_attrs_to_model_tokenizer_pair(**attrs)
# NOTE: Save the args and kwargs for latter load
self.__attrs_init__(llm_config, quantization_config, model_id, _runtime, args, {**model_kwds, **normalized_model_kwds}, {**tokenizer_kwds, **normalized_tokenizer_kwds}, _tag, _adapters_mapping, _model_version, _quantize_method, _serialisation_format, _local)
# handle trust_remote_code
_from_env = os.getenv("TRUST_REMOTE_CODE", None)
self.__llm_trust_remote_code__ = first_not_none(str(_from_env).upper() in ENV_VARS_TRUE_VALUES if _from_env else None, default=self._model_attrs.pop("trust_remote_code", self.config["trust_remote_code"]))
@@ -723,7 +685,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
def __setattr__(self, attr: str, value: t.Any) -> None:
if attr in _reserved_namespace: raise ForbiddenAttributeError(f"{attr} should not be set during runtime as these value will be reflected during runtime. Instead, you can create a custom LLM subclass {self.__class__.__name__}.")
super().__setattr__(attr, value)
@property
def adapters_mapping(self) -> AdaptersMapping | None: return self._adapters_mapping
@adapters_mapping.setter
@@ -740,6 +701,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
def runtime(self) -> t.Literal["ggml", "transformers"]: return self._runtime
@property
def runner_name(self) -> str: return f"llm-{self.config['start_name']}-runner"
# NOTE: The section below defines a loose contract with langchain's LLM interface.
@property
def llm_type(self) -> str: return normalise_model_name(self._model_id)
@property
@@ -755,6 +717,27 @@ class LLM(LLMInterface[M, T], ReprMixin):
if self.__llm_bentomodel__ is None: self.__llm_bentomodel__ = openllm.serialisation.get(self)
return self.__llm_bentomodel__
def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]:
"""This handler will sanitize all attrs and set up the prompt text.
It takes a prompt that is given by the user, and attrs that can be parsed with the prompt.
The work is delegated to ``self.config.sanitize_parameters``.
Returns a tuple of three items:
- The prompt text (a ``str``; presumably the prepared prompt -- confirm against the config implementation)
- The attributes dictionary that can be passed into LLMConfig to generate a GenerationConfig
- The attributes dictionary that will be passed into `self.postprocess_generate`.
"""
return self.config.sanitize_parameters(prompt, **attrs)
def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any:
"""This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decides to do so).
You can customize how the output of the LLM looks with this hook.
A ``dict`` result is reduced to its ``"text"`` entry; any other result is handed to ``self.config.postprocess_generate`` together with the prompt and attrs.
> [!NOTE]
> This will be used from the client side.
"""
# dict-shaped results are unwrapped here and never reach the config hook
if isinstance(generation_result, dict): return generation_result["text"]
return self.config.postprocess_generate(prompt, generation_result, **attrs)
@property
def model(self) -> M:
# Run check for GPU
@@ -868,7 +851,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
# order of these fields matter here, make sure to sync it with
# openllm.models.auto.factory.BaseAutoLLMClass.for_model
def to_runner(self, models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] | None = None) -> LLMRunner[M, T]:
def to_runner(self, models: list[bentoml.Model] | None = None, max_batch_size: int | None = None, max_latency_ms: int | None = None, scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]:
"""Convert this LLM into a Runner.
Args:
@@ -894,10 +877,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
try: models.append(self._bentomodel)
except bentoml.exceptions.NotFound as err: raise RuntimeError(f"Failed to locate {self._bentomodel}:{err}") from None
if scheduling_strategy is None:
from ._strategies import CascadingResourceStrategy
scheduling_strategy = CascadingResourceStrategy
generate_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
embeddings_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=True, batch_dim=0)))
generate_iterator_sig = ModelSignature.from_dict(t.cast("_ModelSignatureDict", ModelSignatureDict(batchable=False)))
@@ -932,10 +911,6 @@ class LLM(LLMInterface[M, T], ReprMixin):
for it in self.generate_iterator(prompt, **attrs): pass
return [it]
def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> str:
if isinstance(generation_result, dict): return generation_result["text"]
return generation_result
def generate_iterator(self, prompt: str, /,
*, context_length: int | None = None, echo: bool = True, stream_interval: int = 2, stop: str | t.Iterable[str] | None = None, stop_token_ids: list[int] | None = None, **attrs: t.Any) -> t.Iterator[t.Any]:
# NOTE: encoder-decoder models will need to implement their own generate_iterator for now

View File

@@ -1,12 +1,11 @@
# mypy: disable-error-code="name-defined"
# mypy: disable-error-code="name-defined,no-redef"
from __future__ import annotations
import logging, sys, typing as t
from .utils import LazyLoader, is_autogptq_available, is_bitsandbytes_available, is_transformers_supports_kbit, pkg
if sys.version_info[:2] >= (3, 11): from typing import overload
else: from typing_extensions import overload
import logging, typing as t
from openllm_core.utils import LazyLoader, is_autogptq_available, is_bitsandbytes_available, is_transformers_supports_kbit, pkg
from openllm_core._typing_compat import overload
if t.TYPE_CHECKING:
from ._llm import LLM
from ._typing_compat import DictStrAny
from openllm_core._typing_compat import DictStrAny
autogptq, torch, transformers = LazyLoader("autogptq", globals(), "auto_gptq"), LazyLoader("torch", globals(), "torch"), LazyLoader("transformers", globals(), "transformers")

View File

@@ -4,15 +4,12 @@ These utilities will stay internal, and its API can be changed or updated withou
"""
from __future__ import annotations
import os, typing as t
from openllm.utils import LazyModule
from openllm_core.utils import LazyModule
_import_structure: dict[str, list[str]] = {"_package": ["create_bento", "build_editable", "construct_python_options", "construct_docker_options"], "oci": ["CONTAINER_NAMES", "get_base_container_tag", "build_container", "get_base_container_name", "supported_registries", "RefResolver"]}
if t.TYPE_CHECKING:
from . import (
_package as _package,
oci as oci,
)
from . import _package as _package, oci as oci
from ._package import (
build_editable as build_editable,
construct_docker_options as construct_docker_options,
@@ -28,7 +25,7 @@ if t.TYPE_CHECKING:
supported_registries as supported_registries,
)
__lazy=LazyModule(__name__, os.path.abspath("__file__"), _import_structure)
__all__=__lazy.__all__
__dir__=__lazy.__dir__
__getattr__=__lazy.__getattr__
# NOTE: hand LazyModule this module's real file path. The previous os.path.abspath("__file__")
# resolved the *literal string* "__file__" against the current working directory, which yields
# a path unrelated to this package. globals()["__file__"] matches how openllm/__init__.py does it.
__lazy = LazyModule(__name__, globals()["__file__"], _import_structure)
# Re-export the lazy module's public surface so attribute access on this package is resolved lazily.
__all__ = __lazy.__all__
__dir__ = __lazy.__dir__
__getattr__ = __lazy.__getattr__

View File

@@ -1,35 +1,34 @@
# mypy: disable-error-code="misc"
from __future__ import annotations
import importlib.metadata, inspect, logging, os, typing as t
import fs, fs.copy, fs.errors, orjson, bentoml, openllm_core, importlib.metadata, inspect, logging, os, typing as t, string
from pathlib import Path
import fs, fs.copy, fs.errors, orjson, bentoml, openllm
from simple_di import Provide, inject
from bentoml._internal.bento.build_config import BentoBuildConfig, DockerOptions, ModelSpec, PythonOptions
from bentoml._internal.configuration.containers import BentoMLContainer
from . import oci
if t.TYPE_CHECKING:
import openllm
from fs.base import FS
from openllm._typing_compat import LiteralString
from openllm_core._typing_compat import LiteralString, LiteralContainerRegistry, LiteralContainerVersionStrategy
from bentoml._internal.bento import BentoStore
from bentoml._internal.models.model import ModelStore
from .oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
logger = logging.getLogger(__name__)
OPENLLM_DEV_BUILD = "OPENLLM_DEV_BUILD"
def build_editable(path: str) -> str | None:
def build_editable(path: str, package: t.Literal["openllm", "openllm_core", "openllm_client"] = "openllm") -> str | None:
"""Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set."""
if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != "true": return None
# We need to build the package in editable mode, so that we can import it
from build import ProjectBuilder
from build.env import IsolatedEnvBuilder
module_location = openllm.utils.pkg.source_locations("openllm")
module_location = openllm_core.utils.pkg.source_locations(package)
if not module_location: raise RuntimeError("Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.")
pyproject_path = Path(module_location).parent.parent/"pyproject.toml"
if os.path.isfile(pyproject_path.__fspath__()):
logger.info("OpenLLM is installed in editable mode. Generating built wheels...")
logger.info("Generating built wheels for package %s...", package)
with IsolatedEnvBuilder() as env:
builder = ProjectBuilder(pyproject_path.parent)
builder.python_executable = env.executable
@@ -49,15 +48,15 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
req = llm.config["requirements"]
if req is not None: packages.extend(req)
if str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false": packages.append(f"bentoml>={'.'.join([str(i) for i in openllm.utils.pkg.pkg_version_info('bentoml')])}")
if str(os.environ.get("BENTOML_BUNDLE_LOCAL_BUILD", False)).lower() == "false": packages.append(f"bentoml>={'.'.join([str(i) for i in openllm_core.utils.pkg.pkg_version_info('bentoml')])}")
env = llm.config["env"]
framework_envvar = env["framework_value"]
if framework_envvar == "flax":
if not openllm.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
if not openllm_core.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.framework} is set to 'flax'")
# Pin each Flax-stack distribution as "name>=installed_version". A bare version string such as
# "0.7.2" is not a valid requirement specifier; this mirrors the torch/tensorflow branches.
packages.extend([f"{_dist}>={importlib.metadata.version(_dist)}" for _dist in ("flax", "jax", "jaxlib")])
elif framework_envvar == "tf":
if not openllm.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.framework} is set to 'tf'")
candidates = ("tensorflow", "tensorflow-cpu", "tensorflow-gpu", "tf-nightly", "tf-nightly-cpu", "tf-nightly-gpu", "intel-tensorflow", "intel-tensorflow-avx512", "tensorflow-rocm", "tensorflow-macos",)
# For the metadata, we have to look for both tensorflow and tensorflow-cpu
for candidate in candidates:
@@ -68,19 +67,19 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_d
_tf_version = importlib.metadata.version(candidate)
packages.extend([f"tensorflow>={_tf_version}"])
break
except importlib.metadata.PackageNotFoundError: pass # noqa: PERF203 # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
except importlib.metadata.PackageNotFoundError: pass # Ok to ignore here since we actually need to check for all possible tensorflow distribution.
else:
if not openllm.utils.is_torch_available(): raise ValueError("PyTorch is not available. Make sure to have it locally installed.")
if not openllm_core.utils.is_torch_available(): raise ValueError("PyTorch is not available. Make sure to have it locally installed.")
packages.extend([f'torch>={importlib.metadata.version("torch")}'])
wheels: list[str] = []
built_wheels = build_editable(llm_fs.getsyspath("/"))
if built_wheels is not None: wheels.append(llm_fs.getsyspath(f"/{built_wheels.split('/')[-1]}"))
built_wheels: list[str | None] = [build_editable(llm_fs.getsyspath("/"), t.cast(t.Literal["openllm", "openllm_core", "openllm_client"], p)) for p in ("openllm_core", "openllm_client", "openllm")]
if all(i for i in built_wheels): wheels.extend([llm_fs.getsyspath(f"/{i.split('/')[-1]}") for i in t.cast(t.List[str], built_wheels)])
return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, extra_index_url=["https://download.pytorch.org/whl/cu118"])
def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, bettertransformer: bool | None, adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, runtime: t.Literal["ggml", "transformers"], serialisation_format: t.Literal["safetensors", "legacy"], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions:
from openllm.cli._factory import parse_config_options
environ = parse_config_options(llm.config, llm.config["timeout"], workers_per_resource, None, True, os.environ.copy())
env: openllm.utils.EnvVarMixin = llm.config["env"]
env: openllm_core.utils.EnvVarMixin = llm.config["env"]
if env["framework_value"] == "vllm": serialisation_format = "legacy"
env_dict = {
env.framework: env["framework_value"], env.config: f"'{llm.config.model_dump_json().decode()}'",
@@ -91,13 +90,45 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_
if adapter_map: env_dict["BITSANDBYTES_NOWELCOME"] = os.environ.get("BITSANDBYTES_NOWELCOME", "1")
# We need to handle None separately here, as env from subprocess doesn't accept None value.
_env = openllm.utils.EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
_env = openllm_core.utils.EnvVarMixin(llm.config["model_name"], bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
env_dict[_env.bettertransformer] = str(_env["bettertransformer_value"])
if _env["quantize_value"] is not None: env_dict[_env.quantize] = t.cast(str, _env["quantize_value"])
env_dict[_env.runtime] = _env["runtime_value"]
return DockerOptions(base_image=f"{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}", env=env_dict, dockerfile_template=dockerfile_template)
OPENLLM_MODEL_NAME = "# openllm: model name"
OPENLLM_MODEL_ADAPTER_MAP = "# openllm: model adapter map"
class ModelNameFormatter(string.Formatter):
  """Formatter that substitutes a single fixed keyword (``model_keyword``) with the value given at construction.

  Subclasses override ``model_keyword`` to target a different placeholder.
  """
  model_keyword: LiteralString = "__model_name__"
  def __init__(self, model_name: str):
    """The formatter that extends model_name to be formatted the 'service.py'."""
    super().__init__()
    self.model_name = model_name
  def vformat(self, format_string: str, *args: t.Any, **attrs: t.Any) -> t.Any:
    """Format ``format_string`` using only ``{model_keyword: model_name}``.

    Any positional or keyword arguments supplied by the caller are intentionally ignored.
    """
    return super().vformat(format_string, (), {self.model_keyword: self.model_name})
  def can_format(self, value: str) -> bool:
    """Return True if ``value`` is a syntactically valid format string, False otherwise."""
    try:
      # string.Formatter.parse() returns a lazy iterator: a malformed format string only raises
      # ValueError once the iterator is consumed, so it has to be exhausted here.
      for _ in self.parse(value): pass
      return True
    except ValueError: return False
# Variant of ModelNameFormatter whose substituted placeholder is '__model_id__' instead of '__model_name__'.
class ModelIdFormatter(ModelNameFormatter):
model_keyword: LiteralString = "__model_id__"
# Variant of ModelNameFormatter whose substituted placeholder is '__model_adapter_map__'.
class ModelAdapterMapFormatter(ModelNameFormatter):
model_keyword: LiteralString = "__model_adapter_map__"
_service_file = Path(os.path.abspath(__file__)).parent.parent/"_service.py"
def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None:
  """Render the ``_service.py`` template for ``llm`` and write it into ``llm_fs``.

  Every template line carrying the ``OPENLLM_MODEL_NAME`` marker has its placeholder filled with the
  model name; every line carrying the ``OPENLLM_MODEL_ADAPTER_MAP`` marker is filled with the
  JSON-encoded ``adapter_map``. The trailing marker comment is cut from each rewritten line.

  Args:
    llm: The LLM whose config supplies ``model_name`` and ``service_name``.
    adapter_map: Optional adapter mapping, serialised with ``orjson`` into the template.
    llm_fs: Target filesystem; the script is written to ``llm.config["service_name"]``.
  """
  from openllm_core.utils import DEBUG
  model_name = llm.config["model_name"]
  logger.debug("Generating service file for %s at %s (dir=%s)", model_name, llm.config["service_name"], llm_fs.getsyspath("/"))
  with open(_service_file.__fspath__(), "r") as f: src_contents = f.readlines()
  # Rewrite by position: enumerate() avoids the list.index() lookup, which is quadratic and resolves
  # to the first equal line rather than the line currently being visited.
  for idx, line in enumerate(src_contents):
    # The slice drops the marker comment plus three more trailing characters
    # (presumably the two spaces before the marker and the newline -- TODO confirm against _service.py).
    if OPENLLM_MODEL_NAME in line: src_contents[idx] = (ModelNameFormatter(model_name).vformat(line)[:-(len(OPENLLM_MODEL_NAME) + 3)] + "\n")
    elif OPENLLM_MODEL_ADAPTER_MAP in line: src_contents[idx] = (ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(line)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + "\n")
  script = f"# GENERATED BY 'openllm build {model_name}'. DO NOT EDIT\n\n" + "".join(src_contents)
  if DEBUG: logger.info("Generated script:\n%s", script)
  llm_fs.writetext(llm.config["service_name"], script)
@inject
def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.Any], workers_per_resource: str | float, quantize: LiteralString | None, bettertransformer: bool | None, dockerfile_template: str | None, adapter_map: dict[str, str | None] | None = None, extra_dependencies: tuple[str, ...] | None = None,
runtime: t.Literal[ "ggml", "transformers"] = "transformers", serialisation_format: t.Literal["safetensors", "legacy"] = "safetensors", container_registry: LiteralContainerRegistry = "ecr", container_version_strategy: LiteralContainerVersionStrategy = "release",
@@ -108,14 +139,14 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
if adapter_map: labels.update(adapter_map)
if isinstance(workers_per_resource, str):
if workers_per_resource == "round_robin": workers_per_resource = 1.0
elif workers_per_resource == "conserved": workers_per_resource = 1.0 if openllm.utils.device_count() == 0 else float(1 / openllm.utils.device_count())
elif workers_per_resource == "conserved": workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count())
else:
try: workers_per_resource = float(workers_per_resource)
except ValueError: raise ValueError("'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None
elif isinstance(workers_per_resource, int): workers_per_resource = float(workers_per_resource)
logger.info("Building Bento for '%s'", llm.config["start_name"])
# add service.py definition to this temporary folder
openllm.utils.codegen.write_service(llm, adapter_map, llm_fs)
write_service(llm, adapter_map, llm_fs)
llm_spec = ModelSpec.from_item({"tag": str(llm.tag), "alias": llm.tag.name})
build_config = BentoBuildConfig(
@@ -134,7 +165,7 @@ def create_bento(bento_tag: bentoml.Tag, llm_fs: FS, llm: openllm.LLM[t.Any, t.A
if "__bento_name__" in it: service_contents[service_contents.index(it)] = it.format(__bento_name__=str(bento.tag))
script = "".join(service_contents)
if openllm.utils.DEBUG: logger.info("Generated script:\n%s", script)
if openllm_core.utils.DEBUG: logger.info("Generated script:\n%s", script)
bento._fs.writetext(service_fs_path, script)
if "model_store" in inspect.signature(bento.save).parameters: return bento.save(bento_store=_bento_store, model_store=_model_store)

View File

@@ -1,26 +1,23 @@
# mypy: disable-error-code="misc"
"""OCI-related utilities for OpenLLM. This module is considered to be internal and API are subjected to change."""
from __future__ import annotations
import functools, importlib, logging, os, pathlib, shutil, subprocess, typing as t
import functools, importlib, logging, os, pathlib, shutil, subprocess, typing as t, openllm_core
from datetime import datetime, timedelta, timezone
import attr, orjson, bentoml, openllm
from openllm.utils.lazy import VersionInfo
from openllm_core.utils.lazy import VersionInfo
if t.TYPE_CHECKING:
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
from ghapi import all
from openllm._typing_compat import RefTuple, LiteralString
from openllm_core._typing_compat import RefTuple, LiteralString
all = openllm.utils.LazyLoader("all", globals(), "ghapi.all") # noqa: F811
all = openllm_core.utils.LazyLoader("all", globals(), "ghapi.all") # noqa: F811
logger = logging.getLogger(__name__)
_BUILDER = bentoml.container.get_backend("buildx")
# NOTE(review): "__file__" below is a quoted string literal, not the module's __file__ attribute, so
# os.path.abspath() resolves it against the current working directory -- confirm this is intended.
ROOT_DIR = pathlib.Path(os.path.abspath("__file__")).parent.parent.parent
# TODO: support quay
LiteralContainerRegistry = t.Literal["docker", "gh", "ecr"]
LiteralContainerVersionStrategy = t.Literal["release", "nightly", "latest", "custom"]
# XXX: The registry is hard-coded for now because that is easier to maintain.
# In the future, we could infer it from the git repository to give users more options
# for building the base image. For now, all of the base images will be <registry>/bentoml/openllm:...
@@ -31,10 +28,10 @@ _CONTAINER_REGISTRY: dict[LiteralContainerRegistry, str] = {"docker": "docker.io
_OWNER = "bentoml"
_REPO = "openllm"
_module_location = openllm.utils.pkg.source_locations("openllm")
_module_location = openllm_core.utils.pkg.source_locations("openllm")
@functools.lru_cache
@openllm.utils.apply(str.lower)
@openllm_core.utils.apply(str.lower)
def get_base_container_name(reg: LiteralContainerRegistry) -> str: return _CONTAINER_REGISTRY[reg]
# Parse a version string into a VersionInfo by delegating to VersionInfo.from_version_string.
def _convert_version_from_string(s: str) -> VersionInfo: return VersionInfo.from_version_string(s)
@@ -43,7 +40,7 @@ def _commit_time_range(r: int = 5) -> str: return (datetime.now(timezone.utc) -
class VersionNotSupported(openllm.exceptions.OpenLLMException):
"""Raised when the stable release is too old to include the OpenLLM base container."""
_RefTuple: type[RefTuple] = openllm.utils.codegen.make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"])
_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class("_RefTuple", ["git_hash", "version", "strategy"])
def nightly_resolver(cls: type[RefResolver]) -> str:
# NOTE: all openllm container will have sha-<git_hash[:7]>
@@ -60,7 +57,7 @@ def nightly_resolver(cls: type[RefResolver]) -> str:
@attr.attrs(eq=False, order=False, slots=True, frozen=True)
class RefResolver:
git_hash: str = attr.field()
version: openllm.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
version: openllm_core.utils.VersionInfo = attr.field(converter=_convert_version_from_string)
strategy: LiteralContainerVersionStrategy = attr.field()
_ghapi: t.ClassVar[all.GhApi] = all.GhApi(owner=_OWNER, repo=_REPO)
@classmethod
@@ -74,7 +71,7 @@ class RefResolver:
version_str = meta["name"].lstrip("v")
version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")["object"]["sha"], version_str)
else: version = ("", version_str)
if openllm.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'")
return _RefTuple((*version, "release" if _use_base_strategy else "custom"))
@classmethod
@functools.lru_cache(maxsize=64)
@@ -101,7 +98,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
try:
if not _BUILDER.health(): raise openllm.exceptions.Error
except (openllm.exceptions.Error, subprocess.CalledProcessError): raise RuntimeError("Building base container requires BuildKit (via Buildx) to be installed. See https://docs.docker.com/build/buildx/install/ for instalation instruction.") from None
if openllm.utils.device_count() == 0: raise RuntimeError("Building base container requires GPUs (None available)")
if openllm_core.utils.device_count() == 0: raise RuntimeError("Building base container requires GPUs (None available)")
if not shutil.which("nvidia-container-runtime"): raise RuntimeError("NVIDIA Container Toolkit is required to compile CUDA kernel in container.")
if not _module_location: raise RuntimeError("Failed to determine source location of 'openllm'. (Possible broken installation)")
pyproject_path = pathlib.Path(_module_location).parent.parent / "pyproject.toml"
@@ -111,7 +108,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon
registries = [registries] if isinstance(registries, str) else list(registries)
tags = {name: f"{_CONTAINER_REGISTRY[name]}:{get_base_container_tag(version_strategy)}" for name in registries}
try:
outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm.utils.get_debug_mode() else "auto", quiet=machine)
outputs = _BUILDER.build(file=pathlib.Path(__file__).parent.joinpath("Dockerfile").resolve().__fspath__(), context_path=pyproject_path.parent.__fspath__(), tag=tuple(tags.values()), push=push, progress="plain" if openllm_core.utils.get_debug_mode() else "auto", quiet=machine)
if machine and outputs is not None: tags["image_sha"] = outputs.decode("utf-8").strip()
except Exception as err: raise openllm.exceptions.OpenLLMException(f"Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}") from err
return tags

View File

@@ -1,16 +1,16 @@
from __future__ import annotations
import functools, importlib.util, os, typing as t, logging
import click, click_option_group as cog, inflection, orjson, bentoml, openllm
import functools, importlib.util, os, typing as t, logging, click, click_option_group as cog, inflection, orjson, bentoml, openllm
from click import shell_completion as sc
from bentoml_cli.utils import BentoMLCommandGroup
from click.shell_completion import CompletionItem
from openllm.utils import DEBUG
from openllm_core.utils import DEBUG
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate
from openllm_core._typing_compat import LiteralString, DictStrAny, ParamSpec, Concatenate
from . import termui
if t.TYPE_CHECKING:
import subprocess
from openllm._configuration import LLMConfig
from openllm_core._configuration import LLMConfig
logger = logging.getLogger(__name__)
@@ -20,6 +20,12 @@ LiteralOutput = t.Literal["json", "pretty", "porcelain"]
_AnyCallable = t.Callable[..., t.Any]
FC = t.TypeVar("FC", bound=t.Union[_AnyCallable, click.Command])
def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
  """Shell-completion callback offering tags of Bentos labelled with both 'start_name' and 'bundler'.

  Only Bentos whose stringified tag starts with ``incomplete`` are returned.
  """
  required_labels = {"start_name", "bundler"}
  completions = []
  for bento in bentoml.list():
    if not str(bento.tag).startswith(incomplete): continue
    if not all(label in bento.info.labels for label in required_labels): continue
    completions.append(sc.CompletionItem(str(bento.tag), help="Bento"))
  return completions
def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]:
  """Shell-completion callback offering dasherized model names from ``openllm.CONFIG_MAPPING``.

  The prefix match is done against the un-dasherized mapping key.
  """
  completions = []
  for name in openllm.CONFIG_MAPPING:
    if name.startswith(incomplete):
      completions.append(sc.CompletionItem(inflection.dasherize(name), help="Model"))
  return completions
def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny:
# TODO: Support amd.com/gpu on k8s
_bentoml_config_options_env = environ.pop("BENTOML_CONFIG_OPTIONS", "")
@@ -316,7 +322,7 @@ def cors_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC
def machine_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  """Attach the hidden ``--machine`` boolean flag (default False) through ``cli_option``."""
  decorate = cli_option("--machine", is_flag=True, default=False, hidden=True, **attrs)
  return decorate(f)
def model_id_option(f: _AnyCallable | None = None, *, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  """Attach the ``--model-id`` string option through ``cli_option``.

  When ``model_env`` is given, its ``model_id`` attribute is used as the backing environment
  variable and that variable is shown in the help output; otherwise no envvar is bound.
  """
  has_env = model_env is not None
  envvar = model_env.model_id if has_env else None
  decorate = cli_option(
    "--model-id",
    type=click.STRING,
    default=None,
    envvar=envvar,
    show_envvar=has_env,
    help="Optional model_id name or path for (fine-tune) weight.",
    **attrs,
  )
  return decorate(f)
def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
  """Attach the ``--model-version`` string option (default None) through ``cli_option``."""
  decorate = cli_option(
    "--model-version",
    type=click.STRING,
    default=None,
    help="Optional model version to save for this model. It will be inferred automatically from model-id.",
    **attrs,
  )
  return decorate(f)
def model_name_argument(f: _AnyCallable | None = None, required: bool = True) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required)(f)
def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f)
def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, model_env: openllm.utils.EnvVarMixin | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
"--quantise", "--quantize", "quantize", type=click.Choice(["int8", "int4", "gptq"]), default=None, envvar=model_env.quantize if model_env is not None else None, show_envvar=model_env is not None, help="""Dynamic quantization for running this LLM.
@@ -382,7 +388,7 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal
)(f)
def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]:
return cli_option(
"--container-registry", "container_registry", type=str, default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM.
"--container-registry", "container_registry", type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), default="ecr", show_default=True, show_envvar=True, envvar="OPENLLM_CONTAINER_REGISTRY", callback=container_registry_callback, help="""The default container registry to get the base image for building BentoLLM.
Currently, it supports 'ecr', 'ghcr.io', 'docker.io'

View File

@@ -1,6 +1,5 @@
from __future__ import annotations
import itertools, logging, os, re, subprocess, sys, typing as t
import bentoml, openllm
import itertools, logging, os, re, subprocess, sys, typing as t, bentoml, openllm, openllm_core
from simple_di import Provide, inject
from bentoml._internal.configuration.containers import BentoMLContainer
from openllm.exceptions import OpenLLMException
@@ -8,10 +7,9 @@ from . import termui
from ._factory import start_command_factory
if t.TYPE_CHECKING:
from openllm._typing_compat import LiteralString, LiteralRuntime
from openllm_core._configuration import LLMConfig
from openllm_core._typing_compat import LiteralString, LiteralRuntime, LiteralContainerRegistry, LiteralContainerVersionStrategy
from bentoml._internal.bento import BentoStore
from openllm._configuration import LLMConfig
from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
logger = logging.getLogger(__name__)
@@ -58,7 +56,7 @@ def _start(model_name: str, /, *, model_id: str | None = None, timeout: int = 30
"""
from .entrypoint import start_command, start_grpc_command
llm_config = openllm.AutoConfig.for_model(model_name)
_ModelEnv = openllm.utils.EnvVarMixin(model_name, openllm.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
_ModelEnv = openllm_core.utils.EnvVarMixin(model_name, openllm_core.utils.first_not_none(framework, default=llm_config.default_implementation()), model_id=model_id, bettertransformer=bettertransformer, quantize=quantize, runtime=runtime)
os.environ[_ModelEnv.framework] = _ModelEnv["framework_value"]
args: list[str] = ["--runtime", runtime]
@@ -203,5 +201,5 @@ def _list_models() -> dict[str, t.Any]:
return models_command.main(args=["-o", "json", "--show-available", "--machine"], standalone_mode=False)
start, start_grpc, build, import_model, list_models = openllm.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm.utils.codegen.gen_sdk(_build), openllm.utils.codegen.gen_sdk(_import_model), openllm.utils.codegen.gen_sdk(_list_models)
start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk(_import_model), openllm_core.utils.codegen.gen_sdk(_list_models)
__all__ = ["start", "start_grpc", "build", "import_model", "list_models"]

View File

@@ -20,10 +20,9 @@ bentomodel = openllm.import_model("falcon", model_id='tiiuae/falcon-7b-instruct'
```
"""
from __future__ import annotations
import functools, http.client, inspect, itertools, logging, os, platform, re, subprocess, sys, time, traceback, typing as t
import attr, click, click_option_group as cog, fs, fs.copy, fs.errors, inflection, orjson, bentoml, openllm
from bentoml_cli.utils import BentoMLCommandGroup, opt_callback
import functools, http.client, inspect, itertools, logging, os, platform, re, subprocess, sys, time, traceback, typing as t, attr, click, click_option_group as cog, fs, fs.copy, fs.errors, inflection, orjson, bentoml, openllm
from simple_di import Provide, inject
from bentoml_cli.utils import BentoMLCommandGroup, opt_callback
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.models.model import ModelStore
from . import termui
@@ -56,8 +55,8 @@ from openllm.models.auto import (
AutoConfig,
AutoLLM,
)
from openllm._typing_compat import DictStrAny, ParamSpec, Concatenate, LiteralString, Self, LiteralRuntime
from openllm.utils import (
from openllm_core._typing_compat import DictStrAny, ParamSpec, Concatenate, LiteralString, Self, LiteralRuntime
from openllm_core.utils import (
DEBUG,
DEBUG_ENV_VAR,
OPTIONAL_DEPENDENCIES,
@@ -72,21 +71,20 @@ from openllm.utils import (
first_not_none,
get_debug_mode,
get_quiet_mode,
infer_auto_class,
is_torch_available,
is_transformers_supports_agent,
resolve_user_filepath,
set_debug_mode,
set_quiet_mode,
)
from openllm.utils import infer_auto_class
if t.TYPE_CHECKING:
import torch
from bentoml._internal.bento import BentoStore
from bentoml._internal.container import DefaultBuilder
from openllm.client import BaseClient
from openllm._schema import EmbeddingsOutput
from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
from openllm_core._schema import EmbeddingsOutput
from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
else: torch = LazyLoader("torch", globals(), "torch")
P = ParamSpec("P")
@@ -271,7 +269,7 @@ def cli() -> None:
\b
An open platform for operating large language models in production.
Fine-tune, serve, deploy, and monitor any LLMs with ease.
""" # noqa: D205
"""
@cli.group(cls=OpenLLMCommandGroup, context_settings=termui.CONTEXT_SETTINGS, name="start", aliases=["start-http"])
def start_command() -> None:
@@ -670,10 +668,8 @@ def instruct_command(endpoint: str, timeout: int, agent: LiteralString, output:
"""
client = openllm.client.HTTPClient(endpoint, timeout=timeout)
try:
client.call("metadata")
except http.client.BadStatusLine:
raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
try: client.call("metadata")
except http.client.BadStatusLine: raise click.ClickException(f"{endpoint} is neither a HTTP server nor reachable.") from None
if agent == "hf":
if not is_transformers_supports_agent(): raise click.UsageError("Transformers version should be at least 4.29 to support HfAgent. Upgrade with 'pip install -U transformers'")
_memoized = {k: v[0] for k, v in _memoized.items() if v}
@@ -700,7 +696,7 @@ def embed_command(ctx: click.Context, text: tuple[str, ...], endpoint: str, time
$ openllm embed --endpoint http://12.323.2.1:3000 "What is the meaning of life?" "How many stars are there in the sky?"
```
"""
client = t.cast("BaseClient[t.Any]", openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout))
client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout)
try:
gen_embed = client.embed(text)
except ValueError:
@@ -733,14 +729,14 @@ def query_command(ctx: click.Context, /, prompt: str, endpoint: str, timeout: in
"""
_memoized = {k: orjson.loads(v[0]) for k, v in _memoized.items() if v}
if server_type == "grpc": endpoint = re.sub(r"http://", "", endpoint)
client = t.cast("BaseClient[t.Any]", openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout))
client = openllm.client.HTTPClient(endpoint, timeout=timeout) if server_type == "http" else openllm.client.GrpcClient(endpoint, timeout=timeout)
input_fg, generated_fg = "magenta", "cyan"
if output != "porcelain":
termui.echo("==Input==\n", fg="white")
termui.echo(f"{prompt}", fg=input_fg)
res = client.query(prompt, return_response="raw", **{**client.configuration, **_memoized})
if output == "pretty":
response = client.llm.postprocess_generate(prompt, res["responses"])
response = client.config.postprocess_generate(prompt, res["responses"])
termui.echo("\n\n==Responses==\n", fg="white")
termui.echo(response, fg=generated_fg)
elif output == "json":

View File

@@ -1,37 +1,26 @@
from __future__ import annotations
import typing as t
import click
import orjson
import openllm
from .. import termui
from .._factory import machine_option
if t.TYPE_CHECKING:
from openllm.bundle.oci import LiteralContainerRegistry, LiteralContainerVersionStrategy
import typing as t, click, orjson, openllm
from openllm.cli import termui
from openllm.cli._factory import machine_option, container_registry_option
if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralContainerRegistry, LiteralContainerVersionStrategy
@click.command(
"build_base_container", context_settings=termui.CONTEXT_SETTINGS, help="""Base image builder for BentoLLM.
By default, the base image will include custom kernels (PagedAttention via vllm, FlashAttention-v2, etc.) built with CUDA 11.8, Python 3.9 on Ubuntu22.04.
Optionally, this can also be pushed directly to remote registry. Currently support ``docker.io``, ``ghcr.io`` and ``quay.io``.
\b
If '--machine' is passed, then it will run the process quietly, and output a JSON to the current running terminal.
This command is only useful for debugging and for building custom base image for extending BentoML with custom base images and custom kernels.
Note that we already release images on our CI to ECR and GHCR, so you don't need to build it yourself.
"""
)
@click.option("--registry", multiple=True, type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), help="Target registry to create image tag on.", default=None)
@container_registry_option
@click.option("--version-strategy", type=click.Choice(["release", "latest", "nightly"]), default="nightly", help="Version strategy to use for tagging the image.")
@click.option("--push/--no-push", help="Whether to push to remote repository", is_flag=True, default=False)
@machine_option
def cli(registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
mapping = openllm.bundle.build_container(registry, version_strategy, push, machine)
def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]:
mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine)
if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg="white")
return mapping

View File

@@ -1,24 +1,16 @@
from __future__ import annotations
import shutil
import subprocess
import typing as t
import click
import psutil
import shutil, subprocess, typing as t, click, psutil, bentoml
from simple_di import Provide, inject
import bentoml
from bentoml._internal.configuration.containers import BentoMLContainer
from .. import termui
from openllm.cli import termui
from openllm.cli._factory import bento_complete_envvar, machine_option
if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore
@click.command("dive_bentos", context_settings=termui.CONTEXT_SETTINGS)
@click.argument("bento", type=str)
@click.option("--machine", is_flag=True, default=False, hidden=True)
@click.argument("bento", type=str, shell_complete=bento_complete_envvar)
@machine_option
@click.pass_context
@inject
def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None:
@@ -32,5 +24,5 @@ def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore
if machine: return bentomodel.path
# copy and paste this into a new shell
if psutil.WINDOWS: subprocess.check_call([shutil.which("dir") or "dir"], cwd=bentomodel.path)
else: subprocess.check_call([shutil.which("tree") or "tree"], cwd=bentomodel.path)
else: subprocess.check_call([shutil.which("ls") or "ls", "-Rrthla"], cwd=bentomodel.path)
ctx.exit(0)

View File

@@ -1,24 +1,18 @@
from __future__ import annotations
import typing as t
import click
import typing as t, click, bentoml
from simple_di import Provide, inject
import bentoml
from bentoml._internal.bento.bento import BentoInfo
from bentoml._internal.bento.build_config import DockerOptions
from bentoml._internal.configuration.containers import BentoMLContainer
from bentoml._internal.container.generate import generate_containerfile
from openllm.cli import termui
from openllm.cli._factory import bento_complete_envvar
from openllm_core.utils import bentoml_cattr
from .. import termui
from ...utils import bentoml_cattr
if t.TYPE_CHECKING:
from bentoml._internal.bento import BentoStore
if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore
@click.command("get_containerfile", context_settings=termui.CONTEXT_SETTINGS, help="Return Containerfile of any given Bento.")
@click.argument("bento", type=str)
@click.argument("bento", type=str, shell_complete=bento_complete_envvar)
@click.pass_context
@inject
def cli(ctx: click.Context, bento: str, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str:

View File

@@ -1,25 +1,18 @@
from __future__ import annotations
import typing as t
import click
import inflection
import orjson
import typing as t, click, inflection, orjson, openllm
from bentoml_cli.utils import opt_callback
import openllm
from .. import termui
from ..._prompt import process_prompt
from openllm.cli import termui
from openllm.cli._factory import model_complete_envvar, output_option, machine_option
from openllm_core._prompt import process_prompt
LiteralOutput = t.Literal["json", "pretty", "porcelain"]
@click.command("get_prompt", context_settings=termui.CONTEXT_SETTINGS)
@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]))
@click.argument("model_name", type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING.keys()]), shell_complete=model_complete_envvar)
@click.argument("prompt", type=click.STRING)
@click.option("-o", "--output", "output", type=click.Choice(["json", "pretty", "porcelain"]), default="pretty", help="Showing output type.", show_default=True, envvar="OPENLLM_OUTPUT", show_envvar=True)
@output_option
@click.option("--format", type=click.STRING, default=None)
@click.option("--machine", is_flag=True, default=False, hidden=True)
@machine_option
@click.option("--opt", help="Define additional prompt variables. (format: ``--opt system_prompt='You are a useful assistant'``)", required=False, multiple=True, callback=opt_callback, metavar="ARG=VALUE[,ARG=VALUE]")
@click.pass_context
def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], **_: t.Any) -> str | None:

View File

@@ -1,16 +1,8 @@
from __future__ import annotations
import click
import inflection
import orjson
import bentoml
import openllm
import click, inflection, orjson, bentoml, openllm
from bentoml._internal.utils import human_readable_size
from .. import termui
from .._factory import LiteralOutput, output_option
from openllm.cli import termui
from openllm.cli._factory import LiteralOutput, output_option
@click.command("list_bentos", context_settings=termui.CONTEXT_SETTINGS)
@output_option(default_value="json")

View File

@@ -1,14 +1,13 @@
from __future__ import annotations
import typing as t, bentoml, openllm, orjson, inflection ,click
from bentoml._internal.utils import human_readable_size
from openllm.cli import termui
from openllm.cli._factory import LiteralOutput, model_name_argument, output_option
from bentoml._internal.utils import human_readable_size
from openllm.cli._factory import LiteralOutput, model_name_argument, output_option, model_complete_envvar
if t.TYPE_CHECKING: from openllm._typing_compat import DictStrAny
if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny
@click.command("list_models", context_settings=termui.CONTEXT_SETTINGS)
@model_name_argument(required=False)
@model_name_argument(required=False, shell_complete=model_complete_envvar)
@output_option(default_value="json")
def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny:
"""This is equivalent to openllm models --show-available less the nice table."""

View File

@@ -1,13 +1,12 @@
from __future__ import annotations
import importlib.machinery, logging, os, pkgutil, subprocess, sys, tempfile, typing as t
import click, yaml
import importlib.machinery, logging, os, pkgutil, subprocess, sys, tempfile, typing as t, click, yaml
from openllm.cli import termui
from openllm import playground
from openllm.utils import is_jupyter_available, is_jupytext_available, is_notebook_available
from openllm_core.utils import is_jupyter_available, is_jupytext_available, is_notebook_available
if t.TYPE_CHECKING:
import jupytext, nbformat
from openllm._typing_compat import DictStrAny
from openllm_core._typing_compat import DictStrAny
logger = logging.getLogger(__name__)
@@ -38,7 +37,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None:
\b
> [!NOTE]
> This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"'
""" # noqa: D301
"""
if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available():
raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'")
metadata = load_notebook_metadata()

View File

@@ -1,6 +1,6 @@
from __future__ import annotations
import os, typing as t, click, inflection, openllm
if t.TYPE_CHECKING: from openllm._typing_compat import DictStrAny
if t.TYPE_CHECKING: from openllm_core._typing_compat import DictStrAny
def echo(text: t.Any, fg: str = "green", _with_style: bool = True, **attrs: t.Any) -> None:
attrs["fg"] = fg if not openllm.utils.get_debug_mode() else None

View File

@@ -0,0 +1,17 @@
"""OpenLLM Python client.
```python
client = openllm.client.HTTPClient("http://localhost:8080")
client.query("What is the difference between gather and scatter?")
```
If the server supports embeddings, use them via `client.embed`:
```python
client.embed("What is the difference between gather and scatter?")
```
"""
from __future__ import annotations
import openllm_client, typing as t
if t.TYPE_CHECKING: from openllm_client import AsyncHTTPClient as AsyncHTTPClient, BaseAsyncClient as BaseAsyncClient, BaseClient as BaseClient, HTTPClient as HTTPClient, GrpcClient as GrpcClient, AsyncGrpcClient as AsyncGrpcClient
def __dir__() -> t.Sequence[str]:
  """Module-level ``dir()`` hook: report the names exposed by ``openllm_client``, sorted.

  Returns:
    A sorted list of every attribute name ``dir(openllm_client)`` yields.
  """
  exported_names = dir(openllm_client)
  return sorted(exported_names)
def __getattr__(it: str) -> t.Any:
  """Module-level attribute hook: forward the lookup of ``it`` to ``openllm_client``.

  Args:
    it: Name of the attribute being looked up on this module.

  Returns:
    Whatever ``openllm_client`` exposes under that name.

  Raises:
    AttributeError: Propagated from ``getattr`` when ``openllm_client`` has no such attribute.
  """
  resolved = getattr(openllm_client, it)
  return resolved

View File

@@ -1,22 +0,0 @@
"""OpenLLM Python client.
```python
client = openllm.client.HTTPClient("http://localhost:8080")
client.query("What is the difference between gather and scatter?")
```
If the server supports embeddings, use them via `client.embed`:
```python
client.embed("What is the difference between gather and scatter?")
```
"""
from __future__ import annotations
from openllm.client.runtimes import (
AsyncGrpcClient as AsyncGrpcClient,
AsyncHTTPClient as AsyncHTTPClient,
BaseAsyncClient as BaseAsyncClient,
BaseClient as BaseClient,
GrpcClient as GrpcClient,
HTTPClient as HTTPClient,
)

View File

@@ -1,15 +0,0 @@
"""Client that supports REST/gRPC protocol to interact with a LLMServer."""
from __future__ import annotations
from openllm.client.runtimes.base import (
BaseAsyncClient as BaseAsyncClient,
BaseClient as BaseClient,
)
from openllm.client.runtimes.grpc import (
AsyncGrpcClient as AsyncGrpcClient,
GrpcClient as GrpcClient,
)
from openllm.client.runtimes.http import (
AsyncHTTPClient as AsyncHTTPClient,
HTTPClient as HTTPClient,
)

View File

@@ -1,238 +0,0 @@
# mypy: disable-error-code="name-defined"
from __future__ import annotations
import asyncio, logging, typing as t
import bentoml, bentoml.client, openllm, httpx
from abc import abstractmethod
from http import HTTPStatus
from urllib.parse import urljoin
from openllm._typing_compat import overload, LiteralString
T = t.TypeVar("T")
T_co = t.TypeVar("T_co", covariant=True)
if t.TYPE_CHECKING:
import transformers
from openllm._typing_compat import DictStrAny, LiteralRuntime
else: transformers = openllm.utils.LazyLoader("transformers", globals(), "transformers")
class AnnotatedClient(t.Protocol[T_co]):
  """Structural type describing the transport client this module drives.

  ``ClientMeta._cached`` casts the class looked up on ``bentoml.client`` to this
  protocol, so the members below are the ones the wrappers in this file rely on.
  ``T_co`` is the payload type returned by the versioned endpoints.
  """
  # Declared attributes of the transport client (types only; no values assigned here).
  server_url: str
  _svc: bentoml.Service
  endpoints: list[str]

  def health(self, *args: t.Any, **attrs: t.Any) -> t.Any:
    """Health probe of the remote service."""
    ...

  async def async_health(self) -> t.Any:
    """Awaitable variant of the health probe."""
    ...

  def generate_v1(self, qa: openllm.GenerationInput) -> T_co:
    """Versioned generation endpoint."""
    ...

  def metadata_v1(self) -> T_co:
    """Versioned metadata endpoint."""
    ...

  def embeddings_v1(self) -> t.Sequence[float]:
    """Versioned embeddings endpoint."""
    ...

  def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co:
    """Invoke the endpoint called ``name`` synchronously."""
    ...

  async def async_call(self, name: str, *args: t.Any, **attrs: t.Any) -> T_co:
    """Invoke the endpoint called ``name`` asynchronously."""
    ...

  @staticmethod
  def wait_until_server_ready(host: str, port: int, timeout: float = 30, **kwargs: t.Any) -> None:
    """Wait for the server at ``host``:``port``, bounded by ``timeout``."""
    ...

  @staticmethod
  def from_url(server_url: str) -> AnnotatedClient[t.Any]:
    """Build a client bound to ``server_url``."""
    ...
logger = logging.getLogger(__name__)
def in_async_context() -> bool:
  """Tell whether the caller is currently running inside an asyncio event loop.

  Returns:
    ``True`` when ``asyncio.get_running_loop()`` succeeds, ``False`` when it
    raises ``RuntimeError`` (i.e. no loop is running in this thread).
  """
  try:
    asyncio.get_running_loop()
  except RuntimeError:
    return False
  else:
    return True
class ClientMeta(t.Generic[T]):
  """State and lazy accessors shared by the sync and async client bases.

  Subclasses pick their transport through the class keywords handled in
  ``__init_subclass__`` (``client_type`` and ``api_version``) and are expected
  to set ``_host`` / ``_port`` themselves; this class only stores the address
  and timeout. ``T`` is the payload type returned by the server endpoints.
  """
  _api_version: str
  _client_type: t.Literal["GrpcClient", "HTTPClient"]
  _host: str
  _port: str

  # Per-instance caches, filled on first use by `_cached`, `_hf_agent` and `llm`.
  __client__: AnnotatedClient[T] | None = None
  __agent__: transformers.HfAgent | None = None
  __llm__: openllm.LLM[t.Any, t.Any] | None = None

  def __init__(self, address: str, timeout: int = 30):
    """Remember the server ``address`` and the readiness ``timeout``."""
    self._address = address
    self._timeout = timeout

  def __init_subclass__(cls, *, client_type: t.Literal["http", "grpc"] = "http", api_version: str = "v1"):
    """Translate the subclass keywords into the ``bentoml.client`` class name and API version."""
    cls._client_type = "HTTPClient" if client_type == "http" else "GrpcClient"
    cls._api_version = api_version

  @property
  def _hf_agent(self) -> transformers.HfAgent:
    """Lazily built ``transformers.HfAgent`` pointed at the server's ``/hf/agent`` route.

    Raises:
      openllm.exceptions.OpenLLMException: ``supports_hf_agent`` is falsy.
      RuntimeError: The installed ``transformers`` has no agent support.
    """
    if not self.supports_hf_agent:
      raise openllm.exceptions.OpenLLMException(f"{self.model_name} ({self.framework}) does not support running HF agent.")
    if self.__agent__ is not None:
      return self.__agent__
    if not openllm.utils.is_transformers_supports_agent():
      raise RuntimeError("Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
    self.__agent__ = transformers.HfAgent(urljoin(self._address, "/hf/agent"))
    return self.__agent__

  @property
  def _metadata(self) -> T:
    """Fetch server metadata: a direct ``httpx.post`` inside a running loop, ``self.call`` otherwise."""
    if in_async_context():
      return httpx.post(urljoin(self._address, f"/{self._api_version}/metadata")).json()
    return self.call("metadata")

  @property
  @abstractmethod
  def model_name(self) -> str:
    """Model name; supplied by the transport subclass."""
    raise NotImplementedError

  @property
  @abstractmethod
  def framework(self) -> LiteralRuntime:
    """Runtime framework identifier; supplied by the transport subclass."""
    raise NotImplementedError

  @property
  @abstractmethod
  def timeout(self) -> int:
    """Timeout value; supplied by the transport subclass."""
    raise NotImplementedError

  @property
  @abstractmethod
  def model_id(self) -> str:
    """Model id; supplied by the transport subclass."""
    raise NotImplementedError

  @property
  @abstractmethod
  def configuration(self) -> dict[str, t.Any]:
    """Configuration mapping; supplied by the transport subclass."""
    raise NotImplementedError

  @property
  @abstractmethod
  def supports_embeddings(self) -> bool:
    """Embeddings capability flag; supplied by the transport subclass."""
    raise NotImplementedError

  @property
  @abstractmethod
  def supports_hf_agent(self) -> bool:
    """HF agent capability flag; supplied by the transport subclass."""
    raise NotImplementedError

  @abstractmethod
  def postprocess(self, result: t.Any) -> openllm.GenerationOutput:
    """Convert a transport-specific ``result`` into ``openllm.GenerationOutput``."""
    ...

  @abstractmethod
  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
    """Run a task through the HF agent; implemented by the sync/async bases."""
    ...

  @property
  def config(self) -> openllm.LLMConfig:
    """Configuration object of the lazily resolved ``llm``."""
    return self.llm.config

  @property
  def llm(self) -> openllm.LLM[t.Any, t.Any]:
    """Client-side LLM object resolved from ``framework`` and ``model_name``, built once and cached."""
    # XXX: if the server runs vllm or any framework that is not available from the user client, client will fail.
    if self.__llm__ is None:
      auto_class = openllm.infer_auto_class(self.framework)
      self.__llm__ = auto_class.for_model(self.model_name)
    return self.__llm__

  def call(self, name: str, *args: t.Any, **attrs: t.Any) -> T:
    """Synchronously invoke endpoint ``{name}_{api_version}`` on the cached transport client."""
    client = self._cached
    return client.call(f"{name}_{self._api_version}", *args, **attrs)

  async def acall(self, name: str, *args: t.Any, **attrs: t.Any) -> T:
    """Asynchronously invoke endpoint ``{name}_{api_version}`` on the cached transport client."""
    client = self._cached
    return await client.async_call(f"{name}_{self._api_version}", *args, **attrs)

  @property
  def _cached(self) -> AnnotatedClient[T]:
    """Transport client from ``bentoml.client``, created after the server is ready and then reused."""
    client_class = t.cast(AnnotatedClient[T], getattr(bentoml.client, self._client_type))
    if self.__client__ is not None:
      return self.__client__
    client_class.wait_until_server_ready(self._host, int(self._port), timeout=self._timeout)
    self.__client__ = client_class.from_url(self._address)
    return self.__client__
class BaseClient(ClientMeta[T]):
  """Synchronous client base built on ``ClientMeta``.

  ``health``, ``chat`` and ``embed`` raise ``NotImplementedError`` here and are
  meant to be overridden by transport subclasses; ``query``/``predict`` and the
  agent helpers are implemented in terms of the ``ClientMeta`` accessors.
  """

  def health(self) -> t.Any:
    """Health probe; not implemented at this level."""
    raise NotImplementedError

  def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
    """Chat endpoint; not implemented at this level."""
    raise NotImplementedError

  def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
    """Embeddings endpoint; not implemented at this level."""
    raise NotImplementedError

  @overload
  def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
  @overload
  def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
  @overload
  def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
  def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
    """Send ``prompt`` to the generate endpoint and return the result in the requested shape.

    Args:
      prompt: The user prompt.
      return_response: ``"attrs"`` returns the ``postprocess`` result as-is,
        ``"raw"`` returns it unstructured through ``bentoml_cattr``, anything
        else returns ``self.llm.postprocess_generate(...)``.
      **attrs: Forwarded to ``self.llm.sanitize_parameters``. The deprecated
        ``return_raw_response`` / ``return_attrs`` flags and
        ``use_default_prompt_template`` are popped first.

    Returns:
      The generation result in the shape selected by ``return_response``.
    """
    # Deprecated boolean flags still override `return_response` when set to True.
    return_raw_response = attrs.pop("return_raw_response", None)
    if return_raw_response is not None:
      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
      if return_raw_response is True: return_response = "raw"
    return_attrs = attrs.pop("return_attrs", None)
    if return_attrs is not None:
      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
      if return_attrs is True: return_response = "attrs"

    use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
    prompt, generate_kwargs, postprocess_kwargs = self.llm.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
    inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))

    # Inside a running event loop the request goes through a plain httpx POST rather than `self.call`.
    if in_async_context():
      result = httpx.post(urljoin(self._address, f"/{self._api_version}/generate"), json=inputs.model_dump(), timeout=self.timeout).json()
    else:
      result = self.call("generate", inputs.model_dump())

    r = self.postprocess(result)
    if return_response == "attrs": return r
    elif return_response == "raw": return openllm.utils.bentoml_cattr.unstructure(r)
    else: return self.llm.postprocess_generate(prompt, r.responses, **postprocess_kwargs)

  # NOTE: Scikit interface
  @overload
  def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
  @overload
  def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
  @overload
  def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
  def predict(self, prompt: str, **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
    """Alias of ``query`` with the same keyword arguments."""
    return t.cast(t.Union[openllm.GenerationOutput, DictStrAny, str], self.query(prompt, **attrs))

  def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
    """Run ``task`` through an agent; only ``agent_type="hf"`` is supported.

    Raises:
      RuntimeError: ``agent_type`` is anything other than ``"hf"``.
    """
    if agent_type == "hf": return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")

  def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
    """Run a task with the HF agent, logging (not raising) any failure of the agent call.

    The task may be given either as the single positional argument or as the
    ``task`` keyword; the keyword wins when both are present.

    Returns:
      The agent's result, or ``None`` when the agent call raised (the error is logged).

    Raises:
      ValueError: More than one positional argument, or no task at all.
    """
    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
    # Resolve the task without touching args[0] unless it exists: the previous
    # `kwargs.pop("task", args[0])` evaluated args[0] eagerly and raised IndexError
    # whenever the task was passed by keyword only.
    if "task" in kwargs: task = kwargs.pop("task")
    elif args: task = args[0]
    else: raise ValueError("'task' is required, either as the single positional argument or as the 'task' keyword.")
    return_code = kwargs.pop("return_code", False)
    remote = kwargs.pop("remote", False)
    try:
      return self._hf_agent.run(task, return_code=return_code, remote=remote, **kwargs)
    except Exception as err:
      # Deliberate best-effort: report the failure and fall through to return None.
      logger.error("Exception caught while sending instruction to HF agent: %s", err, exc_info=err)
      logger.info("Tip: LLMServer at '%s' might not support 'generate_one'.", self._address)
      return None
class BaseAsyncClient(ClientMeta[T]):
  """Asynchronous client base built on ``ClientMeta``.

  ``health``, ``chat`` and ``embed`` raise ``NotImplementedError`` here and are
  meant to be overridden by transport subclasses; ``query``/``predict`` and the
  agent helpers are implemented in terms of the ``ClientMeta`` accessors.
  """

  async def health(self) -> t.Any:
    """Health probe; not implemented at this level."""
    raise NotImplementedError

  async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
    """Chat endpoint; not implemented at this level."""
    raise NotImplementedError

  async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
    """Embeddings endpoint; not implemented at this level."""
    raise NotImplementedError

  @overload
  async def query(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
  @overload
  async def query(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
  @overload
  async def query(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
  async def query(self, prompt: str, return_response: t.Literal["attrs", "raw", "processed"] = "processed", **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
    """Send ``prompt`` to the generate endpoint via ``acall`` and return the result in the requested shape.

    Args:
      prompt: The user prompt.
      return_response: ``"attrs"`` returns the ``postprocess`` result as-is,
        ``"raw"`` returns it unstructured through ``bentoml_cattr``, anything
        else returns ``self.llm.postprocess_generate(...)``.
      **attrs: Forwarded to ``self.llm.sanitize_parameters``. The deprecated
        ``return_raw_response`` / ``return_attrs`` flags and
        ``use_default_prompt_template`` are popped first.

    Returns:
      The generation result in the shape selected by ``return_response``.
    """
    # Deprecated boolean flags still override `return_response` when set to True.
    return_raw_response = attrs.pop("return_raw_response", None)
    if return_raw_response is not None:
      logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
      if return_raw_response is True: return_response = "raw"
    return_attrs = attrs.pop("return_attrs", None)
    if return_attrs is not None:
      logger.warning("'return_attrs' is now deprecated. Please use 'return_response=\"attrs\"' instead.")
      if return_attrs is True: return_response = "attrs"

    use_default_prompt_template = attrs.pop("use_default_prompt_template", False)
    prompt, generate_kwargs, postprocess_kwargs = self.llm.sanitize_parameters(prompt, use_default_prompt_template=use_default_prompt_template, **attrs)
    inputs = openllm.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs))
    res = await self.acall("generate", inputs.model_dump())

    r = self.postprocess(res)
    if return_response == "attrs": return r
    elif return_response == "raw": return openllm.utils.bentoml_cattr.unstructure(r)
    else: return self.llm.postprocess_generate(prompt, r.responses, **postprocess_kwargs)

  # NOTE: Scikit interface
  @overload
  async def predict(self, prompt: str, *, return_response: t.Literal["processed"], **attrs: t.Any) -> str: ...
  @overload
  async def predict(self, prompt: str, *, return_response: t.Literal["raw"], **attrs: t.Any) -> DictStrAny: ...
  @overload
  async def predict(self, prompt: str, *, return_response: t.Literal["attrs"], **attrs: t.Any) -> openllm.GenerationOutput: ...
  async def predict(self, prompt: str, **attrs: t.Any) -> openllm.GenerationOutput | DictStrAny | str:
    """Alias of ``query`` with the same keyword arguments."""
    return t.cast(t.Union[openllm.GenerationOutput, DictStrAny, str], await self.query(prompt, **attrs))

  async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = "hf", **attrs: t.Any) -> t.Any:
    """Async version of agent.run.

    Raises:
      RuntimeError: ``agent_type`` is anything other than ``"hf"``.
    """
    if agent_type == "hf": return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
    else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")

  async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
    """Post the agent prompt with ``httpx.AsyncClient`` and evaluate or return the generated code.

    The task may be given either as the single positional argument or as the
    ``task`` keyword; the keyword wins when both are present.

    Returns:
      The evaluated result when ``return_code`` is falsy, otherwise the tool
      creation code followed by the generated code as one string.

    Raises:
      RuntimeError: The installed ``transformers`` has no agent support.
      ValueError: More than one positional argument, no task at all, or a
        non-OK HTTP status from the agent endpoint.
    """
    if not openllm.utils.is_transformers_supports_agent(): raise RuntimeError("This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0")
    if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
    # Resolve the task without touching args[0] unless it exists: the previous
    # `kwargs.pop("task", args[0])` evaluated args[0] eagerly and raised IndexError
    # whenever the task was passed by keyword only.
    if "task" in kwargs: task = kwargs.pop("task")
    elif args: task = args[0]
    else: raise ValueError("'task' is required, either as the single positional argument or as the 'task' keyword.")
    return_code = kwargs.pop("return_code", False)
    remote = kwargs.pop("remote", False)

    from transformers.tools.agents import clean_code_for_run, get_tool_creation_code, resolve_tools
    from transformers.tools.python_interpreter import evaluate

    _hf_agent = self._hf_agent
    prompt = t.cast(str, _hf_agent.format_prompt(task))
    stop = ["Task:"]
    async with httpx.AsyncClient(timeout=httpx.Timeout(self.timeout)) as client:
      response = await client.post(_hf_agent.url_endpoint, json={"inputs": prompt, "parameters": {"max_new_tokens": 200, "return_full_text": False, "stop": stop},},)
      if response.status_code != HTTPStatus.OK:
        raise ValueError(f"Error {response.status_code}: {response.json()}")

    result = response.json()[0]["generated_text"]
    # Inference API returns the stop sequence
    for stop_seq in stop:
      if result.endswith(stop_seq):
        result = result[:-len(stop_seq)]
        break

    # the below have the same logic as agent.run API
    explanation, code = clean_code_for_run(result)
    _hf_agent.log(f"==Explanation from the agent==\n{explanation}")
    _hf_agent.log(f"\n\n==Code generated by the agent==\n{code}")
    if not return_code:
      _hf_agent.log("\n\n==Result==")
      _hf_agent.cached_tools = resolve_tools(code, _hf_agent.toolbox, remote=remote, cached_tools=_hf_agent.cached_tools)
      return evaluate(code, _hf_agent.cached_tools, state=kwargs.copy())
    else:
      tool_code = get_tool_creation_code(code, _hf_agent.toolbox, remote=remote)
      return f"{tool_code}\n{code}"

View File

@@ -1,93 +0,0 @@
from __future__ import annotations
import asyncio, logging, typing as t
import orjson, openllm
from openllm._typing_compat import LiteralRuntime
from .base import BaseAsyncClient, BaseClient
if t.TYPE_CHECKING:
from grpc_health.v1 import health_pb2
from bentoml.grpc.v1.service_pb2 import Response
logger = logging.getLogger(__name__)
class GrpcClient(BaseClient["Response"], client_type="grpc"):
  """Synchronous gRPC client; metadata is read from the protobuf struct returned by the server."""

  def __init__(self, address: str, timeout: int = 30):
    """Split ``address`` on ``":"`` into exactly host and port, then defer to the base initialiser."""
    host, port = address.split(":")
    self._host, self._port = host, port
    super().__init__(address, timeout)

  def health(self) -> health_pb2.HealthCheckResponse:
    """Run the transport client's health coroutine for ``bentoml.grpc.v1.BentoService`` to completion."""
    return asyncio.run(self._cached.health("bentoml.grpc.v1.BentoService"))

  @property
  def model_name(self) -> str:
    """``model_name`` string field of the server metadata."""
    try:
      fields = self._metadata.json.struct_value.fields
      return fields["model_name"].string_value
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def framework(self) -> LiteralRuntime:
    """``framework`` string field of the server metadata; must be one of pt/flax/tf/vllm."""
    try:
      fields = self._metadata.json.struct_value.fields
      value = t.cast(LiteralRuntime, fields["framework"].string_value)
      if value in ("pt", "flax", "tf", "vllm"):
        return value
      # Unknown framework values share the malformed-endpoint error path below.
      raise KeyError
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def timeout(self) -> int:
    """``timeout`` number field of the server metadata, converted to ``int``."""
    try:
      fields = self._metadata.json.struct_value.fields
      return int(fields["timeout"].number_value)
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def model_id(self) -> str:
    """``model_id`` string field of the server metadata."""
    try:
      fields = self._metadata.json.struct_value.fields
      return fields["model_id"].string_value
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def configuration(self) -> dict[str, t.Any]:
    """``configuration`` string field of the server metadata, decoded with ``orjson``."""
    try:
      fields = self._metadata.json.struct_value.fields
      return orjson.loads(fields["configuration"].string_value)
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def supports_embeddings(self) -> bool:
    """``supports_embeddings`` bool field of the server metadata."""
    try:
      fields = self._metadata.json.struct_value.fields
      return fields["supports_embeddings"].bool_value
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def supports_hf_agent(self) -> bool:
    """``supports_hf_agent`` bool field of the server metadata."""
    try:
      fields = self._metadata.json.struct_value.fields
      return fields["supports_hf_agent"].bool_value
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
    """Build a ``GenerationOutput`` from a plain dict or from the protobuf response's ``json`` member."""
    from google.protobuf.json_format import MessageToDict
    payload = result if isinstance(result, dict) else MessageToDict(result.json, preserving_proto_field_name=True)
    return openllm.GenerationOutput(**payload)
class AsyncGrpcClient(BaseAsyncClient["Response"], client_type="grpc"):
  """Asynchronous gRPC client; metadata is read from the protobuf struct returned by the server."""

  def __init__(self, address: str, timeout: int = 30):
    """Split ``address`` on ``":"`` into exactly host and port, then defer to the base initialiser."""
    host, port = address.split(":")
    self._host, self._port = host, port
    super().__init__(address, timeout)

  async def health(self) -> health_pb2.HealthCheckResponse:
    """Await the transport client's health check for ``bentoml.grpc.v1.BentoService``."""
    return await self._cached.health("bentoml.grpc.v1.BentoService")

  @property
  def model_name(self) -> str:
    """``model_name`` string field of the server metadata."""
    try:
      fields = self._metadata.json.struct_value.fields
      return fields["model_name"].string_value
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def framework(self) -> LiteralRuntime:
    """``framework`` string field of the server metadata; must be one of pt/flax/tf/vllm."""
    try:
      fields = self._metadata.json.struct_value.fields
      value = t.cast(LiteralRuntime, fields["framework"].string_value)
      if value in ("pt", "flax", "tf", "vllm"):
        return value
      # Unknown framework values share the malformed-endpoint error path below.
      raise KeyError
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def timeout(self) -> int:
    """``timeout`` number field of the server metadata, converted to ``int``."""
    try:
      fields = self._metadata.json.struct_value.fields
      return int(fields["timeout"].number_value)
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def model_id(self) -> str:
    """``model_id`` string field of the server metadata."""
    try:
      fields = self._metadata.json.struct_value.fields
      return fields["model_id"].string_value
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def configuration(self) -> dict[str, t.Any]:
    """``configuration`` string field of the server metadata, decoded with ``orjson``."""
    try:
      fields = self._metadata.json.struct_value.fields
      return orjson.loads(fields["configuration"].string_value)
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def supports_embeddings(self) -> bool:
    """``supports_embeddings`` bool field of the server metadata."""
    try:
      fields = self._metadata.json.struct_value.fields
      return fields["supports_embeddings"].bool_value
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def supports_hf_agent(self) -> bool:
    """``supports_hf_agent`` bool field of the server metadata."""
    try:
      fields = self._metadata.json.struct_value.fields
      return fields["supports_hf_agent"].bool_value
    except KeyError:
      raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  def postprocess(self, result: Response | dict[str, t.Any]) -> openllm.GenerationOutput:
    """Build a ``GenerationOutput`` from a plain dict or from the protobuf response's ``json`` member."""
    from google.protobuf.json_format import MessageToDict
    payload = result if isinstance(result, dict) else MessageToDict(result.json, preserving_proto_field_name=True)
    return openllm.GenerationOutput(**payload)

View File

@@ -1,96 +0,0 @@
from __future__ import annotations
import logging, typing as t
from urllib.parse import urljoin, urlparse
import httpx, orjson, openllm
from .base import BaseAsyncClient, BaseClient, in_async_context
from openllm._typing_compat import DictStrAny, LiteralRuntime
logger = logging.getLogger(__name__)
def process_address(self: AsyncHTTPClient | HTTPClient, address: str) -> None:
  """Derive ``_host`` and ``_port`` on ``self`` from a server address.

  An address without a scheme is treated as ``http://``. When no port is given,
  ``"80"`` is used for ``http`` and ``"443"`` for every other scheme.

  Host and port come from ``urlparse``'s ``hostname`` / ``port`` accessors instead
  of splitting the netloc on ``":"``, so bracketed IPv6 literals
  (``http://[::1]:3000``) and ``user:password@`` prefixes are handled correctly.
  Note that ``hostname`` is lower-cased and has IPv6 brackets removed.

  Args:
    self: The client instance to populate; ``_host`` and ``_port`` (both ``str``) are set on it.
    address: Server address, with or without a scheme.

  Raises:
    ValueError: The port component is not a valid port number (raised by ``urlparse``'s ``port``).
  """
  address = address if "://" in address else "http://" + address
  parsed = urlparse(address)
  self._host = parsed.hostname or ""
  port = parsed.port
  if port is None:
    self._port = "80" if parsed.scheme == "http" else "443"
  else:
    # Kept as a string: callers convert with int() when they need a number.
    self._port = str(port)
class HTTPClient(BaseClient[DictStrAny]):
  """Synchronous HTTP client; server metadata is consumed as a plain dict."""

  def __init__(self, address: str, timeout: int = 30):
    """Derive host/port from ``address`` via ``process_address``, then defer to the base initialiser."""
    process_address(self, address)
    super().__init__(address, timeout)

  def health(self) -> t.Any:
    """Delegate the health probe to the cached transport client."""
    return self._cached.health()

  def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
    """Request embeddings for one prompt or a sequence of prompts.

    A single string is wrapped into a one-element list. Inside a running event
    loop the request is a direct ``httpx.post``; otherwise it goes through ``self.call``.
    """
    if isinstance(prompt, str): prompt = [prompt]
    if in_async_context():
      result = httpx.post(urljoin(self._address, f"/{self._api_version}/embeddings"), json=list(prompt), timeout=self.timeout).json()
    else:
      result = self.call("embeddings", list(prompt))
    return openllm.EmbeddingsOutput(**result)

  @property
  def model_name(self) -> str:
    """``model_name`` entry of the server metadata."""
    try: return self._metadata["model_name"]
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def model_id(self) -> str:
    """``model_id`` entry of the server metadata."""
    # Fixed: this previously read "model_name", so model_id silently returned the model name.
    try: return self._metadata["model_id"]
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def framework(self) -> LiteralRuntime:
    """``framework`` entry of the server metadata."""
    try: return self._metadata["framework"]
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def timeout(self) -> int:
    """``timeout`` entry of the server metadata."""
    try: return self._metadata["timeout"]
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def configuration(self) -> dict[str, t.Any]:
    """``configuration`` entry of the server metadata, decoded with ``orjson``."""
    try: return orjson.loads(self._metadata["configuration"])
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def supports_embeddings(self) -> bool:
    """``supports_embeddings`` entry of the server metadata; ``False`` when absent."""
    # `.get` itself never raises KeyError; the handler only translates one raised while fetching metadata.
    try: return self._metadata.get("supports_embeddings", False)
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def supports_hf_agent(self) -> bool:
    """``supports_hf_agent`` entry of the server metadata; ``False`` when absent."""
    # `.get` itself never raises KeyError; the handler only translates one raised while fetching metadata.
    try: return self._metadata.get("supports_hf_agent", False)
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput:
    """Build a ``GenerationOutput`` from the response dict."""
    return openllm.GenerationOutput(**result)
class AsyncHTTPClient(BaseAsyncClient[DictStrAny]):
  """Asynchronous HTTP client; server metadata is consumed as a plain dict."""

  def __init__(self, address: str, timeout: int = 30):
    """Derive host/port from ``address`` via ``process_address``, then defer to the base initialiser."""
    process_address(self, address)
    super().__init__(address, timeout)

  async def health(self) -> t.Any:
    """Await the cached transport client's async health probe."""
    return await self._cached.async_health()

  async def embed(self, prompt: t.Sequence[str] | str) -> openllm.EmbeddingsOutput:
    """Request embeddings for one prompt or a sequence of prompts via ``acall``.

    A single string is wrapped into a one-element list first.
    """
    if isinstance(prompt, str): prompt = [prompt]
    res = await self.acall("embeddings", list(prompt))
    return openllm.EmbeddingsOutput(**res)

  @property
  def model_name(self) -> str:
    """``model_name`` entry of the server metadata."""
    try: return self._metadata["model_name"]
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def model_id(self) -> str:
    """``model_id`` entry of the server metadata."""
    # Fixed: this previously read "model_name", so model_id silently returned the model name.
    try: return self._metadata["model_id"]
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def framework(self) -> LiteralRuntime:
    """``framework`` entry of the server metadata."""
    try: return self._metadata["framework"]
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def timeout(self) -> int:
    """``timeout`` entry of the server metadata."""
    try: return self._metadata["timeout"]
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def configuration(self) -> dict[str, t.Any]:
    """``configuration`` entry of the server metadata, decoded with ``orjson``."""
    try: return orjson.loads(self._metadata["configuration"])
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def supports_embeddings(self) -> bool:
    """``supports_embeddings`` entry of the server metadata; ``False`` when absent."""
    # `.get` itself never raises KeyError; the handler only translates one raised while fetching metadata.
    try: return self._metadata.get("supports_embeddings", False)
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  @property
  def supports_hf_agent(self) -> bool:
    """``supports_hf_agent`` entry of the server metadata; ``False`` when absent."""
    # `.get` itself never raises KeyError; the handler only translates one raised while fetching metadata.
    try: return self._metadata.get("supports_hf_agent", False)
    except KeyError: raise RuntimeError("Malformed service endpoint. (Possible malicious)") from None

  def postprocess(self, result: dict[str, t.Any]) -> openllm.GenerationOutput:
    """Build a ``GenerationOutput`` from the response dict."""
    return openllm.GenerationOutput(**result)

View File

@@ -1,19 +1,3 @@
"""Base exceptions for OpenLLM. This extends BentoML exceptions."""
from __future__ import annotations
import bentoml
class OpenLLMException(bentoml.exceptions.BentoMLException):
  """Root of the OpenLLM exception hierarchy; a subclass of BentoMLException."""


class GpuNotAvailableError(OpenLLMException):
  """Signals that the system has no GPU available."""


class ValidationError(OpenLLMException):
  """Signals that a validation step failed."""


class ForbiddenAttributeError(OpenLLMException):
  """Signals use of an _internal field."""


class MissingAnnotationAttributeError(OpenLLMException):
  """Signals that a field under openllm.LLMConfig lacks annotations."""


# NOTE(review): derives from BaseException, so a plain `except Exception` will not catch it - confirm this is intended.
class MissingDependencyError(BaseException):
  """Signals that a required dependency is missing."""


# NOTE(review): derives from BaseException, so a plain `except Exception` will not catch it - confirm this is intended.
class Error(BaseException):
  """Generic error to raise in place of a naked raise."""


class FineTuneStrategyNotSupportedError(OpenLLMException):
  """Signals that the given LLM does not support the requested fine-tune strategy."""
from openllm_core.exceptions import OpenLLMException as OpenLLMException, GpuNotAvailableError as GpuNotAvailableError, ValidationError as ValidationError, ForbiddenAttributeError as ForbiddenAttributeError, MissingAnnotationAttributeError as MissingAnnotationAttributeError, MissingDependencyError as MissingDependencyError, Error as Error, FineTuneStrategyNotSupportedError as FineTuneStrategyNotSupportedError

View File

@@ -1,11 +1,11 @@
# This file is generated by tools/update-models-import.py. DO NOT EDIT MANUALLY!
# To update this, run ./tools/update-models-import.py
from __future__ import annotations
import typing as t, os
from openllm.utils import LazyModule
_MODELS: set[str] = {"auto", "baichuan", "chatglm", "dolly_v2", "falcon", "flan_t5", "gpt_neox", "llama", "mpt", "opt", "stablelm", "starcoder"}
if t.TYPE_CHECKING: from . import auto as auto,baichuan as baichuan,chatglm as chatglm,dolly_v2 as dolly_v2,falcon as falcon,flan_t5 as flan_t5,gpt_neox as gpt_neox,llama as llama,mpt as mpt,opt as opt,stablelm as stablelm,starcoder as starcoder
__lazy=LazyModule(__name__, os.path.abspath("__file__"), {k: [] for k in _MODELS})
import typing as t
from openllm_core.utils import LazyModule
_MODELS:set[str]={"auto", "baichuan", "chatglm", "dolly_v2", "falcon", "flan_t5", "gpt_neox", "llama", "mpt", "opt", "stablelm", "starcoder"}
if t.TYPE_CHECKING:from . import auto as auto,baichuan as baichuan,chatglm as chatglm,dolly_v2 as dolly_v2,falcon as falcon,flan_t5 as flan_t5,gpt_neox as gpt_neox,llama as llama,mpt as mpt,opt as opt,stablelm as stablelm,starcoder as starcoder
__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})
__all__=__lazy.__all__
__dir__=__lazy.__dir__
__getattr__=__lazy.__getattr__

View File

@@ -1,15 +1,11 @@
from __future__ import annotations
import typing as t, os
import openllm
from openllm.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available
from openllm_core.utils import LazyModule, is_flax_available, is_tf_available, is_torch_available, is_vllm_available
from openllm_core.config import AutoConfig as AutoConfig, CONFIG_MAPPING as CONFIG_MAPPING, CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES
_import_structure: dict[str, list[str]] = {"configuration_auto": ["AutoConfig", "CONFIG_MAPPING", "CONFIG_MAPPING_NAMES"], "modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]}
_import_structure: dict[str, list[str]] = {"modeling_auto": ["MODEL_MAPPING_NAMES"], "modeling_flax_auto": ["MODEL_FLAX_MAPPING_NAMES"], "modeling_tf_auto": ["MODEL_TF_MAPPING_NAMES"], "modeling_vllm_auto": ["MODEL_VLLM_MAPPING_NAMES"]}
if t.TYPE_CHECKING:
from .configuration_auto import (
CONFIG_MAPPING as CONFIG_MAPPING,
CONFIG_MAPPING_NAMES as CONFIG_MAPPING_NAMES,
AutoConfig as AutoConfig,
)
from .modeling_auto import MODEL_MAPPING_NAMES as MODEL_MAPPING_NAMES
from .modeling_flax_auto import MODEL_FLAX_MAPPING_NAMES as MODEL_FLAX_MAPPING_NAMES
from .modeling_tf_auto import MODEL_TF_MAPPING_NAMES as MODEL_TF_MAPPING_NAMES

View File

@@ -3,10 +3,10 @@ from __future__ import annotations
import importlib, inspect, logging, typing as t
from collections import OrderedDict
import inflection, openllm
from openllm.utils import ReprMixin
from openllm_core.utils import ReprMixin
if t.TYPE_CHECKING:
from openllm._typing_compat import LiteralString, LLMRunner
from openllm_core._typing_compat import LiteralString, LLMRunner
import types
from collections import _odict_items, _odict_keys, _odict_values

View File

@@ -1,8 +1,8 @@
from __future__ import annotations
import typing as t
from collections import OrderedDict
from .configuration_auto import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass, _LazyAutoMapping
from openllm_core.config import CONFIG_MAPPING_NAMES
MODEL_MAPPING_NAMES = OrderedDict([("chatglm", "ChatGLM"), ("dolly_v2", "DollyV2"), ("falcon", "Falcon"), ("flan_t5", "FlanT5"), ("gpt_neox", "GPTNeoX"), ("llama", "Llama"), ("mpt", "MPT"), ("opt", "OPT"), ("stablelm", "StableLM"), ("starcoder", "StarCoder"), ("baichuan", "Baichuan")])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)

View File

@@ -1,8 +1,8 @@
from __future__ import annotations
import typing as t
from collections import OrderedDict
from .configuration_auto import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass, _LazyAutoMapping
from openllm_core.config import CONFIG_MAPPING_NAMES
MODEL_FLAX_MAPPING_NAMES = OrderedDict([("flan_t5", "FlaxFlanT5"), ("opt", "FlaxOPT")])
MODEL_FLAX_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FLAX_MAPPING_NAMES)

View File

@@ -1,8 +1,8 @@
from __future__ import annotations
import typing as t
from collections import OrderedDict
from .configuration_auto import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass, _LazyAutoMapping
from openllm_core.config import CONFIG_MAPPING_NAMES
MODEL_TF_MAPPING_NAMES = OrderedDict([("flan_t5", "TFFlanT5"), ("opt", "TFOPT")])
MODEL_TF_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_TF_MAPPING_NAMES)

View File

@@ -1,8 +1,8 @@
from __future__ import annotations
import typing as t
from collections import OrderedDict
from .configuration_auto import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass, _LazyAutoMapping
from openllm_core.config import CONFIG_MAPPING_NAMES
MODEL_VLLM_MAPPING_NAMES = OrderedDict([("baichuan", "VLLMBaichuan"), ("dolly_v2", "VLLMDollyV2"), ("falcon", "VLLMFalcon"), ("gpt_neox", "VLLMGPTNeoX"), ("mpt", "VLLMMPT"), ("opt", "VLLMOPT"), ("stablelm", "VLLMStableLM"), ("starcoder", "VLLMStarCoder"), ("llama", "VLLMLlama")])
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)

View File

@@ -2,14 +2,13 @@ from __future__ import annotations
import sys, typing as t
from openllm.exceptions import MissingDependencyError
from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available, is_vllm_available
from openllm_core.config.configuration_baichuan import (
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING,
BaichuanConfig as BaichuanConfig,
)
_import_structure: dict[str, list[str]] = {"configuration_baichuan": ["BaichuanConfig", "START_BAICHUAN_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
if t.TYPE_CHECKING:
from .configuration_baichuan import (
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
START_BAICHUAN_COMMAND_DOCSTRING as START_BAICHUAN_COMMAND_DOCSTRING,
BaichuanConfig as BaichuanConfig,
)
_import_structure: dict[str, list[str]] = {}
try:
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
except MissingDependencyError: pass

View File

@@ -1,16 +1,11 @@
from __future__ import annotations
import typing as t, openllm
from openllm._prompt import process_prompt
from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
if t.TYPE_CHECKING: import torch, transformers
else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
if t.TYPE_CHECKING: import transformers
class Baichuan(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerBase"]):
__openllm_internal__ = True
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
import torch
inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): # type: ignore[attr-defined]
outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())

View File

@@ -1,10 +1,7 @@
from __future__ import annotations
import typing as t, openllm
from openllm._prompt import process_prompt
from .configuration_baichuan import DEFAULT_PROMPT_TEMPLATE
if t.TYPE_CHECKING: import vllm, transformers
class VLLMBaichuan(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]):
__openllm_internal__ = True
tokenizer_id = "local"
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_p: float | None = None, temperature: float | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_p": top_p, "temperature": temperature, **attrs}, {}

View File

@@ -2,14 +2,13 @@ from __future__ import annotations
import sys, typing as t
from openllm.exceptions import MissingDependencyError
from openllm.utils import LazyModule, is_cpm_kernels_available, is_torch_available
from openllm_core.config.configuration_chatglm import (
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING,
ChatGLMConfig as ChatGLMConfig,
)
_import_structure: dict[str, list[str]] = {"configuration_chatglm": ["ChatGLMConfig", "START_CHATGLM_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
if t.TYPE_CHECKING:
from .configuration_chatglm import (
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
START_CHATGLM_COMMAND_DOCSTRING as START_CHATGLM_COMMAND_DOCSTRING,
ChatGLMConfig as ChatGLMConfig,
)
_import_structure: dict[str, list[str]] = {}
try:
if not is_torch_available() or not is_cpm_kernels_available(): raise MissingDependencyError
except MissingDependencyError: pass

View File

@@ -1,32 +1,17 @@
from __future__ import annotations
import typing as t, openllm
if t.TYPE_CHECKING: import torch, transformers, torch.nn.functional as F
else: torch, transformers, F = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("F", globals(), "torch.nn.functional")
if t.TYPE_CHECKING: import transformers
class ChatGLM(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTrainedTokenizerFast"]):
__openllm_internal__ = True
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, num_beams: int | None = None, top_p: float | None = None, temperature: float | None = None, chat_history: list[tuple[str, str]] | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
prompt_text = ""
if use_default_prompt_template and chat_history is not None:
for i, (old_query, response) in enumerate(chat_history): prompt_text += f"[Round {i}]\n问:{old_query}\n答:{response}\n"
prompt_text += f"[Round {len(chat_history)}]\n问:{prompt}\n答:"
else: prompt_text = prompt
postprocess_generate_kwargs = {"chat_history": chat_history if chat_history is not None else None}
return prompt_text, {"max_new_tokens": max_new_tokens, "num_beams": num_beams, "top_p": top_p, "temperature": temperature, **attrs}, postprocess_generate_kwargs
def postprocess_generate(self, prompt: str, generation_result: tuple[str, list[tuple[str, str]]], *, chat_history: list[tuple[str, str]] | None = None, **attrs: t.Any) -> str:
generated, history = generation_result
if self.config.retain_history:
if chat_history is None: raise ValueError("'retain_history' is True while there is no history provided.")
chat_history.extend(history)
return generated
def generate(self, prompt: str, **attrs: t.Any) -> tuple[str, list[tuple[str, str]]]:
import torch
with torch.inference_mode():
self.model.eval()
# Only use half precision if the model is not yet quantized
if self.config.use_half_precision: self.model.half()
return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
def embeddings(self, prompts: list[str]) -> openllm.LLMEmbeddings:
import torch, torch.nn.functional as F
embeddings: list[list[float]] = []
num_tokens = 0
for prompt in prompts:

View File

@@ -2,14 +2,13 @@ from __future__ import annotations
import sys, typing as t
from openllm.exceptions import MissingDependencyError
from openllm.utils import LazyModule, is_torch_available, is_vllm_available
from openllm_core.config.configuration_dolly_v2 import (
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING,
DollyV2Config as DollyV2Config,
)
_import_structure: dict[str, list[str]] = {"configuration_dolly_v2": ["DollyV2Config", "START_DOLLY_V2_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
if t.TYPE_CHECKING:
from .configuration_dolly_v2 import (
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
START_DOLLY_V2_COMMAND_DOCSTRING as START_DOLLY_V2_COMMAND_DOCSTRING,
DollyV2Config as DollyV2Config,
)
_import_structure: dict[str, list[str]] = {}
try:
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass

View File

@@ -1,8 +1,7 @@
from __future__ import annotations
import logging, re, typing as t, openllm
from openllm._prompt import process_prompt
from openllm._typing_compat import overload
from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id
from openllm_core._typing_compat import overload
from openllm_core.config.configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE, END_KEY, RESPONSE_KEY, get_special_token_id
if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf
else: torch, transformers, tf = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers"), openllm.utils.LazyLoader("tf", globals(), "tensorflow")
@@ -102,8 +101,6 @@ class DollyV2(openllm.LLM["transformers.Pipeline", "transformers.PreTrainedToken
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, "torch_dtype": torch.bfloat16}, {}
def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), self.tokenizer, _init=True, return_full_text=self.config.return_full_text)
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}
def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal["generated_text"], str]], **_: t.Any) -> str: return generation_result[0]["generated_text"]
def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal["generated_text"], str]]:
llm_config = self.config.model_construct_env(**attrs)
with torch.inference_mode(): return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config())

View File

@@ -1,11 +1,8 @@
from __future__ import annotations
import logging, typing as t, openllm
from openllm._prompt import process_prompt
from .configuration_dolly_v2 import DEFAULT_PROMPT_TEMPLATE
if t.TYPE_CHECKING: import vllm, transformers
logger = logging.getLogger(__name__)
class VLLMDollyV2(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizer"]):
__openllm_internal__ = True
tokenizer_id = "local"
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, temperature: float | None = None, top_k: int | None = None, top_p: float | None = None, use_default_prompt_template: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "top_p": top_p, "temperature": temperature, **attrs}, {}

View File

@@ -2,14 +2,13 @@ from __future__ import annotations
import sys, typing as t
from openllm.exceptions import MissingDependencyError
from openllm.utils import LazyModule, is_torch_available, is_vllm_available
from openllm_core.config.configuration_falcon import (
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING,
FalconConfig as FalconConfig,
)
_import_structure: dict[str, list[str]] = {"configuration_falcon": ["FalconConfig", "START_FALCON_COMMAND_DOCSTRING", "DEFAULT_PROMPT_TEMPLATE"]}
if t.TYPE_CHECKING:
from .configuration_falcon import (
DEFAULT_PROMPT_TEMPLATE as DEFAULT_PROMPT_TEMPLATE,
START_FALCON_COMMAND_DOCSTRING as START_FALCON_COMMAND_DOCSTRING,
FalconConfig as FalconConfig,
)
_import_structure: dict[str, list[str]] = {}
try:
if not is_torch_available(): raise MissingDependencyError
except MissingDependencyError: pass

View File

@@ -1,7 +1,5 @@
from __future__ import annotations
import typing as t, openllm
from openllm._prompt import process_prompt
from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
if t.TYPE_CHECKING: import torch, transformers
else: torch, transformers = openllm.utils.LazyLoader("torch", globals(), "torch"), openllm.utils.LazyLoader("transformers", globals(), "transformers")
@@ -9,8 +7,6 @@ class Falcon(openllm.LLM["transformers.PreTrainedModel", "transformers.PreTraine
__openllm_internal__ = True
@property
def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: return {"torch_dtype": torch.bfloat16, "device_map": "auto" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {}
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}
def postprocess_generate(self, prompt: str, generation_result: t.Sequence[str], **_: t.Any) -> str: return generation_result[0]
def generate(self, prompt: str, **attrs: t.Any) -> list[str]:
eos_token_id, inputs = attrs.pop("eos_token_id", self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors="pt").to(self.device)
with torch.inference_mode(), torch.autocast("cuda", dtype=torch.float16): # type: ignore[attr-defined]

View File

@@ -1,11 +1,8 @@
from __future__ import annotations
import logging, typing as t, openllm
from openllm._prompt import process_prompt
from .configuration_falcon import DEFAULT_PROMPT_TEMPLATE
if t.TYPE_CHECKING: import vllm, transformers
logger = logging.getLogger(__name__)
class VLLMFalcon(openllm.LLM["vllm.LLMEngine", "transformers.PreTrainedTokenizerBase"]):
__openllm_internal__ = True
tokenizer_id = "local"
def sanitize_parameters(self, prompt: str, max_new_tokens: int | None = None, top_k: int | None = None, num_return_sequences: int | None = None, eos_token_id: int | None = None, use_default_prompt_template: bool = False, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: return process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, **attrs), {"max_new_tokens": max_new_tokens, "top_k": top_k, "num_return_sequences": num_return_sequences, "eos_token_id": eos_token_id, **attrs}, {}

Some files were not shown because too many files have changed in this diff Show More