mirror of
https://github.com/bentoml/OpenLLM.git
synced 2026-04-24 00:47:44 -04:00
refactor: delete unused code (#716)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
@@ -1,33 +0,0 @@
|
||||
{
|
||||
"configuration": {
|
||||
"generation_config": {
|
||||
"diversity_penalty": 0.0,
|
||||
"early_stopping": false,
|
||||
"encoder_no_repeat_ngram_size": 0,
|
||||
"encoder_repetition_penalty": 1.0,
|
||||
"epsilon_cutoff": 0.0,
|
||||
"eta_cutoff": 0.0,
|
||||
"length_penalty": 1.0,
|
||||
"max_new_tokens": 10,
|
||||
"min_length": 0,
|
||||
"no_repeat_ngram_size": 0,
|
||||
"num_beam_groups": 1,
|
||||
"num_beams": 1,
|
||||
"num_return_sequences": 1,
|
||||
"output_attentions": false,
|
||||
"output_hidden_states": false,
|
||||
"output_scores": false,
|
||||
"remove_invalid_values": false,
|
||||
"renormalize_logits": false,
|
||||
"repetition_penalty": 1.0,
|
||||
"temperature": 0.9,
|
||||
"top_k": 50,
|
||||
"top_p": 0.9,
|
||||
"typical_p": 1.0,
|
||||
"use_cache": true
|
||||
}
|
||||
},
|
||||
"responses": [
|
||||
"life is a complete physical life"
|
||||
]
|
||||
}
|
||||
@@ -1,33 +0,0 @@
|
||||
{
|
||||
"configuration": {
|
||||
"generation_config": {
|
||||
"diversity_penalty": 0.0,
|
||||
"early_stopping": false,
|
||||
"encoder_no_repeat_ngram_size": 0,
|
||||
"encoder_repetition_penalty": 1.0,
|
||||
"epsilon_cutoff": 0.0,
|
||||
"eta_cutoff": 0.0,
|
||||
"length_penalty": 1.0,
|
||||
"max_new_tokens": 10,
|
||||
"min_length": 0,
|
||||
"no_repeat_ngram_size": 0,
|
||||
"num_beam_groups": 1,
|
||||
"num_beams": 1,
|
||||
"num_return_sequences": 1,
|
||||
"output_attentions": false,
|
||||
"output_hidden_states": false,
|
||||
"output_scores": false,
|
||||
"remove_invalid_values": false,
|
||||
"renormalize_logits": false,
|
||||
"repetition_penalty": 1.0,
|
||||
"temperature": 0.9,
|
||||
"top_k": 50,
|
||||
"top_p": 0.9,
|
||||
"typical_p": 1.0,
|
||||
"use_cache": true
|
||||
}
|
||||
},
|
||||
"responses": [
|
||||
"life is a state"
|
||||
]
|
||||
}
|
||||
@@ -1,34 +0,0 @@
|
||||
{
|
||||
"configuration": {
|
||||
"format_outputs": false,
|
||||
"generation_config": {
|
||||
"diversity_penalty": 0.0,
|
||||
"early_stopping": false,
|
||||
"encoder_no_repeat_ngram_size": 0,
|
||||
"encoder_repetition_penalty": 1.0,
|
||||
"epsilon_cutoff": 0.0,
|
||||
"eta_cutoff": 0.0,
|
||||
"length_penalty": 1.0,
|
||||
"max_new_tokens": 20,
|
||||
"min_length": 0,
|
||||
"no_repeat_ngram_size": 0,
|
||||
"num_beam_groups": 1,
|
||||
"num_beams": 1,
|
||||
"num_return_sequences": 1,
|
||||
"output_attentions": false,
|
||||
"output_hidden_states": false,
|
||||
"output_scores": false,
|
||||
"remove_invalid_values": false,
|
||||
"renormalize_logits": false,
|
||||
"repetition_penalty": 1.0,
|
||||
"temperature": 0.75,
|
||||
"top_k": 15,
|
||||
"top_p": 1.0,
|
||||
"typical_p": 1.0,
|
||||
"use_cache": true
|
||||
}
|
||||
},
|
||||
"responses": [
|
||||
"What is Deep learning?\nDeep learning is a new way of studying the content and making an informed decision. It is the"
|
||||
]
|
||||
}
|
||||
@@ -1,34 +0,0 @@
|
||||
{
|
||||
"configuration": {
|
||||
"format_outputs": false,
|
||||
"generation_config": {
|
||||
"diversity_penalty": 0.0,
|
||||
"early_stopping": false,
|
||||
"encoder_no_repeat_ngram_size": 0,
|
||||
"encoder_repetition_penalty": 1.0,
|
||||
"epsilon_cutoff": 0.0,
|
||||
"eta_cutoff": 0.0,
|
||||
"length_penalty": 1.0,
|
||||
"max_new_tokens": 20,
|
||||
"min_length": 0,
|
||||
"no_repeat_ngram_size": 0,
|
||||
"num_beam_groups": 1,
|
||||
"num_beams": 1,
|
||||
"num_return_sequences": 1,
|
||||
"output_attentions": false,
|
||||
"output_hidden_states": false,
|
||||
"output_scores": false,
|
||||
"remove_invalid_values": false,
|
||||
"renormalize_logits": false,
|
||||
"repetition_penalty": 1.0,
|
||||
"temperature": 0.75,
|
||||
"top_k": 15,
|
||||
"top_p": 1.0,
|
||||
"typical_p": 1.0,
|
||||
"use_cache": true
|
||||
}
|
||||
},
|
||||
"responses": [
|
||||
"What is Deep learning?\n\nDeep learning is a new, highly-advanced, and powerful tool for the deep learning"
|
||||
]
|
||||
}
|
||||
@@ -1,266 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import contextlib
|
||||
import functools
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
import typing as t
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import attr
|
||||
import docker
|
||||
import docker.errors
|
||||
import docker.types
|
||||
import orjson
|
||||
import pytest
|
||||
from syrupy.extensions.json import JSONSnapshotExtension
|
||||
|
||||
import openllm
|
||||
from bentoml._internal.types import LazyType
|
||||
from openllm._llm import self
|
||||
from openllm_core._typing_compat import DictStrAny, ListAny, LiteralQuantise
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import subprocess
|
||||
|
||||
from syrupy.assertion import SnapshotAssertion
|
||||
from syrupy.types import PropertyFilter, PropertyMatcher, SerializableData, SerializedData
|
||||
|
||||
from openllm.client import BaseAsyncClient
|
||||
|
||||
|
||||
class ResponseComparator(JSONSnapshotExtension):
    """Syrupy snapshot extension that (de)serialises ``openllm.GenerationOutput``."""

    def serialize(
        self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None
    ) -> SerializedData:
        """Serialise one output (or a list of outputs) to sorted, indented JSON."""
        if LazyType(ListAny).isinstance(data):
            data = [d.unmarshaled for d in data]
        else:
            data = data.unmarshaled
        data = self._filter(data=data, depth=0, path=(), exclude=exclude, matcher=matcher)
        return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode()

    def matches(self, *, serialized_data: SerializableData, snapshot_data: SerializableData) -> bool:
        """Compare a freshly-serialised response against the stored snapshot.

        Responses match when both sides contain the same number of generations
        and each aligned pair has the same number of outputs. Only the shape is
        compared — generated text may differ between runs.
        """

        def convert_data(data: SerializableData) -> openllm.GenerationOutput | t.Sequence[openllm.GenerationOutput]:
            try:
                data = orjson.loads(data)
            except orjson.JSONDecodeError as err:
                raise ValueError(f'Failed to decode JSON data: {data}') from err
            if LazyType(DictStrAny).isinstance(data):
                return openllm.GenerationOutput(**data)
            elif LazyType(ListAny).isinstance(data):
                return [openllm.GenerationOutput(**d) for d in data]
            else:
                raise NotImplementedError(f'Data {data} has unsupported type.')

        serialized_data = convert_data(serialized_data)
        snapshot_data = convert_data(snapshot_data)

        # BUG FIX: the original wrapped values that were *already* lists
        # (producing nested lists) and left single outputs bare, so eq_output
        # below would be applied to the wrong types. Normalise single outputs
        # into a one-element list instead.
        if not LazyType(ListAny).isinstance(serialized_data):
            serialized_data = [serialized_data]
        if not LazyType(ListAny).isinstance(snapshot_data):
            snapshot_data = [snapshot_data]

        def eq_output(lhs: openllm.GenerationOutput, rhs: openllm.GenerationOutput) -> bool:
            # Renamed from (s, t): the original second parameter shadowed the
            # module-level `typing as t` alias.
            return len(lhs.outputs) == len(rhs.outputs)

        return len(serialized_data) == len(snapshot_data) and all(
            eq_output(lhs, rhs) for lhs, rhs in zip(serialized_data, snapshot_data)
        )
|
||||
|
||||
|
||||
@pytest.fixture()
def response_snapshot(snapshot: SnapshotAssertion):
    """Snapshot fixture whose comparisons go through ResponseComparator."""
    extended = snapshot.use_extension(ResponseComparator)
    return extended
|
||||
|
||||
|
||||
@attr.define(init=False)
class _Handle(ABC):
    """Handle to a running OpenLLM server plus the async client bound to it."""

    # Port the server listens on.
    port: int
    # How the server was started; mirrors the `deployment_mode` fixture.
    deployment_mode: t.Literal['container', 'local']

    # Async client talking to the server above; built in __attrs_post_init__.
    client: BaseAsyncClient[t.Any] = attr.field(init=False)

    if t.TYPE_CHECKING:

        def __attrs_init__(self, *args: t.Any, **attrs: t.Any): ...

    def __attrs_post_init__(self):
        self.client = openllm.client.AsyncHTTPClient(f'http://localhost:{self.port}')

    @abstractmethod
    def status(self) -> bool:
        """Return True while the underlying process/container is alive."""
        raise NotImplementedError

    async def health(self, timeout: int = 240):
        """Poll until the server answers a sanity query or *timeout* seconds pass.

        Raises:
            RuntimeError: if the backing process/container dies, or the
                deadline expires before a successful query.
        """
        start_time = time.time()
        while time.time() - start_time < timeout:
            if not self.status():
                raise RuntimeError(f'Failed to initialise {self.__class__.__name__}')
            await self.client.health()
            try:
                await self.client.query('sanity')
                return
            except Exception:
                # BUG FIX: the original called time.sleep(1) here, which blocks
                # the running event loop; asyncio.sleep yields control instead.
                await asyncio.sleep(1)
        raise RuntimeError(f'Handle failed to initialise within {timeout} seconds.')
|
||||
|
||||
|
||||
@attr.define(init=False)
class LocalHandle(_Handle):
    """Handle for a server started as a local subprocess (openllm.start)."""

    # The spawned server process.
    process: subprocess.Popen[bytes]

    def __init__(self, process: subprocess.Popen[bytes], port: int, deployment_mode: t.Literal['container', 'local']):
        # Positional order must match attrs field order: base-class fields
        # first (port, deployment_mode), then this class's `process`.
        self.__attrs_init__(port, deployment_mode, process)

    def status(self) -> bool:
        # Popen.poll() returns None while the subprocess is still running.
        return self.process.poll() is None
|
||||
|
||||
|
||||
class HandleProtocol(t.Protocol):
    """Structural type of the `handler` fixture: a context-manager factory
    that starts a server and yields a _Handle for it."""

    @contextlib.contextmanager
    def __call__(
        # NOTE(review): no `self` parameter — the fixture returns a bare
        # functools.partial, not a bound method, so the callable has no
        # instance slot; confirm type-checkers accept this before changing.
        *, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None
    ) -> t.Generator[_Handle, None, None]: ...
|
||||
|
||||
|
||||
@attr.define(init=False)
class DockerHandle(_Handle):
    """Handle for a server running inside a Docker container."""

    # Name of the container running the server.
    container_name: str
    # Client used to query container state.
    docker_client: docker.DockerClient

    def __init__(
        self,
        docker_client: docker.DockerClient,
        container_name: str,
        port: int,
        deployment_mode: t.Literal['container', 'local'],
    ):
        # Positional order must match attrs field order: base-class fields
        # first (port, deployment_mode), then this class's own fields.
        self.__attrs_init__(port, deployment_mode, container_name, docker_client)

    def status(self) -> bool:
        # Look the container up fresh each time so state is never stale.
        container = self.docker_client.containers.get(self.container_name)
        return container.status in ['running', 'created']
|
||||
|
||||
|
||||
@contextlib.contextmanager
def _local_handle(
    model: str,
    model_id: str,
    image_tag: str,
    deployment_mode: t.Literal['container', 'local'],
    quantize: LiteralQuantise | None = None,
    *,
    _serve_grpc: bool = False,
):
    """Start *model* as a local subprocess and yield a LocalHandle for it.

    The subprocess is always terminated and its output streams drained and
    closed on exit — even when the test body raises. `image_tag` is accepted
    for signature parity with _container_handle but unused here.
    """
    # Reserve a free port, then immediately release it so the server can bind
    # it; the empty `with` body keeps the reservation scope minimal.
    with openllm.utils.reserve_free_port() as port:
        pass

    starter = openllm.start_grpc if _serve_grpc else openllm.start
    proc = starter(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True)
    try:
        yield LocalHandle(proc, port, deployment_mode)
    finally:
        # BUG FIX: teardown previously ran outside try/finally, so a failing
        # test leaked the running subprocess.
        proc.terminate()
        proc.wait(60)
        # Guard stdout the same way stderr was guarded: Popen streams are None
        # unless a pipe was requested.
        if proc.stdout:
            print(proc.stdout.read(), file=sys.stderr)
            proc.stdout.close()
        if proc.stderr:
            proc.stderr.close()
|
||||
|
||||
|
||||
@contextlib.contextmanager
def _container_handle(
    model: str,
    model_id: str,
    image_tag: str,
    deployment_mode: t.Literal['container', 'local'],
    quantize: LiteralQuantise | None = None,
    *,
    _serve_grpc: bool = False,
):
    """Run *image_tag* in Docker and yield a DockerHandle to the live server.

    The container is always stopped, its logs dumped to stderr, and removed on
    exit — even when the test body raises.
    """
    # Reserve two free ports (HTTP + Prometheus) then release them right away
    # so Docker can bind them; the empty `with` body keeps the scope minimal.
    with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port:
        pass
    # NOTE(review): `self` is a callable imported from openllm._llm applied to
    # the model id — presumably a model-id normaliser/hash; confirm upstream.
    container_name = f'openllm-{model}-{self(model_id)}'.replace('-', '_')
    client = docker.from_env()
    # Remove any stale container left over from a previous (crashed) run.
    try:
        container = client.containers.get(container_name)
        container.stop()
        container.wait()
        container.remove()
    except docker.errors.NotFound:
        pass

    args = ['serve' if not _serve_grpc else 'serve-grpc']

    env: DictStrAny = {}
    if quantize is not None:
        env['OPENLLM_QUANTIZE'] = quantize

    # Request every available GPU when there is at least one; otherwise CPU.
    gpus = openllm.utils.device_count() or -1
    devs = [docker.types.DeviceRequest(count=gpus, capabilities=[['gpu']])] if gpus > 0 else None

    container = client.containers.run(
        image_tag,
        command=args,
        name=container_name,
        environment=env,
        auto_remove=False,
        detach=True,
        device_requests=devs,
        ports={'3000/tcp': port, '3001/tcp': prom_port},
    )
    try:
        yield DockerHandle(client, container.name, port, deployment_mode)
    finally:
        # BUG FIX: teardown previously ran outside try/finally, so a failing
        # test leaked the running container.
        try:
            container.stop()
            container.wait()
        except docker.errors.NotFound:
            pass
        print(container.logs().decode('utf-8'), file=sys.stderr)
        container.remove()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session', autouse=True)
def clean_context() -> t.Generator[contextlib.ExitStack, None, None]:
    """Session-wide ExitStack for registering cleanup callbacks.

    Using ``with`` guarantees the stack unwinds even if an exception is thrown
    into the generator, whereas the previous bare ``stack.close()`` after the
    yield could be skipped.
    """
    with contextlib.ExitStack() as stack:
        yield stack
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def el() -> t.Generator[asyncio.AbstractEventLoop, None, None]:
    """Module-scoped event loop for the async fixtures/tests.

    BUG FIX: asyncio.get_event_loop() is deprecated outside a running loop and
    may return a shared loop this fixture does not own — which it would then
    close at teardown. Create (and own) a dedicated loop instead.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        yield loop
    finally:
        asyncio.set_event_loop(None)
        loop.close()
|
||||
|
||||
|
||||
@pytest.fixture(params=['container', 'local'], scope='session')
def deployment_mode(request: pytest.FixtureRequest) -> str:
    """Parametrised fixture: dependent tests run once per deployment mode."""
    return request.param
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def handler(el: asyncio.AbstractEventLoop, deployment_mode: t.Literal['container', 'local']):
    """Return a context-manager factory matching the requested deployment mode."""
    if deployment_mode not in ('container', 'local'):
        raise ValueError(f'Unknown deployment mode: {deployment_mode}')
    factory = _container_handle if deployment_mode == 'container' else _local_handle
    return functools.partial(factory, deployment_mode=deployment_mode)
|
||||
@@ -1,40 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import pytest
|
||||
|
||||
import openllm
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import contextlib
|
||||
|
||||
from .conftest import HandleProtocol, ResponseComparator, _Handle
|
||||
|
||||
model = 'flan_t5'
|
||||
model_id = 'google/flan-t5-small'
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def flan_t5_handle(
    handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack
):
    """Prepare the flan-t5 image and yield a handle to a started server."""
    # The two context managers are entered in order, so `image_tag` from the
    # first is available to the second within a single `with` statement.
    with openllm.testing.prepare(
        model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context
    ) as image_tag, handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
        yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
async def flan_t5(flan_t5_handle: _Handle):
    """Block until the flan-t5 server is healthy, then hand back its client."""
    handle = flan_t5_handle
    await handle.health(240)
    return handle.client
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
async def test_flan_t5(flan_t5: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator):
    """Smoke-test generation on flan-t5 against the stored snapshot."""
    llm = await flan_t5
    response = await llm.query('What is the meaning of life?', max_new_tokens=10, top_p=0.9, return_response='attrs')

    # The server must echo back the generation config it actually used.
    assert response.configuration['generation_config']['max_new_tokens'] == 10
    assert response == response_snapshot
|
||||
@@ -1,40 +0,0 @@
|
||||
from __future__ import annotations
|
||||
import typing as t
|
||||
|
||||
import pytest
|
||||
|
||||
import openllm
|
||||
|
||||
if t.TYPE_CHECKING:
|
||||
import contextlib
|
||||
|
||||
from .conftest import HandleProtocol, ResponseComparator, _Handle
|
||||
|
||||
model = 'opt'
|
||||
model_id = 'facebook/opt-125m'
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
def opt_125m_handle(
    handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack
):
    """Prepare the opt-125m image and yield a handle to a started server."""
    # The two context managers are entered in order, so `image_tag` from the
    # first is available to the second within a single `with` statement.
    with openllm.testing.prepare(
        model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context
    ) as image_tag, handler(model=model, model_id=model_id, image_tag=image_tag) as handle:
        yield handle
|
||||
|
||||
|
||||
@pytest.fixture(scope='module')
async def opt_125m(opt_125m_handle: _Handle):
    """Block until the opt-125m server is healthy, then hand back its client."""
    handle = opt_125m_handle
    await handle.health(240)
    return handle.client
|
||||
|
||||
|
||||
@pytest.mark.asyncio()
async def test_opt_125m(opt_125m: t.Awaitable[openllm.client.AsyncHTTPClient], response_snapshot: ResponseComparator):
    """Smoke-test generation on opt-125m against the stored snapshot."""
    llm = await opt_125m
    response = await llm.query('What is Deep learning?', max_new_tokens=20, return_response='attrs')

    # The server must echo back the generation config it actually used.
    assert response.configuration['generation_config']['max_new_tokens'] == 20
    assert response == response_snapshot
|
||||
Reference in New Issue
Block a user