diff --git a/cz.py b/cz.py
index c372afd0..a1fd3cb4 100755
--- a/cz.py
+++ b/cz.py
@@ -19,10 +19,7 @@ def run_cz(dir: str, package: str):
       with tokenize.open(filepath) as file_:
         tokens = [t for t in tokenize.generate_tokens(file_.readline) if t.type in TOKEN_WHITELIST]
         token_count, line_count = len(tokens), len(set([t.start[0] for t in tokens]))
-        table.append([
-            filepath.replace(os.path.join(dir, 'src'), ''), line_count,
-            token_count / line_count if line_count != 0 else 0
-        ])
+        table.append([filepath.replace(os.path.join(dir, 'src'), ''), line_count, token_count / line_count if line_count != 0 else 0])
   print(tabulate([headers, *sorted(table, key=lambda x: -x[1])], headers='firstrow', floatfmt='.1f') + '\n')
   for dir_name, group in itertools.groupby(sorted([(x[0].rsplit('/', 1)[0], x[1]) for x in table]), key=lambda x: x[0]):
     print(f'{dir_name:35s} : {sum([x[1] for x in group]):6d}')
diff --git a/examples/langchain-chains-demo/service.py b/examples/langchain-chains-demo/service.py
index f6ea3ffb..6917f8b4 100644
--- a/examples/langchain-chains-demo/service.py
+++ b/examples/langchain-chains-demo/service.py
@@ -44,17 +44,11 @@ svc = bentoml.Service("fb-ads-copy", runners=[llm.runner])
 def download(_: bentoml.Context):
   llm.runner.download_model()
 
-SAMPLE_INPUT = Query(
-    industry="SAAS",
-    product_name="BentoML",
-    keywords=["open source", "developer tool", "AI application platform", "serverless", "cost-efficient"],
-    llm_config=llm.runner.config.model_dump(),
-)
+SAMPLE_INPUT = Query(industry="SAAS",
+                     product_name="BentoML",
+                     keywords=["open source", "developer tool", "AI application platform", "serverless", "cost-efficient"],
+                     llm_config=llm.runner.config.model_dump())
 
 @svc.api(input=JSON.from_sample(sample=SAMPLE_INPUT), output=Text())
 def generate(query: Query):
-  return chain.run({
-      "industry": query.industry,
-      "product_name": query.product_name,
-      "keywords": ", ".join(query.keywords)
-  })
+  return chain.run({"industry": query.industry, "product_name": query.product_name, "keywords": ", ".join(query.keywords)})
diff --git a/openllm-client/src/openllm_client/_base.py b/openllm-client/src/openllm_client/_base.py
index f2216958..0f19974a 100644
--- a/openllm-client/src/openllm_client/_base.py
+++ b/openllm-client/src/openllm_client/_base.py
@@ -65,10 +65,7 @@ class _ClientAttr:
     ...
 
   @abc.abstractmethod
-  def query(self,
-            prompt: str,
-            return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed',
-            **attrs: t.Any) -> t.Any:
+  def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any:
     raise NotImplementedError
 
   # NOTE: Scikit interface
@@ -84,8 +81,7 @@ class _ClientAttr:
 
   @overload
   @abc.abstractmethod
-  def predict(self, prompt: str, *, return_response: t.Literal['attrs'],
-              **attrs: t.Any) -> openllm_core.GenerationOutput:
+  def predict(self, prompt: str, *, return_response: t.Literal['attrs'], **attrs: t.Any) -> openllm_core.GenerationOutput:
     ...
 
   @abc.abstractmethod
@@ -95,14 +91,12 @@ class _ClientAttr:
 
   @functools.cached_property
   def _hf_agent(self) -> transformers.HfAgent:
     if not is_transformers_available():
-      raise RuntimeError(
-          "transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
+      raise RuntimeError("transformers is required to use HF agent. Install with 'pip install \"openllm-client[agents]\"'.")
     if not self.supports_hf_agent: raise RuntimeError(f'{self.model_name} ({self.backend}) does not support running HF agent.')
     if not is_transformers_supports_agent():
       raise RuntimeError(
-          "Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'"
-      )
+          "Current 'transformers' does not support Agent. Make sure to upgrade to at least 4.29: 'pip install -U \"transformers>=4.29\"'")
     import transformers
     return transformers.HfAgent(urljoin(self._address, '/hf/agent'))
 
@@ -183,13 +177,7 @@ class _Client(_ClientAttr):
     return BentoClient.from_url(self._address)
 
   # Agent integration
-  def ask_agent(self,
-                task: str,
-                *,
-                return_code: bool = False,
-                remote: bool = False,
-                agent_type: LiteralString = 'hf',
-                **attrs: t.Any) -> t.Any:
+  def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = 'hf', **attrs: t.Any) -> t.Any:
     if agent_type == 'hf': return self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
     else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
 
@@ -223,20 +211,13 @@ class _AsyncClient(_ClientAttr):
     return ensure_exec_coro(AsyncBentoClient.from_url(self._address))
 
   # Agent integration
-  async def ask_agent(self,
-                      task: str,
-                      *,
-                      return_code: bool = False,
-                      remote: bool = False,
-                      agent_type: LiteralString = 'hf',
-                      **attrs: t.Any) -> t.Any:
+  async def ask_agent(self, task: str, *, return_code: bool = False, remote: bool = False, agent_type: LiteralString = 'hf', **attrs: t.Any) -> t.Any:
     if agent_type == 'hf': return await self._run_hf_agent(task, return_code=return_code, remote=remote, **attrs)
     else: raise RuntimeError(f"Unknown 'agent_type={agent_type}'")
 
   async def _run_hf_agent(self, *args: t.Any, **kwargs: t.Any) -> t.Any:
     if not is_transformers_supports_agent():
-      raise RuntimeError(
-          'This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0')
+      raise RuntimeError('This version of transformers does not support agent.run. Make sure to upgrade to transformers>4.30.0')
     if len(args) > 1: raise ValueError("'args' should only take one positional argument.")
     from transformers.tools.agents import clean_code_for_run
     from transformers.tools.agents import get_tool_creation_code
@@ -272,31 +253,23 @@ class _AsyncClient(_ClientAttr):
     self._hf_agent.log(f'\n\n==Code generated by the agent==\n{code}')
     if not return_code:
       self._hf_agent.log('\n\n==Result==')
-      self._hf_agent.cached_tools = resolve_tools(code,
-                                                  self._hf_agent.toolbox,
-                                                  remote=remote,
-                                                  cached_tools=self._hf_agent.cached_tools)
+      self._hf_agent.cached_tools = resolve_tools(code, self._hf_agent.toolbox, remote=remote, cached_tools=self._hf_agent.cached_tools)
       return evaluate(code, self._hf_agent.cached_tools, state=kwargs.copy())
     else:
       tool_code = get_tool_creation_code(code, self._hf_agent.toolbox, remote=remote)
      return f'{tool_code}\n{code}'
 
 class BaseClient(_Client):
-
   def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str:
     raise NotImplementedError
 
   def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput:
-    return openllm_core.EmbeddingsOutput(
-        **self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt)))
+    return openllm_core.EmbeddingsOutput(**self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt)))
 
   def predict(self, prompt: str, **attrs: t.Any) -> openllm_core.GenerationOutput | DictStrAny | str:
     return self.query(prompt, **attrs)
 
-  def query(self,
-            prompt: str,
-            return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed',
-            **attrs: t.Any) -> t.Any:
+  def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any:
    return_raw_response = attrs.pop('return_raw_response', None)
     if return_raw_response is not None:
       logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.")
@@ -306,32 +279,27 @@ class BaseClient(_Client):
      logger.warning("'return_attrs' is now deprecated. 
Please use 'return_response=\"attrs\"' instead.") if return_attrs is True: return_response = 'attrs' use_default_prompt_template = attrs.pop('use_default_prompt_template', False) - prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters( - prompt, use_default_prompt_template=use_default_prompt_template, **attrs) - r = openllm_core.GenerationOutput(**self.call( - 'generate', - openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env( - **generate_kwargs)).model_dump())) + prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, + use_default_prompt_template=use_default_prompt_template, + **attrs) + r = openllm_core.GenerationOutput( + **self.call('generate', + openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump())) if return_response == 'attrs': return r elif return_response == 'raw': return bentoml_cattr.unstructure(r) else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs) class BaseAsyncClient(_AsyncClient): - async def chat(self, prompt: str, history: list[str], **attrs: t.Any) -> str: raise NotImplementedError async def embed(self, prompt: t.Sequence[str] | str) -> openllm_core.EmbeddingsOutput: - return openllm_core.EmbeddingsOutput( - **(await self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt)))) + return openllm_core.EmbeddingsOutput(**(await self.call('embeddings', list([prompt] if isinstance(prompt, str) else prompt)))) async def predict(self, prompt: str, **attrs: t.Any) -> t.Any: return await self.query(prompt, **attrs) - async def query(self, - prompt: str, - return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', - **attrs: t.Any) -> t.Any: + async def query(self, prompt: str, return_response: t.Literal['attrs', 'raw', 'processed'] = 'processed', **attrs: t.Any) -> t.Any: return_raw_response = attrs.pop('return_raw_response', None) if return_raw_response is not None: logger.warning("'return_raw_response' is now deprecated. Please use 'return_response=\"raw\"' instead.") @@ -341,12 +309,12 @@ class BaseAsyncClient(_AsyncClient): logger.warning("'return_attrs' is now deprecated. 
Please use 'return_response=\"attrs\"' instead.") if return_attrs is True: return_response = 'attrs' use_default_prompt_template = attrs.pop('use_default_prompt_template', False) - prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters( - prompt, use_default_prompt_template=use_default_prompt_template, **attrs) - r = openllm_core.GenerationOutput(**(await self.call( - 'generate', - openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env( - **generate_kwargs)).model_dump()))) + prompt, generate_kwargs, postprocess_kwargs = self.config.sanitize_parameters(prompt, + use_default_prompt_template=use_default_prompt_template, + **attrs) + r = openllm_core.GenerationOutput( + **(await self.call('generate', + openllm_core.GenerationInput(prompt=prompt, llm_config=self.config.model_construct_env(**generate_kwargs)).model_dump()))) if return_response == 'attrs': return r elif return_response == 'raw': return bentoml_cattr.unstructure(r) else: return self.config.postprocess_generate(prompt, r.responses, **postprocess_kwargs) diff --git a/openllm-client/src/openllm_client/benmin/_grpc.py b/openllm-client/src/openllm_client/benmin/_grpc.py index 44222215..4525143b 100644 --- a/openllm-client/src/openllm_client/benmin/_grpc.py +++ b/openllm-client/src/openllm_client/benmin/_grpc.py @@ -69,16 +69,10 @@ def dispatch_channel(server_url: str, credentials = None if ssl: if ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'") - credentials = grpc.ssl_channel_credentials(**{ - k: load_from_file(v) if isinstance(v, str) else v for k, v in ssl_client_credentials.items() - }) + credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in ssl_client_credentials.items()}) if typ == 'async' and ssl: - return aio.secure_channel(server_url, - credentials=credentials, - options=options, - compression=compression, - interceptors=interceptors) + return aio.secure_channel(server_url, credentials=credentials, options=options, compression=compression, interceptors=interceptors) elif typ == 'async': return aio.insecure_channel(server_url, options=options, compression=compression, interceptors=interceptors) elif typ == 'sync' and ssl: @@ -109,21 +103,12 @@ class GrpcClient(Client): def inner(self) -> grpc.Channel: if self.ssl: if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'") - credentials = grpc.ssl_channel_credentials(**{ - k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items() - }) - return grpc.secure_channel(self.server_url, - credentials=credentials, - options=self.options, - compression=self.compression) + credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()}) + return grpc.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression) return grpc.insecure_channel(self.server_url, options=self.options, compression=self.compression) @staticmethod - def wait_until_server_ready(host: str, - port: int, - timeout: float = 30, - check_interval: int = 1, - **kwargs: t.Any) -> None: + def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ='sync', options=kwargs.get('options', None), @@ -167,18 +152,17 @@ class 
GrpcClient(Client): reflection = bentoml.Service(metadata.name) for api in metadata.apis: try: - reflection.apis[api.name] = InferenceAPI[t.Any]( - None, - bentoml.io.from_spec({ - 'id': api.input.descriptor_id, - 'args': json_format.MessageToDict(api.input.attributes).get('args', None) - }), - bentoml.io.from_spec({ - 'id': api.output.descriptor_id, - 'args': json_format.MessageToDict(api.output.attributes).get('args', None) - }), - name=api.name, - doc=api.docs) + reflection.apis[api.name] = InferenceAPI[t.Any](None, + bentoml.io.from_spec({ + 'id': api.input.descriptor_id, + 'args': json_format.MessageToDict(api.input.attributes).get('args', None) + }), + bentoml.io.from_spec({ + 'id': api.output.descriptor_id, + 'args': json_format.MessageToDict(api.output.attributes).get('args', None) + }), + name=api.name, + doc=api.docs) except Exception as e: logger.error('Failed to instantiate client for API %s: ', api.name, e) return cls(url, reflection, **kwargs) @@ -187,24 +171,16 @@ class GrpcClient(Client): return services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service='')) def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: - channel_kwargs = { - k: kwargs.pop(f'_grpc_channel_{k}', None) - for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'} - } + channel_kwargs = {k: kwargs.pop(f'_grpc_channel_{k}', None) for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'}} if _inference_api.multi_input: if data is not None: - raise ValueError( - f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") + raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") fake_resp = ensure_exec_coro(_inference_api.input.to_proto(kwargs)) else: fake_resp = ensure_exec_coro(_inference_api.input.to_proto(data)) api_fn = {v: k for k, v in self.svc.apis.items()} stubs = services.BentoServiceStub(self.inner) - proto = stubs.Call( - pb.Request(**{ - 'api_name': api_fn[_inference_api], - _inference_api.input.proto_fields[0]: fake_resp - }), **channel_kwargs) + proto = stubs.Call(pb.Request(**{'api_name': api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs) return ensure_exec_coro(_inference_api.output.from_proto(getattr(proto, proto.WhichOneof('content')))) class AsyncGrpcClient(AsyncClient): @@ -230,25 +206,16 @@ class AsyncGrpcClient(AsyncClient): def inner(self) -> aio.Channel: if self.ssl: if self.ssl_client_credentials is None: raise RuntimeError("'ssl=True' requires 'ssl_client_credentials'") - credentials = grpc.ssl_channel_credentials(**{ - k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items() - }) + credentials = grpc.ssl_channel_credentials(**{k: load_from_file(v) if isinstance(v, str) else v for k, v in self.ssl_client_credentials.items()}) return aio.secure_channel(self.server_url, credentials=credentials, options=self.options, compression=self.compression, interceptors=self.interceptors) - return aio.insecure_channel(self.server_url, - options=self.options, - compression=self.compression, - interceptors=self.interceptors) + return aio.insecure_channel(self.server_url, options=self.options, compression=self.compression, interceptors=self.interceptors) @staticmethod - async def wait_until_server_ready(host: str, - port: int, - timeout: float = 30, - check_interval: int = 1, - **kwargs: t.Any) -> None: + 
async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: async with dispatch_channel(f"{host.replace(r'localhost', '0.0.0.0')}:{port}", typ='async', options=kwargs.get('options', None), @@ -293,18 +260,17 @@ class AsyncGrpcClient(AsyncClient): reflection = bentoml.Service(metadata.name) for api in metadata.apis: try: - reflection.apis[api.name] = InferenceAPI[t.Any]( - None, - bentoml.io.from_spec({ - 'id': api.input.descriptor_id, - 'args': json_format.MessageToDict(api.input.attributes).get('args', None) - }), - bentoml.io.from_spec({ - 'id': api.output.descriptor_id, - 'args': json_format.MessageToDict(api.output.attributes).get('args', None) - }), - name=api.name, - doc=api.docs) + reflection.apis[api.name] = InferenceAPI[t.Any](None, + bentoml.io.from_spec({ + 'id': api.input.descriptor_id, + 'args': json_format.MessageToDict(api.input.attributes).get('args', None) + }), + bentoml.io.from_spec({ + 'id': api.output.descriptor_id, + 'args': json_format.MessageToDict(api.output.attributes).get('args', None) + }), + name=api.name, + doc=api.docs) except Exception as e: logger.error('Failed to instantiate client for API %s: ', api.name, e) return cls(url, reflection, **kwargs) @@ -313,25 +279,17 @@ class AsyncGrpcClient(AsyncClient): return await services_health.HealthStub(self.inner).Check(pb_health.HealthCheckRequest(service='')) async def _call(self, data: t.Any, /, *, _inference_api: InferenceAPI[t.Any], **kwargs: t.Any) -> t.Any: - channel_kwargs = { - k: kwargs.pop(f'_grpc_channel_{k}', None) - for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'} - } + channel_kwargs = {k: kwargs.pop(f'_grpc_channel_{k}', None) for k in {'timeout', 'metadata', 'credentials', 'wait_for_ready', 'compression'}} state = self.inner.get_state(try_to_connect=True) if state != grpc.ChannelConnectivity.READY: await self.inner.channel_ready() if _inference_api.multi_input: if data is not None: - raise ValueError( - f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") + raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") fake_resp = await _inference_api.input.to_proto(kwargs) else: fake_resp = await _inference_api.input.to_proto(data) api_fn = {v: k for k, v in self.svc.apis.items()} async with self.inner: stubs = services.BentoServiceStub(self.inner) - proto = await stubs.Call( - pb.Request(**{ - 'api_name': api_fn[_inference_api], - _inference_api.input.proto_fields[0]: fake_resp - }), **channel_kwargs) + proto = await stubs.Call(pb.Request(**{'api_name': api_fn[_inference_api], _inference_api.input.proto_fields[0]: fake_resp}), **channel_kwargs) return await _inference_api.output.from_proto(getattr(proto, proto.WhichOneof('content'))) diff --git a/openllm-client/src/openllm_client/benmin/_http.py b/openllm-client/src/openllm_client/benmin/_http.py index 6cd26c81..af8d6cb1 100644 --- a/openllm-client/src/openllm_client/benmin/_http.py +++ b/openllm-client/src/openllm_client/benmin/_http.py @@ -24,18 +24,13 @@ from openllm_core.utils import ensure_exec_coro logger = logging.getLogger(__name__) class HttpClient(Client): - @functools.cached_property def inner(self) -> httpx.Client: if not urlparse(self.server_url).netloc: raise ValueError(f'Invalid server url: {self.server_url}') return httpx.Client(base_url=self.server_url) @staticmethod - def wait_until_server_ready(host: str, - port: int, - timeout: 
float = 30, - check_interval: int = 1, - **kwargs: t.Any) -> None: + def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: host = host if '://' in host else 'http://' + host logger.debug('Waiting for server @ `%s:%d` to be ready...', host, port) start = time.time() @@ -63,9 +58,7 @@ class HttpClient(Client): url = url if '://' in url else 'http://' + url resp = httpx.get(f'{url}/docs.json') if resp.status_code != 200: - raise ValueError( - f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{resp.content.decode()}' - ) + raise ValueError(f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{resp.content.decode()}') _spec = orjson.loads(resp.content) reflection = bentoml.Service(_spec['info']['title']) @@ -96,8 +89,7 @@ class HttpClient(Client): kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_grpc_')} if _inference_api.multi_input: if data is not None: - raise ValueError( - f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") + raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(kwargs, None)) else: fake_resp = ensure_exec_coro(_inference_api.input.to_http_response(data, None)) @@ -106,8 +98,7 @@ class HttpClient(Client): if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None else: body = fake_resp.body - resp = self.inner.post('/' + - _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route, + resp = self.inner.post('/' + _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route, data=body, headers={'content-type': fake_resp.headers['content-type']}, timeout=self.timeout) @@ -120,18 +111,13 @@ class HttpClient(Client): return ensure_exec_coro(_inference_api.output.from_http_request(fake_req)) class AsyncHttpClient(AsyncClient): - @functools.cached_property def inner(self) -> httpx.AsyncClient: if not urlparse(self.server_url).netloc: raise ValueError(f'Invalid server url: {self.server_url}') return httpx.AsyncClient(base_url=self.server_url) @staticmethod - async def wait_until_server_ready(host: str, - port: int, - timeout: float = 30, - check_interval: int = 1, - **kwargs: t.Any) -> None: + async def wait_until_server_ready(host: str, port: int, timeout: float = 30, check_interval: int = 1, **kwargs: t.Any) -> None: host = host if '://' in host else 'http://' + host logger.debug('Waiting for server @ `%s:%d` to be ready...', host, port) start = time.time() @@ -148,8 +134,7 @@ class AsyncHttpClient(AsyncClient): async with httpx.AsyncClient(base_url=f'{host}:{port}') as sess: resp = await sess.get('/readyz') if resp.status_code != 200: - raise TimeoutError( - f'Timeout while waiting for server @ `{host}:{port}` to be ready: {resp.status_code}: {resp.content!s}') + raise TimeoutError(f'Timeout while waiting for server @ `{host}:{port}` to be ready: {resp.status_code}: {resp.content!s}') async def health(self) -> httpx.Response: return await self.inner.get('/readyz') @@ -160,9 +145,7 @@ class AsyncHttpClient(AsyncClient): async with httpx.AsyncClient(base_url=url) as session: resp = await session.get('/docs.json') if resp.status_code != 200: - raise ValueError( - f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await 
resp.aread()).decode()}' - ) + raise ValueError(f'Failed to get OpenAPI schema from the server: {resp.status_code} {resp.reason_phrase}:\n{(await resp.aread()).decode()}') _spec = orjson.loads(await resp.aread()) reflection = bentoml.Service(_spec['info']['title']) @@ -193,8 +176,7 @@ class AsyncHttpClient(AsyncClient): kwargs = {k: v for k, v in kwargs.items() if not k.startswith('_grpc_')} if _inference_api.multi_input: if data is not None: - raise ValueError( - f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") + raise ValueError(f"'{_inference_api.name}' takes multiple inputs, and thus required to pass as keyword arguments.") fake_resp = await _inference_api.input.to_http_response(kwargs, None) else: fake_resp = await _inference_api.input.to_http_response(data, None) @@ -203,11 +185,10 @@ class AsyncHttpClient(AsyncClient): if isinstance(fake_resp, starlette.responses.StreamingResponse): body = None else: body = t.cast(t.Any, fake_resp.body) - resp = await self.inner.post( - '/' + _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route, - data=body, - headers={'content-type': fake_resp.headers['content-type']}, - timeout=self.timeout) + resp = await self.inner.post('/' + _inference_api.route if not _inference_api.route.startswith('/') else _inference_api.route, + data=body, + headers={'content-type': fake_resp.headers['content-type']}, + timeout=self.timeout) if resp.status_code != 200: raise ValueError(f'Error making request: {resp.status_code}: {(await resp.aread())!s}') fake_req = starlette.requests.Request(scope={'type': 'http'}) headers = starlette.datastructures.Headers(headers=resp.headers) diff --git a/openllm-client/src/openllm_client/client.py b/openllm-client/src/openllm_client/client.py index 29a266c5..4bee4248 100644 --- a/openllm-client/src/openllm_client/client.py +++ b/openllm-client/src/openllm_client/client.py @@ -16,25 +16,21 @@ def process_http_address(self: AsyncHTTPClient | HTTPClient, address: str) -> No else: self._port = next(iter(_port)) class HTTPClient(BaseClient): - def __init__(self, address: str, timeout: int = 30): process_http_address(self, address) super().__init__(address, timeout) class AsyncHTTPClient(BaseAsyncClient): - def __init__(self, address: str, timeout: int = 30): process_http_address(self, address) super().__init__(address, timeout) class GrpcClient(BaseClient): - def __init__(self, address: str, timeout: int = 30): self._host, self._port = address.split(':') super().__init__(address, timeout) class AsyncGrpcClient(BaseAsyncClient): - def __init__(self, address: str, timeout: int = 30): self._host, self._port = address.split(':') super().__init__(address, timeout) diff --git a/openllm-core/src/openllm_core/_configuration.py b/openllm-core/src/openllm_core/_configuration.py index 043f9356..62a558da 100644 --- a/openllm-core/src/openllm_core/_configuration.py +++ b/openllm-core/src/openllm_core/_configuration.py @@ -104,7 +104,6 @@ config_merger = Merger([(dict, 'merge')], ['override'], ['override']) # case insensitive, but rename to conform with type class _PeftEnumMeta(enum.EnumMeta): - def __getitem__(self, __key: str | t.Any, /) -> t.Any: if isinstance(__key, str): __key = inflection.underscore(__key).upper() return self._member_map_[__key] @@ -198,35 +197,26 @@ class FineTuneConfig: adapter_type: PeftType = dantic.Field( 'lora', - description= - f"The type of adapter to use for fine-tuning. 
Available supported methods: {PeftType.supported()}, default to 'lora'", + description=f"The type of adapter to use for fine-tuning. Available supported methods: {PeftType.supported()}, default to 'lora'", use_default_converter=False, converter=_adapter_converter) - adapter_config: t.Dict[str, t.Any] = dantic.Field( - None, - description='The configuration for the adapter. The content of the dict depends on the adapter type.', - validator=attr.validators.optional(attr.validators.instance_of(dict)), - converter=attr.converters.default_if_none(factory=dict), - use_default_converter=False) - inference_mode: bool = dantic.Field(False, - description='Whether to use this Adapter for inference', - use_default_converter=False) - llm_config_class: type[LLMConfig] = dantic.Field(None, - description='The reference class to openllm.LLMConfig', - use_default_converter=False) + adapter_config: t.Dict[str, + t.Any] = dantic.Field(None, + description='The configuration for the adapter. The content of the dict depends on the adapter type.', + validator=attr.validators.optional(attr.validators.instance_of(dict)), + converter=attr.converters.default_if_none(factory=dict), + use_default_converter=False) + inference_mode: bool = dantic.Field(False, description='Whether to use this Adapter for inference', use_default_converter=False) + llm_config_class: type[LLMConfig] = dantic.Field(None, description='The reference class to openllm.LLMConfig', use_default_converter=False) def to_peft_config(self) -> peft.PeftConfig: # type: ignore[name-defined] adapter_config = self.adapter_config.copy() # no need for peft_type since it is internally managed by OpenLLM and PEFT if 'peft_type' in adapter_config: adapter_config.pop('peft_type') # respect user set task_type if it is passed, otherwise use one managed by OpenLLM - task_type, inference_mode = adapter_config.pop( - 'task_type', - peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop('inference_mode', - self.inference_mode) - return peft.PEFT_TYPE_TO_CONFIG_MAPPING[self.adapter_type.to_str()](task_type=task_type, - inference_mode=inference_mode, - **adapter_config) + task_type, inference_mode = adapter_config.pop('task_type', peft.TaskType[self.llm_config_class.peft_task_type()]), adapter_config.pop( + 'inference_mode', self.inference_mode) + return peft.PEFT_TYPE_TO_CONFIG_MAPPING[self.adapter_type.to_str()](task_type=task_type, inference_mode=inference_mode, **adapter_config) def train(self) -> FineTuneConfig: _object_setattr(self, 'inference_mode', False) @@ -237,14 +227,10 @@ class FineTuneConfig: return self def with_config(self, **attrs: t.Any) -> FineTuneConfig: - adapter_type, inference_mode = attrs.pop('adapter_type', - self.adapter_type), attrs.get('inference_mode', self.inference_mode) + adapter_type, inference_mode = attrs.pop('adapter_type', self.adapter_type), attrs.get('inference_mode', self.inference_mode) if 'llm_config_class' in attrs: raise ForbiddenAttributeError("'llm_config_class' should not be passed when using 'with_config'.") - return attr.evolve(self, - adapter_type=adapter_type, - inference_mode=inference_mode, - adapter_config=config_merger.merge(self.adapter_config, attrs)) + return attr.evolve(self, adapter_type=adapter_type, inference_mode=inference_mode, adapter_config=config_merger.merge(self.adapter_config, attrs)) @attr.frozen(slots=True, repr=False, init=False) class GenerationConfig(ReprMixin): @@ -254,16 +240,14 @@ class GenerationConfig(ReprMixin): to be used conjunction with LLMConfig. 
The instance of the generation config can then be accessed via ``LLMConfig.generation_config``. ''' - max_new_tokens: int = dantic.Field( - 20, ge=0, description='The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.') + max_new_tokens: int = dantic.Field(20, ge=0, description='The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt.') min_length: int = dantic.Field( 0, ge=0, description= 'The minimum length of the sequence to be generated. Corresponds to the length of the input prompt + `min_new_tokens`. Its effect is overridden by `min_new_tokens`, if also set.' ) - min_new_tokens: int = dantic.Field( - description='The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.') + min_new_tokens: int = dantic.Field(description='The minimum numbers of tokens to generate, ignoring the number of tokens in the prompt.') early_stopping: bool = dantic.Field( False, description= @@ -280,24 +264,15 @@ class GenerationConfig(ReprMixin): 'Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.' ) penalty_alpha: float = dantic.Field( - description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.' - ) + description='The values balance the model confidence and the degeneration penalty in contrastive search decoding.') use_cache: bool = dantic.Field( - True, - description= - 'Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.' - ) - temperature: float = dantic.Field(1.0, - ge=0.0, - le=1.0, - description='The value used to modulate the next token probabilities.') - top_k: int = dantic.Field( - 50, description='The number of highest probability vocabulary tokens to keep for top-k-filtering.') + True, description='Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding.') + temperature: float = dantic.Field(1.0, ge=0.0, le=1.0, description='The value used to modulate the next token probabilities.') + top_k: int = dantic.Field(50, description='The number of highest probability vocabulary tokens to keep for top-k-filtering.') top_p: float = dantic.Field( 1.0, description= - 'If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.' - ) + 'If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.') typical_p: float = dantic.Field( 1.0, description= @@ -320,21 +295,18 @@ class GenerationConfig(ReprMixin): ) repetition_penalty: float = dantic.Field( 1.0, - description= - 'The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.' + description='The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.' ) encoder_repetition_penalty: float = dantic.Field( 1.0, description= - 'The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 1.0 means no penalty.' - ) + 'The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the original input. 
1.0 means no penalty.') length_penalty: float = dantic.Field( 1.0, description= 'Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to the sequence length, which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), `length_penalty` > 0.0 promotes longer sequences, while `length_penalty` < 0.0 encourages shorter sequences.' ) - no_repeat_ngram_size: int = dantic.Field( - 0, description='If set to int > 0, all ngrams of that size can only occur once.') + no_repeat_ngram_size: int = dantic.Field(0, description='If set to int > 0, all ngrams of that size can only occur once.') bad_words_ids: t.List[t.List[int]] = dantic.Field( description= 'List of token ids that are not allowed to be generated. In order to get the token ids of the words that should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, add_special_tokens=False).input_ids`.' @@ -381,35 +353,22 @@ class GenerationConfig(ReprMixin): description= 'A list of pairs of integers which indicates a mapping from generation indices to token indices that will be forced before sampling. For example, `[[1, 123]]` means the second generated token will always be a token of index 123.' ) - num_return_sequences: int = dantic.Field( - 1, description='The number of independently computed returned sequences for each element in the batch.') + num_return_sequences: int = dantic.Field(1, description='The number of independently computed returned sequences for each element in the batch.') output_attentions: bool = dantic.Field( False, - description= - 'Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.' - ) + description='Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more details.') output_hidden_states: bool = dantic.Field( - False, - description= - 'Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.' - ) + False, description='Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more details.') output_scores: bool = dantic.Field( - False, - description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.' - ) + False, description='Whether or not to return the prediction scores. See `scores` under returned tensors for more details.') pad_token_id: int = dantic.Field(description='The id of the *padding* token.') bos_token_id: int = dantic.Field(description='The id of the *beginning-of-sequence* token.') eos_token_id: t.Union[int, t.List[int]] = dantic.Field( - description= - 'The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.') + description='The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.') encoder_no_repeat_ngram_size: int = dantic.Field( - 0, - description= - 'If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.' 
- ) + 0, description='If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.') decoder_start_token_id: int = dantic.Field( - description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.' - ) + description='If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.') if t.TYPE_CHECKING and not MYPY: # stubs this for pyright as mypy already has a attr plugin builtin @@ -418,9 +377,7 @@ class GenerationConfig(ReprMixin): def __init__(self, *, _internal: bool = False, **attrs: t.Any): if not _internal: - raise RuntimeError( - 'GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config' - ) + raise RuntimeError('GenerationConfig is not meant to be used directly, but you can access this via a LLMConfig.generation_config') self.__attrs_init__(**attrs) def __getitem__(self, item: str) -> t.Any: @@ -438,9 +395,7 @@ bentoml_cattr.register_unstructure_hook_factory( _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{ - k: override(omit=True) - for k, v in attr.fields_dict(cls).items() - if v.default in (None, attr.NOTHING) + k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING) })) @attr.frozen(slots=True, repr=False, init=False) @@ -471,13 +426,8 @@ class SamplingParams(ReprMixin): ) use_beam_search: bool = dantic.Field(False, description='Whether to use beam search instead of sampling.') stop: t.List[str] = dantic.Field( - None, - description= - 'List of strings that stop the generation when they are generated. The returned output will not contain the stop strings.' - ) - ignore_eos: bool = dantic.Field( - False, - description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.') + None, description='List of strings that stop the generation when they are generated. 
The returned output will not contain the stop strings.') + ignore_eos: bool = dantic.Field(False, description='Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.') logprobs: int = dantic.Field(None, description='Number of log probabilities to return per output token.') if t.TYPE_CHECKING: @@ -526,9 +476,7 @@ class SamplingParams(ReprMixin): temperature = first_not_none(attrs.pop('temperature', None), default=generation_config['temperature']) top_k = first_not_none(attrs.pop('top_k', None), default=generation_config['top_k']) top_p = first_not_none(attrs.pop('top_p', None), default=generation_config['top_p']) - max_tokens = first_not_none(attrs.pop('max_tokens', None), - attrs.pop('max_new_tokens', None), - default=generation_config['max_new_tokens']) + max_tokens = first_not_none(attrs.pop('max_tokens', None), attrs.pop('max_new_tokens', None), default=generation_config['max_new_tokens']) return cls(_internal=True, temperature=temperature, top_k=top_k, top_p=top_p, max_tokens=max_tokens, **attrs) bentoml_cattr.register_unstructure_hook_factory( @@ -538,13 +486,11 @@ bentoml_cattr.register_unstructure_hook_factory( _cattrs_omit_if_default=False, _cattrs_use_linecache=True, **{ - k: override(omit=True) - for k, v in attr.fields_dict(cls).items() - if v.default in (None, attr.NOTHING) + k: override(omit=True) for k, v in attr.fields_dict(cls).items() if v.default in (None, attr.NOTHING) })) bentoml_cattr.register_structure_hook_factory( - lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), lambda cls: make_dict_structure_fn( - cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens'))) + lambda cls: attr.has(cls) and lenient_issubclass(cls, SamplingParams), + lambda cls: make_dict_structure_fn(cls, bentoml_cattr, _cattrs_forbid_extra_keys=True, max_new_tokens=override(rename='max_tokens'))) # cached it here to save one lookup per assignment _object_getattribute = object.__getattribute__ @@ -607,12 +553,10 @@ _transformed_type: DictStrAny = { use_default_converter=False, type=_transformed_type.get(k, ann), metadata={'target': f'__openllm_{k}__'}, - description=f'ModelSettings field for {k}.')) - for k, ann in t.get_type_hints(ModelSettings).items() + description=f'ModelSettings field for {k}.')) for k, ann in t.get_type_hints(ModelSettings).items() ]) class _ModelSettingsAttr: '''Internal attrs representation of ModelSettings.''' - def __getitem__(self, key: str) -> t.Any: if key in codegen.get_annotations(ModelSettings): return _object_getattribute(self, key) @@ -676,8 +620,7 @@ def get_default_backend(backend_mapping: dict[LiteralResourceSpec, LiteralBacken def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ModelSettingsAttr: if 'generation_class' in cl_.__config__: raise ValueError( - f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead." 
- ) + f"'generation_class' shouldn't be defined in '__config__', rather defining all required attributes under '{cl_}.GenerationConfig' instead.") required_fields = {k for k, ann in t.get_type_hints(ModelSettings).items() if t.get_origin(ann) is Required} if any(i not in cl_.__config__ for i in required_fields): @@ -689,11 +632,9 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ _final_value_dct: DictStrAny = {} if not has_custom_name: - _final_value_dct['model_name'] = inflection.underscore( - _cl_name) if _settings_attr['name_type'] == 'dasherize' else _cl_name.lower() + _final_value_dct['model_name'] = inflection.underscore(_cl_name) if _settings_attr['name_type'] == 'dasherize' else _cl_name.lower() _final_value_dct['start_name'] = inflection.dasherize( - _final_value_dct['model_name'] - ) if _settings_attr['name_type'] == 'dasherize' else _final_value_dct['model_name'] + _final_value_dct['model_name']) if _settings_attr['name_type'] == 'dasherize' else _final_value_dct['model_name'] model_name = _final_value_dct['model_name'] if 'model_name' in _final_value_dct else _settings_attr.model_name # if the default implementation dependencies doesn't exist, then always fallback to 'pt' @@ -703,9 +644,7 @@ def structure_settings(cl_: type[LLMConfig], cls: type[_ModelSettingsAttr]) -> _ if not BACKENDS_MAPPING[library_stub][0](): default_backend[rs] = 'pt' _final_value_dct['default_backend'] = default_backend - env = openllm_core.utils.EnvVarMixin(model_name, - backend=get_default_backend(default_backend), - model_id=_settings_attr.default_id) + env = openllm_core.utils.EnvVarMixin(model_name, backend=get_default_backend(default_backend), model_id=_settings_attr.default_id) _final_value_dct['env'] = env _final_value_dct['service_name'] = f'generated_{model_name}_service.py' @@ -729,16 +668,10 @@ bentoml_cattr.register_structure_hook(_ModelSettingsAttr, structure_settings) def _setattr_class(attr_name: str, value_var: t.Any) -> str: return f"setattr(cls, '{attr_name}', {value_var})" -def _make_assignment_script(cls: type[LLMConfig], - attributes: attr.AttrsInstance, - _prefix: LiteralString = 'openllm') -> t.Callable[..., None]: +def _make_assignment_script(cls: type[LLMConfig], attributes: attr.AttrsInstance, _prefix: LiteralString = 'openllm') -> t.Callable[..., None]: '''Generate the assignment script with prefix attributes __openllm___.''' args: ListStr = [] - globs: DictStrAny = { - 'cls': cls, - '_cached_attribute': attributes, - '_cached_getattribute_get': _object_getattribute.__get__ - } + globs: DictStrAny = {'cls': cls, '_cached_attribute': attributes, '_cached_getattribute_get': _object_getattribute.__get__} annotations: DictStrAny = {'return': None} lines: ListStr = [] @@ -748,18 +681,12 @@ def _make_assignment_script(cls: type[LLMConfig], lines.append(_setattr_class(arg_name, attr_name)) annotations[attr_name] = field.type - return codegen.generate_function(cls, - '__assign_attr', - lines, - args=('cls', *args), - globs=globs, - annotations=annotations) + return codegen.generate_function(cls, '__assign_attr', lines, args=('cls', *args), globs=globs, annotations=annotations) _reserved_namespace = {'__config__', 'GenerationConfig', 'SamplingParams'} @attr.define(slots=True) class _ConfigAttr: - @staticmethod def Field(default: t.Any = None, **attrs: t.Any) -> t.Any: '''Field is a alias to the internal dantic utilities to easily create @@ -825,7 +752,6 @@ class _ConfigAttr: '''The result generated SamplingParams class for this LLMConfig. 
This will be used to create arguments for vLLM LLMEngine that can be used throughout the lifecycle. This class will also be managed internally by OpenLLM.''' - def __attrs_init__(self, *args: t.Any, **attrs: t.Any) -> None: '''Generated __attrs_init__ for LLMConfig subclass that follows the attrs contract.''' @@ -917,8 +843,7 @@ class _ConfigBuilder: It takes `these` arguments as a fully parsed attr.Attribute[t.Any] from __init_subclass__ """ - __slots__ = ('_cls', '_cls_dict', '_attr_names', '_attrs', '_model_name', '_base_attr_map', '_base_names', - '_has_pre_init', '_has_post_init') + __slots__ = ('_cls', '_cls_dict', '_attr_names', '_attrs', '_model_name', '_base_attr_map', '_base_names', '_has_pre_init', '_has_post_init') def __init__(self, cls: type[LLMConfig], @@ -931,8 +856,7 @@ class _ConfigBuilder: auto_attribs, kw_only, collect_by_mro, - field_transformer=codegen.make_env_transformer( - cls, cls.__openllm_model_name__)) + field_transformer=codegen.make_env_transformer(cls, cls.__openllm_model_name__)) self._cls, self._model_name, self._cls_dict, self._attrs, self._base_names, self._base_attr_map = cls, cls.__openllm_model_name__, dict( cls.__dict__), attrs, {a.name for a in base_attrs}, base_attr_map self._attr_names = tuple(a.name for a in attrs) @@ -957,13 +881,11 @@ class _ConfigBuilder: existing_slots: DictStrAny = {} for base_cls in self._cls.__mro__[1:-1]: if base_cls.__dict__.get('__weakref__', None) is not None: weakref_inherited = True - existing_slots.update( - {name: getattr(base_cls, name, codegen._sentinel) for name in getattr(base_cls, '__slots__', [])}) + existing_slots.update({name: getattr(base_cls, name, codegen._sentinel) for name in getattr(base_cls, '__slots__', [])}) names = self._attr_names base_names = set(self._base_names) - if '__weakref__' not in getattr(self._cls, '__slots__', - ()) and '__weakref__' not in names and not weakref_inherited: + if '__weakref__' not in getattr(self._cls, '__slots__', ()) and '__weakref__' not in names and not weakref_inherited: names += ('__weakref__',) # We only add the names of attributes that aren't inherited. # Setting __slots__ to inherited attributes wastes memory. @@ -1022,16 +944,14 @@ class _ConfigBuilder: def add_attrs_init(self) -> Self: self._cls_dict['__attrs_init__'] = codegen.add_method_dunders( self._cls, - _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, - self._base_attr_map, False, None, True)) + _make_init(self._cls, self._attrs, self._has_pre_init, self._has_post_init, False, True, False, self._base_attr_map, False, None, True)) return self def add_repr(self) -> Self: for key, fn in ReprMixin.__dict__.items(): if key in ('__repr__', '__str__', '__repr_name__', '__repr_str__', '__repr_args__'): self._cls_dict[key] = codegen.add_method_dunders(self._cls, fn) - self._cls_dict['__repr_keys__'] = property( - lambda _: {i.name for i in self._attrs} | {'generation_config', 'sampling_config'}) + self._cls_dict['__repr_keys__'] = property(lambda _: {i.name for i in self._attrs} | {'generation_config', 'sampling_config'}) return self @attr.define(slots=True, init=False) @@ -1124,7 +1044,6 @@ class LLMConfig(_ConfigAttr): Future work: - Support pydantic-core as validation backend. """ - def __init_subclass__(cls, **_: t.Any): """The purpose of this ``__init_subclass__`` is to offer pydantic UX while adhering to attrs contract. @@ -1144,10 +1063,7 @@ class LLMConfig(_ConfigAttr): # auto assignment attributes generated from __config__ after create the new slot class. 
_make_assignment_script(cls, bentoml_cattr.structure(cls, _ModelSettingsAttr))(cls) - def _make_subclass(class_attr: str, - base: type[At], - globs: dict[str, t.Any] | None = None, - suffix_env: LiteralString | None = None) -> type[At]: + def _make_subclass(class_attr: str, base: type[At], globs: dict[str, t.Any] | None = None, suffix_env: LiteralString | None = None) -> type[At]: camel_name = cls.__name__.replace('Config', '') klass = attr.make_class(f'{camel_name}{class_attr}', [], bases=(base,), @@ -1162,8 +1078,7 @@ class LLMConfig(_ConfigAttr): cls.__openllm_model_name__, suffix=suffix_env, globs=globs, - default_callback=lambda field_name, field_default: getattr( - getattr(cls, class_attr), field_name, field_default) + default_callback=lambda field_name, field_default: getattr(getattr(cls, class_attr), field_name, field_default) if codegen.has_own_attribute(cls, class_attr) else field_default)) # For pickling to work, the __module__ variable needs to be set to the # frame where the class is created. This respect the module that is created from cls @@ -1195,13 +1110,11 @@ class LLMConfig(_ConfigAttr): unannotated = ca_names - annotated_names if len(unannotated) > 0: missing_annotated = sorted(unannotated, key=lambda n: t.cast('_CountingAttr', cd.get(n)).counter) - raise openllm_core.exceptions.MissingAnnotationAttributeError( - f"The following field doesn't have a type annotation: {missing_annotated}") + raise openllm_core.exceptions.MissingAnnotationAttributeError(f"The following field doesn't have a type annotation: {missing_annotated}") # We need to set the accepted key before generation_config # as generation_config is a special field that users shouldn't pass. - cls.__openllm_accepted_keys__ = set(these.keys()) | { - a.name for a in attr.fields(cls.__openllm_generation_class__) - } | {a.name for a in attr.fields(cls.__openllm_sampling_class__)} + cls.__openllm_accepted_keys__ = set(these.keys()) | {a.name for a in attr.fields(cls.__openllm_generation_class__) + } | {a.name for a in attr.fields(cls.__openllm_sampling_class__)} cls = _ConfigBuilder(cls, these).add_attrs_init().add_repr().build_class() # Finally, resolve the types @@ -1214,10 +1127,9 @@ class LLMConfig(_ConfigAttr): cls = attr.resolve_types(cls, globalns=globs) # the hint cache for easier access cls.__openllm_hints__ = { - f.name: f.type for ite in - [attr.fields(cls), - attr.fields(cls.__openllm_generation_class__), - attr.fields(cls.__openllm_sampling_class__),] for f in ite + f.name: f.type + for ite in [attr.fields(cls), attr.fields(cls.__openllm_generation_class__), + attr.fields(cls.__openllm_sampling_class__)] for f in ite } # for pickling to work, need to set the module to the correct outer frame @@ -1233,19 +1145,13 @@ class LLMConfig(_ConfigAttr): ) super().__setattr__(attr, value) - def __init__(self, - *, - generation_config: DictStrAny | None = None, - __openllm_extras__: DictStrAny | None = None, - **attrs: t.Any): + def __init__(self, *, generation_config: DictStrAny | None = None, __openllm_extras__: DictStrAny | None = None, **attrs: t.Any): # create a copy of the keys as cache _cached_keys = tuple(attrs.keys()) _generation_cl_dict = attr.fields_dict(self.__openllm_generation_class__) if generation_config is None: generation_config = {k: v for k, v in attrs.items() if k in _generation_cl_dict} else: - generation_config = config_merger.merge(generation_config, { - k: v for k, v in attrs.items() if k in _generation_cl_dict - }) + generation_config = config_merger.merge(generation_config, {k: v for k, 
v in attrs.items() if k in _generation_cl_dict}) sampling_config = {k: v for k, v in attrs.items() if k in attr.fields_dict(self.__openllm_sampling_class__)} for k in _cached_keys: @@ -1432,8 +1338,7 @@ class LLMConfig(_ConfigAttr): if item is None: raise TypeError(f"{self} doesn't understand how to index None.") item = inflection.underscore(item) if item in _reserved_namespace: - raise ForbiddenAttributeError( - f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified.") + raise ForbiddenAttributeError(f"'{item}' is a reserved namespace for {self.__class__} and should not be access nor modified.") internal_attributes = f'__openllm_{item}__' if hasattr(self, internal_attributes): return getattr(self, internal_attributes) elif hasattr(self, item): return getattr(self, item) @@ -1448,8 +1353,7 @@ class LLMConfig(_ConfigAttr): def __getattribute__(self, item: str) -> t.Any: if item in _reserved_namespace: - raise ForbiddenAttributeError( - f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified.") + raise ForbiddenAttributeError(f"'{item}' belongs to a private namespace for {self.__class__} and should not be access nor modified.") return _object_getattribute.__get__(self)(item) def __len__(self) -> int: @@ -1461,13 +1365,12 @@ class LLMConfig(_ConfigAttr): def values(self) -> list[t.Any]: return ([getattr(self, k.name) for k in attr.fields(self.__class__)] + [getattr(self.generation_config, k.name) for k in attr.fields(self.__openllm_generation_class__)] + - [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + - list(self.__openllm_extras__.values())) + [getattr(self.sampling_config, k.name) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.values())) def items(self) -> list[tuple[str, t.Any]]: - return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + [ - (k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__) - ] + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + + return ([(k.name, getattr(self, k.name)) for k in attr.fields(self.__class__)] + + [(k.name, getattr(self.generation_config, k.name)) for k in attr.fields(self.__openllm_generation_class__)] + + [(k.name, getattr(self.sampling_config, k.name)) for k in attr.fields(self.__openllm_sampling_class__)] + list(self.__openllm_extras__.items())) def __iter__(self) -> t.Iterator[str]: @@ -1617,8 +1520,7 @@ class LLMConfig(_ConfigAttr): f = dantic.attrs_to_options(name, field, cls.__openllm_model_name__, typ=ty, suffix_sampling=True)(f) f = cog.optgroup.group(f'{cls.__openllm_sampling_class__.__name__} sampling options')(f) - total_keys = set(attr.fields_dict(cls.__openllm_generation_class__)) | set( - attr.fields_dict(cls.__openllm_sampling_class__)) + total_keys = set(attr.fields_dict(cls.__openllm_generation_class__)) | set(attr.fields_dict(cls.__openllm_sampling_class__)) if len(cls.__openllm_accepted_keys__.difference(total_keys)) == 0: return t.cast('click.Command', f) # We pop out 'generation_config' as it is a attribute that we don't need to expose to CLI. 
@@ -1637,8 +1539,7 @@ class LLMConfig(_ConfigAttr): @classmethod def default_backend(cls) -> LiteralBackend: - return first_not_none(cls.__openllm_env__['backend_value'], - default=get_default_backend(cls.__openllm_default_backend__)) + return first_not_none(cls.__openllm_env__['backend_value'], default=get_default_backend(cls.__openllm_default_backend__)) def sanitize_parameters(self, prompt: str, **attrs: t.Any) -> tuple[str, DictStrAny, DictStrAny]: '''This handler will sanitize all attrs and setup prompt text. @@ -1694,6 +1595,4 @@ def structure_llm_config(data: t.Any, cls: type[LLMConfig]) -> LLMConfig: bentoml_cattr.register_structure_hook_func(lambda cls: lenient_issubclass(cls, LLMConfig), structure_llm_config) openllm_home = os.path.expanduser( - os.environ.get( - 'OPENLLM_HOME', - os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm'))) + os.environ.get('OPENLLM_HOME', os.path.join(os.environ.get('XDG_CACHE_HOME', os.path.join(os.path.expanduser('~'), '.cache')), 'openllm'))) diff --git a/openllm-core/src/openllm_core/_prompt.py b/openllm-core/src/openllm_core/_prompt.py index ba9b2dbb..d0e64caf 100644 --- a/openllm-core/src/openllm_core/_prompt.py +++ b/openllm-core/src/openllm_core/_prompt.py @@ -4,13 +4,11 @@ import typing as t class PromptFormatter(string.Formatter): """This PromptFormatter is largely based on langchain's implementation.""" - def vformat(self, format_string: str, args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> t.Any: if len(args) > 0: raise ValueError('Positional arguments are not supported') return super().vformat(format_string, args, kwargs) - def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, - t.Any]) -> None: + def check_unused_args(self, used_args: set[int | str], args: t.Sequence[t.Any], kwargs: t.Mapping[str, t.Any]) -> None: extras = set(kwargs).difference(used_args) if extras: raise KeyError(f'Extra params passed: {extras}') @@ -26,8 +24,7 @@ def process_prompt(prompt: str, template: str | None = None, use_prompt_template template_variables = default_formatter.extract_template_variables(template) prompt_variables = {k: v for k, v in attrs.items() if k in template_variables} if 'instruction' in prompt_variables: - raise RuntimeError( - "'instruction' should be passed as the first argument instead of kwargs when 'use_prompt_template=True'") + raise RuntimeError("'instruction' should be passed as the first argument instead of kwargs when 'use_prompt_template=True'") try: return template.format(instruction=prompt, **prompt_variables) except KeyError as e: diff --git a/openllm-core/src/openllm_core/_schema.py b/openllm-core/src/openllm_core/_schema.py index 9e28451b..b7396507 100644 --- a/openllm-core/src/openllm_core/_schema.py +++ b/openllm-core/src/openllm_core/_schema.py @@ -21,11 +21,7 @@ class GenerationInput: adapter_name: str | None = attr.field(default=None) def model_dump(self) -> dict[str, t.Any]: - return { - 'prompt': self.prompt, - 'llm_config': self.llm_config.model_dump(flatten=True), - 'adapter_name': self.adapter_name - } + return {'prompt': self.prompt, 'llm_config': self.llm_config.model_dump(flatten=True), 'adapter_name': self.adapter_name} @staticmethod def convert_llm_config(data: dict[str, t.Any] | LLMConfig, cls: type[LLMConfig] | None = None) -> LLMConfig: @@ -43,15 +39,11 @@ class GenerationInput: def from_llm_config(cls, llm_config: LLMConfig) -> type[GenerationInput]: return 
attr.make_class(inflection.camelize(llm_config['model_name']) + 'GenerationInput', attrs={ - 'prompt': - attr.field(type=str), - 'llm_config': - attr.field(type=llm_config.__class__, - default=llm_config, - converter=functools.partial(cls.convert_llm_config, - cls=llm_config.__class__)), - 'adapter_name': - attr.field(default=None, type=str) + 'prompt': attr.field(type=str), + 'llm_config': attr.field(type=llm_config.__class__, + default=llm_config, + converter=functools.partial(cls.convert_llm_config, cls=llm_config.__class__)), + 'adapter_name': attr.field(default=None, type=str) }) @attr.frozen(slots=True) diff --git a/openllm-core/src/openllm_core/_strategies.py b/openllm-core/src/openllm_core/_strategies.py index a1b976c3..f6cc5d19 100644 --- a/openllm-core/src/openllm_core/_strategies.py +++ b/openllm-core/src/openllm_core/_strategies.py @@ -151,8 +151,7 @@ def _from_spec(cls: type[DynResource], spec: t.Any) -> list[str]: elif isinstance(spec, list): return [str(x) for x in spec] else: - raise TypeError( - f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.") + raise TypeError(f"'{cls.__name__}.from_spec' only supports parsing spec of type int, str, or list, got '{type(spec)}' instead.") def _raw_device_uuid_nvml() -> list[str] | None: from ctypes import CDLL @@ -278,10 +277,8 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): TODO: Support CloudTPUResource """ - @classmethod - def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, - workers_per_resource: float) -> int: + def get_worker_count(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: float) -> int: '''Return the number of workers to be used for the given runnable class. Note that for all available GPU, the number of workers will always be 1. @@ -313,8 +310,8 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): ) @classmethod - def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, - workers_per_resource: int | float, worker_index: int) -> dict[str, t.Any]: + def get_worker_env(cls, runnable_class: type[bentoml.Runnable], resource_request: dict[str, t.Any] | None, workers_per_resource: int | float, + worker_index: int) -> dict[str, t.Any]: '''Get worker env for this given worker_index. Args: @@ -379,19 +376,15 @@ class CascadingResourceStrategy(bentoml.Strategy, ReprMixin): # then it will round down to 2. If workers_per_source=0.6, then it will also round up to 2. assigned_resource_per_worker = round(1 / workers_per_resource) if len(gpus) < assigned_resource_per_worker: - logger.warning( - 'Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])', gpus, - worker_index, assigned_resource_per_worker) - raise IndexError( - f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}]." 
- ) + logger.warning('Failed to allocate %s GPUs for %s (number of available GPUs < assigned workers per resource [%s])', gpus, worker_index, + assigned_resource_per_worker) + raise IndexError(f"There aren't enough assigned GPU(s) for given worker id '{worker_index}' [required: {assigned_resource_per_worker}].") assigned_gpu = gpus[assigned_resource_per_worker * worker_index:assigned_resource_per_worker * (worker_index + 1)] dev = ','.join(assigned_gpu) else: idx = worker_index // workers_per_resource if idx >= len(gpus): - raise ValueError( - f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}') + raise ValueError(f'Number of available GPU ({gpus}) preceeds the given workers_per_resource {workers_per_resource}') dev = str(gpus[idx]) return dev diff --git a/openllm-core/src/openllm_core/_typing_compat.py b/openllm-core/src/openllm_core/_typing_compat.py index d28e53fd..21939026 100644 --- a/openllm-core/src/openllm_core/_typing_compat.py +++ b/openllm-core/src/openllm_core/_typing_compat.py @@ -31,11 +31,7 @@ M = t.TypeVar( bound= 't.Union[transformers.PreTrainedModel, transformers.Pipeline, transformers.TFPreTrainedModel, transformers.FlaxPreTrainedModel, vllm.LLMEngine, peft.PeftModel, autogptq.modeling.BaseGPTQForCausalLM]' ) -T = t.TypeVar( - 'T', - bound= - 't.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]' -) +T = t.TypeVar('T', bound='t.Union[transformers.PreTrainedTokenizerFast, transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerBase]') def get_literal_args(typ: t.Any) -> tuple[str, ...]: return getattr(typ, '__args__') @@ -132,7 +128,7 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]): max_latency_ms: int | None = ..., method_configs: dict[str, dict[str, int]] | None = ..., embedded: bool = False, - ) -> None: + ) -> None: ... def __call__(self, prompt: str, **attrs: t.Any) -> t.Any: @@ -163,23 +159,19 @@ class LLMRunner(bentoml.Runner, t.Generic[M, T]): ... class load_model_protocol(t.Generic[M, T], t.Protocol): - def __call__(self, llm: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: ... class load_tokenizer_protocol(t.Generic[M, T], t.Protocol): - def __call__(self, llm: LLM[M, T], **attrs: t.Any) -> T: ... _R = t.TypeVar('_R', covariant=True) class import_model_protocol(t.Generic[_R, M, T], t.Protocol): - def __call__(self, llm: LLM[M, T], *decls: t.Any, trust_remote_code: bool, **attrs: t.Any) -> _R: ... class llm_post_init_protocol(t.Generic[M, T], t.Protocol): - def __call__(self, llm: LLM[M, T]) -> T: ... 
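The CascadingResourceStrategy hunks a little further up (in _strategies.py) only rewrap the logger call and the two error messages; the GPU allocation arithmetic is untouched. A worked sketch of that arithmetic, using a hypothetical assign_gpus helper that mirrors the slicing shown in the hunk:

def assign_gpus(gpus: list[str], worker_index: int, workers_per_resource: float) -> str:
  # workers_per_resource < 1 means one worker spans several GPUs (0.5 -> 2 GPUs per worker).
  if workers_per_resource < 1:
    per_worker = round(1 / workers_per_resource)
    if len(gpus) < per_worker:
      raise IndexError(f"There aren't enough assigned GPU(s) for worker '{worker_index}' [required: {per_worker}].")
    return ','.join(gpus[per_worker * worker_index:per_worker * (worker_index + 1)])
  # workers_per_resource >= 1 means several workers share a single GPU.
  idx = int(worker_index // workers_per_resource)
  if idx >= len(gpus): raise ValueError(f'Not enough GPUs ({gpus}) for workers_per_resource={workers_per_resource}')
  return str(gpus[idx])

assert assign_gpus(['0', '1', '2', '3'], worker_index=1, workers_per_resource=0.5) == '2,3'
assert assign_gpus(['0', '1'], worker_index=3, workers_per_resource=2) == '1'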
diff --git a/openllm-core/src/openllm_core/config/configuration_auto.py b/openllm-core/src/openllm_core/config/configuration_auto.py index fde09558..85535498 100644 --- a/openllm-core/src/openllm_core/config/configuration_auto.py +++ b/openllm-core/src/openllm_core/config/configuration_auto.py @@ -24,14 +24,12 @@ if t.TYPE_CHECKING: ConfigItemsView = _odict_items[str, type[openllm_core.LLMConfig]] # NOTE: This is the entrypoint when adding new model config -CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), - ('falcon', 'FalconConfig'), ('flan_t5', 'FlanT5Config'), - ('gpt_neox', 'GPTNeoXConfig'), ('llama', 'LlamaConfig'), ('mpt', 'MPTConfig'), - ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), - ('starcoder', 'StarCoderConfig'), ('baichuan', 'BaichuanConfig')]) +CONFIG_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLMConfig'), ('dolly_v2', 'DollyV2Config'), ('falcon', 'FalconConfig'), + ('flan_t5', 'FlanT5Config'), ('gpt_neox', 'GPTNeoXConfig'), ('llama', 'LlamaConfig'), ('mpt', 'MPTConfig'), + ('opt', 'OPTConfig'), ('stablelm', 'StableLMConfig'), ('starcoder', 'StarCoderConfig'), + ('baichuan', 'BaichuanConfig')]) class _LazyConfigMapping(OrderedDict, ReprMixin): - def __init__(self, mapping: OrderedDict[LiteralString, LiteralString]): self._mapping = mapping self._extra_content: dict[str, t.Any] = {} @@ -79,32 +77,21 @@ class _LazyConfigMapping(OrderedDict, ReprMixin): CONFIG_MAPPING: dict[str, type[openllm_core.LLMConfig]] = _LazyConfigMapping(CONFIG_MAPPING_NAMES) # The below handle special alias when we call underscore to the name directly without processing camelcase first. -CONFIG_NAME_ALIASES: dict[str, str] = { - 'chat_glm': 'chatglm', - 'stable_lm': 'stablelm', - 'star_coder': 'starcoder', - 'gpt_neo_x': 'gpt_neox', -} +CONFIG_NAME_ALIASES: dict[str, str] = {'chat_glm': 'chatglm', 'stable_lm': 'stablelm', 'star_coder': 'starcoder', 'gpt_neo_x': 'gpt_neox'} class AutoConfig: - def __init__(self, *_: t.Any, **__: t.Any): - raise EnvironmentError( - 'Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.') + raise EnvironmentError('Cannot instantiate AutoConfig directly. Please use `AutoConfig.for_model(model_name)` instead.') @classmethod def for_model(cls, model_name: str, **attrs: t.Any) -> openllm_core.LLMConfig: model_name = inflection.underscore(model_name) if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name].model_construct_env(**attrs) - raise ValueError( - f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." - ) + raise ValueError(f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.") @classmethod def infer_class_from_name(cls, name: str) -> type[openllm_core.LLMConfig]: model_name = inflection.underscore(name) if model_name in CONFIG_NAME_ALIASES: model_name = CONFIG_NAME_ALIASES[model_name] if model_name in CONFIG_MAPPING: return CONFIG_MAPPING[model_name] - raise ValueError( - f"Unrecognized configuration class for {model_name}. Model name should be one of {', '.join(CONFIG_MAPPING.keys())}." - ) + raise ValueError(f"Unrecognized configuration class for {model_name}. 
Model name should be one of {', '.join(CONFIG_MAPPING.keys())}.") diff --git a/openllm-core/src/openllm_core/config/configuration_baichuan.py b/openllm-core/src/openllm_core/config/configuration_baichuan.py index 36a82fbe..dd70745e 100644 --- a/openllm-core/src/openllm_core/config/configuration_baichuan.py +++ b/openllm-core/src/openllm_core/config/configuration_baichuan.py @@ -37,24 +37,17 @@ class BaichuanConfig(openllm_core.LLMConfig): Refer to [Baichuan-7B's GitHub page](https://github.com/baichuan-inc/Baichuan-7B) for more information. """ __config__ = { - 'name_type': - 'lowercase', - 'trust_remote_code': - True, - 'timeout': - 3600000, - 'requires_gpu': - True, - 'url': - 'https://github.com/baichuan-inc/Baichuan-7B', + 'name_type': 'lowercase', + 'trust_remote_code': True, + 'timeout': 3600000, + 'requires_gpu': True, + 'url': 'https://github.com/baichuan-inc/Baichuan-7B', 'requirements': ['cpm-kernels', 'sentencepiece'], - 'architecture': - 'BaiChuanForCausalLM', - 'default_id': - 'baichuan-inc/baichuan-7b', + 'architecture': 'BaiChuanForCausalLM', + 'default_id': 'baichuan-inc/baichuan-7b', 'model_ids': [ - 'baichuan-inc/baichuan-7b', 'baichuan-inc/baichuan-13b-base', 'baichuan-inc/baichuan-13b-chat', - 'fireballoon/baichuan-vicuna-chinese-7b', 'fireballoon/baichuan-vicuna-7b', 'hiyouga/baichuan-7b-sft' + 'baichuan-inc/baichuan-7b', 'baichuan-inc/baichuan-13b-base', 'baichuan-inc/baichuan-13b-chat', 'fireballoon/baichuan-vicuna-chinese-7b', + 'fireballoon/baichuan-vicuna-7b', 'hiyouga/baichuan-7b-sft' ] } diff --git a/openllm-core/src/openllm_core/config/configuration_chatglm.py b/openllm-core/src/openllm_core/config/configuration_chatglm.py index 5831f36e..2cf948b2 100644 --- a/openllm-core/src/openllm_core/config/configuration_chatglm.py +++ b/openllm-core/src/openllm_core/config/configuration_chatglm.py @@ -41,30 +41,18 @@ class ChatGLMConfig(openllm_core.LLMConfig): Refer to [ChatGLM's GitHub page](https://github.com/THUDM/ChatGLM-6B) for more information. """ __config__ = { - 'name_type': - 'lowercase', - 'trust_remote_code': - True, - 'timeout': - 3600000, - 'requires_gpu': - True, - 'url': - 'https://github.com/THUDM/ChatGLM-6B', + 'name_type': 'lowercase', + 'trust_remote_code': True, + 'timeout': 3600000, + 'requires_gpu': True, + 'url': 'https://github.com/THUDM/ChatGLM-6B', 'requirements': ['cpm-kernels', 'sentencepiece'], - 'architecture': - 'ChatGLMForConditionalGeneration', - 'default_id': - 'thudm/chatglm-6b', - 'model_ids': [ - 'thudm/chatglm-6b', 'thudm/chatglm-6b-int8', 'thudm/chatglm-6b-int4', 'thudm/chatglm2-6b', - 'thudm/chatglm2-6b-int4' - ] + 'architecture': 'ChatGLMForConditionalGeneration', + 'default_id': 'thudm/chatglm-6b', + 'model_ids': ['thudm/chatglm-6b', 'thudm/chatglm-6b-int8', 'thudm/chatglm-6b-int4', 'thudm/chatglm2-6b', 'thudm/chatglm2-6b-int4'] } retain_history: bool = dantic.Field( - False, - description= - 'Whether to retain history given to the model. If set to True, then the model will retain given history.') + False, description='Whether to retain history given to the model. 
If set to True, then the model will retain given history.') use_half_precision: bool = dantic.Field(True, description='Whether to use half precision for model.') class GenerationConfig: diff --git a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py index 6ab24a99..42078ad9 100644 --- a/openllm-core/src/openllm_core/config/configuration_dolly_v2.py +++ b/openllm-core/src/openllm_core/config/configuration_dolly_v2.py @@ -105,6 +105,5 @@ class DollyV2Config(openllm_core.LLMConfig): **attrs }, {} - def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal['generated_text'], str]], - **_: t.Any) -> str: + def postprocess_generate(self, prompt: str, generation_result: list[dict[t.Literal['generated_text'], str]], **_: t.Any) -> str: return generation_result[0]['generated_text'] diff --git a/openllm-core/src/openllm_core/config/configuration_falcon.py b/openllm-core/src/openllm_core/config/configuration_falcon.py index 84dd88e8..d18df361 100644 --- a/openllm-core/src/openllm_core/config/configuration_falcon.py +++ b/openllm-core/src/openllm_core/config/configuration_falcon.py @@ -39,21 +39,14 @@ class FalconConfig(openllm_core.LLMConfig): Refer to [Falcon's HuggingFace page](https://huggingface.co/tiiuae/falcon-7b) for more information. """ __config__ = { - 'name_type': - 'lowercase', - 'trust_remote_code': - True, - 'requires_gpu': - True, - 'timeout': - int(36e6), - 'url': - 'https://falconllm.tii.ae/', + 'name_type': 'lowercase', + 'trust_remote_code': True, + 'requires_gpu': True, + 'timeout': int(36e6), + 'url': 'https://falconllm.tii.ae/', 'requirements': ['einops', 'xformers'], - 'architecture': - 'FalconForCausalLM', - 'default_id': - 'tiiuae/falcon-7b', + 'architecture': 'FalconForCausalLM', + 'default_id': 'tiiuae/falcon-7b', 'model_ids': ['tiiuae/falcon-7b', 'tiiuae/falcon-40b', 'tiiuae/falcon-7b-instruct', 'tiiuae/falcon-40b-instruct'], 'fine_tune_strategies': ({ 'adapter_type': 'lora', diff --git a/openllm-core/src/openllm_core/config/configuration_flan_t5.py b/openllm-core/src/openllm_core/config/configuration_flan_t5.py index 1c5eddc9..aa027ac4 100644 --- a/openllm-core/src/openllm_core/config/configuration_flan_t5.py +++ b/openllm-core/src/openllm_core/config/configuration_flan_t5.py @@ -40,18 +40,11 @@ class FlanT5Config(openllm_core.LLMConfig): Refer to [FLAN-T5's page](https://huggingface.co/docs/transformers/model_doc/flan-t5) for more information. 
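configuration_auto.py a little further up is the registry being reformatted: CONFIG_MAPPING_NAMES maps a normalised model name to its config class, CONFIG_NAME_ALIASES catches alternative underscored spellings, and AutoConfig.for_model resolves a user-supplied name through both. A minimal sketch of that lookup flow, with a toy registry in place of the real lazily imported classes:

import inflection  # the real registry uses the same library for name normalisation

REGISTRY = {'chatglm': 'ChatGLMConfig', 'flan_t5': 'FlanT5Config'}  # toy stand-in for CONFIG_MAPPING
ALIASES = {'chat_glm': 'chatglm'}                                   # mirrors CONFIG_NAME_ALIASES

def resolve_config_name(name: str) -> str:
  model_name = inflection.underscore(name)          # 'ChatGLM' -> 'chat_glm', 'FlanT5' -> 'flan_t5'
  model_name = ALIASES.get(model_name, model_name)  # 'chat_glm' -> 'chatglm'
  if model_name in REGISTRY: return REGISTRY[model_name]
  raise ValueError(f"Unrecognized configuration class for {model_name}. Should be one of {', '.join(REGISTRY)}.")

assert resolve_config_name('ChatGLM') == 'ChatGLMConfig'
assert resolve_config_name('flan_t5') == 'FlanT5Config'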
""" __config__ = { - 'url': - 'https://huggingface.co/docs/transformers/model_doc/flan-t5', - 'architecture': - 'T5ForConditionalGeneration', - 'model_type': - 'seq2seq_lm', - 'default_id': - 'google/flan-t5-large', - 'model_ids': [ - 'google/flan-t5-small', 'google/flan-t5-base', 'google/flan-t5-large', 'google/flan-t5-xl', - 'google/flan-t5-xxl', - ] + 'url': 'https://huggingface.co/docs/transformers/model_doc/flan-t5', + 'architecture': 'T5ForConditionalGeneration', + 'model_type': 'seq2seq_lm', + 'default_id': 'google/flan-t5-large', + 'model_ids': ['google/flan-t5-small', 'google/flan-t5-base', 'google/flan-t5-large', 'google/flan-t5-xl', 'google/flan-t5-xxl'] } class GenerationConfig: diff --git a/openllm-core/src/openllm_core/config/configuration_llama.py b/openllm-core/src/openllm_core/config/configuration_llama.py index b96a9785..ef01c1cb 100644 --- a/openllm-core/src/openllm_core/config/configuration_llama.py +++ b/openllm-core/src/openllm_core/config/configuration_llama.py @@ -66,31 +66,24 @@ class LlamaConfig(openllm_core.LLMConfig): Refer to [Llama's model card](https://huggingface.co/docs/transformers/main/model_doc/llama) for more information. """ - use_llama2_prompt: bool = dantic.Field( - False, description='Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.') + use_llama2_prompt: bool = dantic.Field(False, description='Whether to use the prompt format for Llama 2. Disable this when working with Llama 1.') __config__ = { - 'name_type': - 'lowercase', - 'url': - 'https://github.com/facebookresearch/llama', + 'name_type': 'lowercase', + 'url': 'https://github.com/facebookresearch/llama', 'default_backend': { 'cpu': 'pt', 'nvidia.com/gpu': 'pt' }, - 'architecture': - 'LlamaForCausalLM', + 'architecture': 'LlamaForCausalLM', 'requirements': ['fairscale', 'sentencepiece'], - 'tokenizer_class': - 'LlamaTokenizerFast', - 'default_id': - 'NousResearch/llama-2-7b-hf', + 'tokenizer_class': 'LlamaTokenizerFast', + 'default_id': 'NousResearch/llama-2-7b-hf', 'model_ids': [ - 'meta-llama/Llama-2-70b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-7b-chat-hf', - 'meta-llama/Llama-2-70b-hf', 'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-7b-hf', - 'NousResearch/llama-2-70b-chat-hf', 'NousResearch/llama-2-13b-chat-hf', 'NousResearch/llama-2-7b-chat-hf', - 'NousResearch/llama-2-70b-hf', 'NousResearch/llama-2-13b-hf', 'NousResearch/llama-2-7b-hf', - 'openlm-research/open_llama_7b_v2', 'openlm-research/open_llama_3b_v2', 'openlm-research/open_llama_13b', - 'huggyllama/llama-65b', 'huggyllama/llama-30b', 'huggyllama/llama-13b', 'huggyllama/llama-7b' + 'meta-llama/Llama-2-70b-chat-hf', 'meta-llama/Llama-2-13b-chat-hf', 'meta-llama/Llama-2-7b-chat-hf', 'meta-llama/Llama-2-70b-hf', + 'meta-llama/Llama-2-13b-hf', 'meta-llama/Llama-2-7b-hf', 'NousResearch/llama-2-70b-chat-hf', 'NousResearch/llama-2-13b-chat-hf', + 'NousResearch/llama-2-7b-chat-hf', 'NousResearch/llama-2-70b-hf', 'NousResearch/llama-2-13b-hf', 'NousResearch/llama-2-7b-hf', + 'openlm-research/open_llama_7b_v2', 'openlm-research/open_llama_3b_v2', 'openlm-research/open_llama_13b', 'huggyllama/llama-65b', + 'huggyllama/llama-30b', 'huggyllama/llama-13b', 'huggyllama/llama-7b' ], 'fine_tune_strategies': ({ 'adapter_type': 'lora', @@ -120,15 +113,14 @@ class LlamaConfig(openllm_core.LLMConfig): use_default_prompt_template: bool = False, use_llama2_prompt: bool = True, **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: - return process_prompt( - prompt, - 
DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1') if use_default_prompt_template else None, - use_default_prompt_template, **attrs), { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature, - 'top_p': top_p, - 'top_k': top_k - }, {} + return process_prompt(prompt, + DEFAULT_PROMPT_TEMPLATE('v2' if use_llama2_prompt else 'v1') if use_default_prompt_template else None, + use_default_prompt_template, **attrs), { + 'max_new_tokens': max_new_tokens, + 'temperature': temperature, + 'top_p': top_p, + 'top_k': top_k + }, {} def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/config/configuration_mpt.py b/openllm-core/src/openllm_core/config/configuration_mpt.py index 789264ee..1105a95a 100644 --- a/openllm-core/src/openllm_core/config/configuration_mpt.py +++ b/openllm-core/src/openllm_core/config/configuration_mpt.py @@ -44,12 +44,7 @@ _chat_prompt, _default_prompt, _instruct_prompt = '''{instruction}''', '''{instr {instruction} {response_key} '''.format(intro=INTRO_BLURB, instruction_key=INSTRUCTION_KEY, instruction='{instruction}', response_key=RESPONSE_KEY) -PROMPT_MAPPING = { - 'default': _default_prompt, - 'instruct': _instruct_prompt, - 'storywriter': _default_prompt, - 'chat': _chat_prompt -} +PROMPT_MAPPING = {'default': _default_prompt, 'instruct': _instruct_prompt, 'storywriter': _default_prompt, 'chat': _chat_prompt} def _get_prompt(model_type: str) -> str: return PROMPT_MAPPING[model_type] @@ -66,27 +61,20 @@ class MPTConfig(openllm_core.LLMConfig): for more details on specific models. """ __config__ = { - 'name_type': - 'lowercase', - 'trust_remote_code': - True, - 'url': - 'https://huggingface.co/mosaicml', - 'timeout': - int(36e6), + 'name_type': 'lowercase', + 'trust_remote_code': True, + 'url': 'https://huggingface.co/mosaicml', + 'timeout': int(36e6), 'requirements': ['triton', 'einops'], - 'architecture': - 'MPTForCausalLM', - 'default_id': - 'mosaicml/mpt-7b-instruct', + 'architecture': 'MPTForCausalLM', + 'default_id': 'mosaicml/mpt-7b-instruct', 'model_ids': [ - 'mosaicml/mpt-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-storywriter', - 'mosaicml/mpt-30b', 'mosaicml/mpt-30b-instruct', 'mosaicml/mpt-30b-chat' + 'mosaicml/mpt-7b', 'mosaicml/mpt-7b-instruct', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-storywriter', 'mosaicml/mpt-30b', + 'mosaicml/mpt-30b-instruct', 'mosaicml/mpt-30b-chat' ] } - prompt_type: MPTPromptType = dantic.Field( - '"default"', - description='Given prompt type for running MPT. Default will be inferred from model name if pretrained.') + prompt_type: MPTPromptType = dantic.Field('"default"', + description='Given prompt type for running MPT. 
Default will be inferred from model name if pretrained.') max_sequence_length: int = dantic.Field( 2048, description= @@ -106,7 +94,7 @@ class MPTConfig(openllm_core.LLMConfig): prompt_type: MPTPromptType | None = None, use_default_prompt_template: bool = True, **attrs: t.Any, - ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: + ) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: _template = None if use_default_prompt_template: if prompt_type is None: diff --git a/openllm-core/src/openllm_core/config/configuration_opt.py b/openllm-core/src/openllm_core/config/configuration_opt.py index 2ddf0cdc..bf362654 100644 --- a/openllm-core/src/openllm_core/config/configuration_opt.py +++ b/openllm-core/src/openllm_core/config/configuration_opt.py @@ -44,20 +44,12 @@ class OPTConfig(openllm_core.LLMConfig): Refer to [OPT's HuggingFace page](https://huggingface.co/docs/transformers/model_doc/opt) for more information. """ __config__ = { - 'name_type': - 'lowercase', - 'trust_remote_code': - False, - 'url': - 'https://huggingface.co/docs/transformers/model_doc/opt', - 'default_id': - 'facebook/opt-1.3b', - 'architecture': - 'OPTForCausalLM', - 'model_ids': [ - 'facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b', - 'facebook/opt-66b' - ], + 'name_type': 'lowercase', + 'trust_remote_code': False, + 'url': 'https://huggingface.co/docs/transformers/model_doc/opt', + 'default_id': 'facebook/opt-1.3b', + 'architecture': 'OPTForCausalLM', + 'model_ids': ['facebook/opt-125m', 'facebook/opt-350m', 'facebook/opt-1.3b', 'facebook/opt-2.7b', 'facebook/opt-6.7b', 'facebook/opt-66b'], 'fine_tune_strategies': ({ 'adapter_type': 'lora', 'r': 16, @@ -67,8 +59,7 @@ class OPTConfig(openllm_core.LLMConfig): 'bias': 'none' },) } - format_outputs: bool = dantic.Field( - False, description='''Whether to format the outputs. This can be used when num_return_sequences > 1.''') + format_outputs: bool = dantic.Field(False, description='''Whether to format the outputs. This can be used when num_return_sequences > 1.''') class GenerationConfig: top_k: int = 15 diff --git a/openllm-core/src/openllm_core/config/configuration_stablelm.py b/openllm-core/src/openllm_core/config/configuration_stablelm.py index df7afb6f..6dcd50ed 100644 --- a/openllm-core/src/openllm_core/config/configuration_stablelm.py +++ b/openllm-core/src/openllm_core/config/configuration_stablelm.py @@ -47,17 +47,13 @@ class StableLMConfig(openllm_core.LLMConfig): for more information. 
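The Llama and MPT hunks above (and the StableLM one that follows) all reshape the same sanitize_parameters contract: the method returns a (prompt_text, generate_kwargs, postprocess_kwargs) triple, optionally pushing the prompt through a default template first. A bare-bones sketch of that contract, with a made-up template standing in for the real DEFAULT_PROMPT_TEMPLATE:

from __future__ import annotations
import typing as t

_TEMPLATE = '### Instruction:\n{instruction}\n### Response:\n'  # hypothetical template, not the real one

def sanitize_parameters(prompt: str,
                        max_new_tokens: int = 256,
                        temperature: float = 0.75,
                        use_default_prompt_template: bool = True,
                        **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]:
  # First element: the (possibly templated) prompt; second: kwargs forwarded to generate();
  # third: kwargs forwarded to postprocess_generate().
  prompt_text = _TEMPLATE.format(instruction=prompt) if use_default_prompt_template else prompt
  return prompt_text, {'max_new_tokens': max_new_tokens, 'temperature': temperature}, {}

text, generate_kwargs, postprocess_kwargs = sanitize_parameters('What is OpenLLM?')
assert text.startswith('### Instruction:') and generate_kwargs['max_new_tokens'] == 256 and postprocess_kwargs == {}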
""" __config__ = { - 'name_type': - 'lowercase', - 'url': - 'https://github.com/Stability-AI/StableLM', - 'architecture': - 'GPTNeoXForCausalLM', - 'default_id': - 'stabilityai/stablelm-tuned-alpha-3b', + 'name_type': 'lowercase', + 'url': 'https://github.com/Stability-AI/StableLM', + 'architecture': 'GPTNeoXForCausalLM', + 'default_id': 'stabilityai/stablelm-tuned-alpha-3b', 'model_ids': [ - 'stabilityai/stablelm-tuned-alpha-3b', 'stabilityai/stablelm-tuned-alpha-7b', - 'stabilityai/stablelm-base-alpha-3b', 'stabilityai/stablelm-base-alpha-7b' + 'stabilityai/stablelm-tuned-alpha-3b', 'stabilityai/stablelm-tuned-alpha-7b', 'stabilityai/stablelm-base-alpha-3b', + 'stabilityai/stablelm-base-alpha-7b' ] } @@ -77,19 +73,10 @@ class StableLMConfig(openllm_core.LLMConfig): **attrs: t.Any) -> tuple[str, dict[str, t.Any], dict[str, t.Any]]: if 'tuned' in self._model_id and use_default_prompt_template: system_prompt = attrs.pop('system_prompt', SYSTEM_PROMPT) - prompt_text = process_prompt(prompt, - DEFAULT_PROMPT_TEMPLATE, - use_default_prompt_template, - system_prompt=system_prompt, - **attrs) + prompt_text = process_prompt(prompt, DEFAULT_PROMPT_TEMPLATE, use_default_prompt_template, system_prompt=system_prompt, **attrs) else: prompt_text = prompt - return prompt_text, { - 'max_new_tokens': max_new_tokens, - 'temperature': temperature, - 'top_k': top_k, - 'top_p': top_p - }, {} + return prompt_text, {'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_k': top_k, 'top_p': top_p}, {} def postprocess_generate(self, prompt: str, generation_result: list[str], **_: t.Any) -> str: return generation_result[0] diff --git a/openllm-core/src/openllm_core/utils/__init__.py b/openllm-core/src/openllm_core/utils/__init__.py index 94948144..efdfd08a 100644 --- a/openllm-core/src/openllm_core/utils/__init__.py +++ b/openllm-core/src/openllm_core/utils/__init__.py @@ -113,8 +113,7 @@ def field_env_key(key: str, suffix: str | None = None) -> str: # Special debug flag controled via OPENLLMDEVDEBUG DEBUG: bool = sys.flags.dev_mode or (not sys.flags.ignore_environment and check_bool_env(DEV_DEBUG_VAR, default=False)) # Whether to show the codenge for debug purposes -SHOW_CODEGEN: bool = DEBUG and (os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and - int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3) +SHOW_CODEGEN: bool = DEBUG and (os.environ.get(DEV_DEBUG_VAR, str(0)).isdigit() and int(os.environ.get(DEV_DEBUG_VAR, str(0))) > 3) # MYPY is like t.TYPE_CHECKING, but reserved for Mypy plugins MYPY = False @@ -125,7 +124,6 @@ def get_quiet_mode() -> bool: return not DEBUG and _get_quiet_mode() class ExceptionFilter(logging.Filter): - def __init__(self, exclude_exceptions: list[type[Exception]] | None = None, **kwargs: t.Any): '''A filter of all exception.''' if exclude_exceptions is None: exclude_exceptions = [ConflictError] @@ -142,7 +140,6 @@ class ExceptionFilter(logging.Filter): return True class InfoFilter(logging.Filter): - def filter(self, record: logging.LogRecord) -> bool: return logging.INFO <= record.levelno < logging.WARNING @@ -246,7 +243,6 @@ def compose(*funcs: AnyCallable) -> AnyCallable: >>> [f(3*x, x+1) for x in range(1,10)] [1.5, 2.0, 2.25, 2.4, 2.5, 2.571, 2.625, 2.667, 2.7] ''' - def compose_two(f1: AnyCallable, f2: AnyCallable) -> AnyCallable: return lambda *args, **kwargs: f1(f2(*args, **kwargs)) @@ -303,11 +299,7 @@ def generate_context(framework_name: str) -> _ModelContext: from bentoml._internal.frameworks.utils.tensorflow import get_tf_version framework_versions['tensorflow'] = 
get_tf_version() if openllm_core.utils.is_flax_available(): - framework_versions.update({ - 'flax': pkg.get_pkg_version('flax'), - 'jax': pkg.get_pkg_version('jax'), - 'jaxlib': pkg.get_pkg_version('jaxlib') - }) + framework_versions.update({'flax': pkg.get_pkg_version('flax'), 'jax': pkg.get_pkg_version('jax'), 'jaxlib': pkg.get_pkg_version('jaxlib')}) return _ModelContext(framework_name=framework_name, framework_versions=framework_versions) _TOKENIZER_PREFIX = '_tokenizer_' @@ -327,9 +319,7 @@ _whitelist_modules = {'pkg'} # XXX: define all classes, functions import above this line # since _extras will be the locals() import from this file. _extras: dict[str, t.Any] = { - k: v - for k, v in locals().items() - if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_')) + k: v for k, v in locals().items() if k in _whitelist_modules or (not isinstance(v, types.ModuleType) and not k.startswith('_')) } _extras['__openllm_migration__'] = {'ModelEnv': 'EnvVarMixin'} _import_structure: dict[str, list[str]] = { @@ -339,11 +329,10 @@ _import_structure: dict[str, list[str]] = { 'lazy': [], 'representation': ['ReprMixin'], 'import_utils': [ - 'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available', - 'is_einops_available', 'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available', - 'is_bitsandbytes_available', 'is_peft_available', 'is_datasets_available', 'is_transformers_supports_kbit', - 'is_transformers_supports_agent', 'is_jupyter_available', 'is_jupytext_available', 'is_notebook_available', - 'is_triton_available', 'is_autogptq_available', 'is_sentencepiece_available', 'is_xformers_available', + 'OPTIONAL_DEPENDENCIES', 'DummyMetaclass', 'EnvVarMixin', 'require_backends', 'is_cpm_kernels_available', 'is_einops_available', + 'is_flax_available', 'is_tf_available', 'is_vllm_available', 'is_torch_available', 'is_bitsandbytes_available', 'is_peft_available', + 'is_datasets_available', 'is_transformers_supports_kbit', 'is_transformers_supports_agent', 'is_jupyter_available', 'is_jupytext_available', + 'is_notebook_available', 'is_triton_available', 'is_autogptq_available', 'is_sentencepiece_available', 'is_xformers_available', 'is_fairscale_available', 'is_grpc_available', 'is_grpc_health_available', 'is_transformers_available' ] } diff --git a/openllm-core/src/openllm_core/utils/analytics.py b/openllm-core/src/openllm_core/utils/analytics.py index 3e680ccf..1388dbfc 100644 --- a/openllm-core/src/openllm_core/utils/analytics.py +++ b/openllm-core/src/openllm_core/utils/analytics.py @@ -34,7 +34,6 @@ def _usage_event_debugging() -> bool: return os.environ.get('__BENTOML_DEBUG_USAGE', str(False)).lower() == 'true' def silent(func: t.Callable[P, T]) -> t.Callable[P, T]: - @functools.wraps(func) def wrapper(*args: P.args, **kwargs: P.kwargs) -> t.Any: try: @@ -62,7 +61,6 @@ def set_bentoml_tracking() -> t.Generator[None, None, None]: os.environ[_internal_analytics.BENTOML_DO_NOT_TRACK] = original_value class EventMeta: - @property def event_name(self) -> str: # camel case to snake case diff --git a/openllm-core/src/openllm_core/utils/codegen.py b/openllm-core/src/openllm_core/utils/codegen.py index 7141bdc1..b59ff697 100644 --- a/openllm-core/src/openllm_core/utils/codegen.py +++ b/openllm-core/src/openllm_core/utils/codegen.py @@ -110,8 +110,7 @@ def generate_function(typ: type[t.Any], globs: dict[str, t.Any], annotations: dict[str, t.Any] | None = None) -> AnyCallable: from openllm_core.utils 
import SHOW_CODEGEN - script = 'def %s(%s):\n %s\n' % (func_name, ', '.join(args) if args is not None else '', - '\n '.join(lines) if lines else 'pass') + script = 'def %s(%s):\n %s\n' % (func_name, ', '.join(args) if args is not None else '', '\n '.join(lines) if lines else 'pass') meth = _make_method(func_name, script, generate_unique_filename(typ, func_name), globs) if annotations: meth.__annotations__ = annotations if SHOW_CODEGEN: print('Generated script for {typ}:\n\n', script) @@ -122,7 +121,7 @@ def make_env_transformer(cls: type[openllm_core.LLMConfig], suffix: LiteralString | None = None, default_callback: t.Callable[[str, t.Any], t.Any] | None = None, globs: DictStrAny | None = None, - ) -> AnyCallable: + ) -> AnyCallable: from openllm_core.utils import dantic from openllm_core.utils import field_env_key @@ -171,16 +170,15 @@ def gen_sdk(func: _T, name: str | None = None, **attrs: t.Any) -> _T: return t.cast( _T, functools.update_wrapper( - types.new_class( - name, (t.cast('PartialAny', functools.partial), ReprMixin), - exec_body=lambda ns: ns.update({ - '__repr_keys__': property(lambda _: [i for i in _signatures.keys() if not i.startswith('_')]), - '__repr_args__': _repr_args, - '__repr__': _repr, - '__doc__': inspect.cleandoc(doc), - '__module__': 'openllm' - }), - )(func, **attrs), func, + types.new_class(name, (t.cast('PartialAny', functools.partial), ReprMixin), + exec_body=lambda ns: ns.update({ + '__repr_keys__': property(lambda _: [i for i in _signatures.keys() if not i.startswith('_')]), + '__repr_args__': _repr_args, + '__repr__': _repr, + '__doc__': inspect.cleandoc(doc), + '__module__': 'openllm' + }), + )(func, **attrs), func, )) __all__ = ['gen_sdk', 'make_attr_tuple_class', 'make_env_transformer', 'generate_unique_filename', 'generate_function'] diff --git a/openllm-core/src/openllm_core/utils/dantic.py b/openllm-core/src/openllm_core/utils/dantic.py index 70ff9ea3..d77a2757 100644 --- a/openllm-core/src/openllm_core/utils/dantic.py +++ b/openllm-core/src/openllm_core/utils/dantic.py @@ -25,9 +25,8 @@ AnyCallable = t.Callable[..., t.Any] FC = t.TypeVar('FC', bound=t.Union[AnyCallable, click.Command]) __all__ = [ - 'FC', 'attrs_to_options', 'Field', 'parse_type', 'is_typing', 'is_literal', 'ModuleType', 'EnumChoice', - 'LiteralChoice', 'allows_multiple', 'is_mapping', 'is_container', 'parse_container_args', 'parse_single_arg', - 'CUDA', 'JsonType', 'BytesType' + 'FC', 'attrs_to_options', 'Field', 'parse_type', 'is_typing', 'is_literal', 'ModuleType', 'EnumChoice', 'LiteralChoice', 'allows_multiple', + 'is_mapping', 'is_container', 'parse_container_args', 'parse_single_arg', 'CUDA', 'JsonType', 'BytesType' ] def __dir__() -> list[str]: @@ -64,7 +63,7 @@ def attrs_to_options(name: str, help=field.metadata.get('description', '(No description provided)'), show_envvar=True, envvar=envvar, - ) + ) def env_converter(value: t.Any, env: str | None = None) -> t.Any: if env is not None: diff --git a/openllm-core/src/openllm_core/utils/import_utils.py b/openllm-core/src/openllm_core/utils/import_utils.py index 8ea867b2..2f42ca8f 100644 --- a/openllm-core/src/openllm_core/utils/import_utils.py +++ b/openllm-core/src/openllm_core/utils/import_utils.py @@ -27,9 +27,7 @@ if t.TYPE_CHECKING: BackendOrderedDict = OrderedDict[str, t.Tuple[t.Callable[[], bool], str]] logger = logging.getLogger(__name__) -OPTIONAL_DEPENDENCIES = { - 'opt', 'flan-t5', 'vllm', 'fine-tune', 'ggml', 'agents', 'openai', 'playground', 'gptq', 'grpc' -} +OPTIONAL_DEPENDENCIES = {'opt', 'flan-t5', 'vllm', 
'fine-tune', 'ggml', 'agents', 'openai', 'playground', 'gptq', 'grpc'} ENV_VARS_TRUE_VALUES = {'1', 'ON', 'YES', 'TRUE'} ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({'AUTO'}) USE_TF = os.environ.get('USE_TF', 'AUTO').upper() @@ -144,10 +142,9 @@ def is_tf_available() -> bool: _tf_version = None if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES: if _tf_available: - candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', - 'tf-nightly-gpu', 'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', - 'tensorflow-macos', 'tensorflow-aarch64', - ) + candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow', + 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos', 'tensorflow-aarch64', + ) _tf_version = None # For the metadata, we have to look for both tensorflow and tensorflow-cpu for _pkg in candidates: @@ -285,18 +282,20 @@ You can install it with pip: `pip install fairscale`. Please note that you may n your runtime after installation. ''' -BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([ - ('flax', (is_flax_available, FLAX_IMPORT_ERROR)), ('tf', (is_tf_available, TENSORFLOW_IMPORT_ERROR)), - ('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)), ('vllm', (is_vllm_available, VLLM_IMPORT_ERROR)), - ('cpm_kernels', (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), - ('einops', (is_einops_available, EINOPS_IMPORT_ERROR)), ('triton', (is_triton_available, TRITON_IMPORT_ERROR)), - ('datasets', (is_datasets_available, DATASETS_IMPORT_ERROR)), ('peft', (is_peft_available, PEFT_IMPORT_ERROR)), - ('bitsandbytes', (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), - ('auto-gptq', (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), - ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), - ('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)), - ('fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR)) -]) +BACKENDS_MAPPING: BackendOrderedDict = OrderedDict([('flax', (is_flax_available, FLAX_IMPORT_ERROR)), + ('tf', (is_tf_available, TENSORFLOW_IMPORT_ERROR)), + ('torch', (is_torch_available, PYTORCH_IMPORT_ERROR)), + ('vllm', (is_vllm_available, VLLM_IMPORT_ERROR)), + ('cpm_kernels', (is_cpm_kernels_available, CPM_KERNELS_IMPORT_ERROR)), + ('einops', (is_einops_available, EINOPS_IMPORT_ERROR)), + ('triton', (is_triton_available, TRITON_IMPORT_ERROR)), + ('datasets', (is_datasets_available, DATASETS_IMPORT_ERROR)), + ('peft', (is_peft_available, PEFT_IMPORT_ERROR)), + ('bitsandbytes', (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)), + ('auto-gptq', (is_autogptq_available, AUTOGPTQ_IMPORT_ERROR)), + ('sentencepiece', (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), + ('xformers', (is_xformers_available, XFORMERS_IMPORT_ERROR)), + ('fairscale', (is_fairscale_available, FAIRSCALE_IMPORT_ERROR))]) class DummyMetaclass(abc.ABCMeta): '''Metaclass for dummy object. 
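The utils/__init__.py hunks above mostly drop blank lines and reflow comprehensions; compose() keeps its documented behaviour of chaining callables right to left. A stand-alone sketch of that behaviour, assuming functools.reduce as the glue around the compose_two closure shown in the hunk:

from functools import reduce
import typing as t

AnyCallable = t.Callable[..., t.Any]

def compose(*funcs: AnyCallable) -> AnyCallable:
  # compose(f, g, h)(x) == f(g(h(x))): the right-most callable runs first.
  def compose_two(f1: AnyCallable, f2: AnyCallable) -> AnyCallable:
    return lambda *args, **kwargs: f1(f2(*args, **kwargs))
  return reduce(compose_two, funcs)

add1 = lambda x: x + 1
double = lambda x: 2 * x
assert compose(double, add1)(3) == 8  # double(add1(3))
assert compose(add1, double)(3) == 7  # add1(double(3))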
@@ -326,9 +325,7 @@ def require_backends(o: t.Any, backends: t.MutableSequence[str]) -> None: raise ImportError(VLLM_IMPORT_ERROR_WITH_TF.format(name)) if 'flax' not in backends and is_flax_available() and not is_vllm_available(): raise ImportError(VLLM_IMPORT_ERROR_WITH_FLAX.format(name)) - failed = [ - msg.format(name) for available, msg in (BACKENDS_MAPPING[backend] for backend in backends) if not available() - ] + failed = [msg.format(name) for available, msg in (BACKENDS_MAPPING[backend] for backend in backends) if not available()] if failed: raise ImportError(''.join(failed)) class EnvVarMixin(ReprMixin): @@ -371,11 +368,7 @@ class EnvVarMixin(ReprMixin): elif hasattr(self, item): return getattr(self, item) raise KeyError(f'Key {item} not found in {self}') - def __init__(self, - model_name: str, - backend: LiteralBackend = 'pt', - model_id: str | None = None, - quantize: LiteralString | None = None) -> None: + def __init__(self, model_name: str, backend: LiteralBackend = 'pt', model_id: str | None = None, quantize: LiteralString | None = None) -> None: '''EnvVarMixin is a mixin class that returns the value extracted from environment variables.''' from openllm_core.utils import field_env_key self.model_name = inflection.underscore(model_name) @@ -387,8 +380,7 @@ class EnvVarMixin(ReprMixin): def _quantize_value(self) -> t.Literal['int8', 'int4', 'gptq'] | None: from . import first_not_none - return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], - first_not_none(os.environ.get(self['quantize']), default=self._quantize)) + return t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], first_not_none(os.environ.get(self['quantize']), default=self._quantize)) def _backend_value(self) -> LiteralBackend: from . import first_not_none diff --git a/openllm-core/src/openllm_core/utils/lazy.py b/openllm-core/src/openllm_core/utils/lazy.py index b14c130f..b4b7131f 100644 --- a/openllm-core/src/openllm_core/utils/lazy.py +++ b/openllm-core/src/openllm_core/utils/lazy.py @@ -110,8 +110,7 @@ class LazyModule(types.ModuleType): It also contains a special case for all of the metadata information, such as __version__ and __version_info__. ''' if name in _reserved_namespace: - raise openllm_core.exceptions.ForbiddenAttributeError( - f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.") + raise openllm_core.exceptions.ForbiddenAttributeError(f"'{name}' is a reserved namespace for {self._name} and should not be access nor modified.") dunder_to_metadata = { '__title__': 'Name', '__copyright__': '', @@ -147,10 +146,9 @@ class LazyModule(types.ModuleType): if '__openllm_migration__' in self._objects: cur_value = self._objects['__openllm_migration__'].get(name, _sentinel) if cur_value is not _sentinel: - warnings.warn( - f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead", - DeprecationWarning, - stacklevel=3) + warnings.warn(f"'{name}' is deprecated and will be removed in future version. Make sure to use '{cur_value}' instead", + DeprecationWarning, + stacklevel=3) return getattr(self, cur_value) if name in self._objects: return self._objects.__getitem__(name) if name in self._modules: value = self._get_module(name) @@ -165,9 +163,7 @@ class LazyModule(types.ModuleType): try: return importlib.import_module('.' 
+ module_name, self.__name__) except Exception as e: - raise RuntimeError( - f'Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}' - ) from e + raise RuntimeError(f'Failed to import {self.__name__}.{module_name} because of the following error (look up to see its traceback):\n{e}') from e # make sure this module is picklable def __reduce__(self) -> tuple[type[LazyModule], tuple[str, str | None, dict[str, list[str]]]]: diff --git a/openllm-core/src/openllm_core/utils/representation.py b/openllm-core/src/openllm_core/utils/representation.py index 65227f78..554827a5 100644 --- a/openllm-core/src/openllm_core/utils/representation.py +++ b/openllm-core/src/openllm_core/utils/representation.py @@ -14,7 +14,6 @@ if t.TYPE_CHECKING: ReprArgs: TypeAlias = t.Generator[t.Tuple[t.Optional[str], t.Any], None, None] class ReprMixin: - @property @abstractmethod def __repr_keys__(self) -> set[str]: diff --git a/openllm-python/src/openllm/__init__.py b/openllm-python/src/openllm/__init__.py index a0772855..695ae271 100644 --- a/openllm-python/src/openllm/__init__.py +++ b/openllm-python/src/openllm/__init__.py @@ -26,14 +26,11 @@ else: # configuration for bitsandbytes before import _os.environ["BITSANDBYTES_NOWELCOME"] = _os.environ.get("BITSANDBYTES_NOWELCOME", "1") # NOTE: The following warnings from bitsandbytes, and probably not that important for users to see when DEBUG is False - _warnings.filterwarnings( - "ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization") - _warnings.filterwarnings( - "ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization") + _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization") + _warnings.filterwarnings("ignore", message="MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization") _warnings.filterwarnings("ignore", message="The installed version of bitsandbytes was compiled without GPU support.") # NOTE: ignore the following warning from ghapi as it is not important for users - _warnings.filterwarnings("ignore", - message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated") + _warnings.filterwarnings("ignore", message="Neither GITHUB_TOKEN nor GITHUB_JWT_TOKEN found: running as unauthenticated") _import_structure: dict[str, list[str]] = { "exceptions": [], @@ -48,13 +45,8 @@ _import_structure: dict[str, list[str]] = { "_quantisation": ["infer_quantisation_config"], "_embeddings": ["GenericEmbeddingRunnable"], "_llm": ["LLM", "Runner", "LLMRunner", "LLMRunnable", "EmbeddingsOutput"], - "_generation": [ - "StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", - "prepare_logits_processor" - ], - "models.auto": [ - "MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES" - ], + "_generation": ["StopSequenceCriteria", "StopOnTokens", "LogitsProcessorList", "StoppingCriteriaList", "prepare_logits_processor"], + "models.auto": ["MODEL_MAPPING_NAMES", "MODEL_FLAX_MAPPING_NAMES", "MODEL_TF_MAPPING_NAMES", "MODEL_VLLM_MAPPING_NAMES"], "models.chatglm": [], "models.baichuan": [], "models.dolly_v2": [], @@ -114,8 +106,7 @@ try: if not openllm_core.utils.is_torch_available(): raise exceptions.MissingDependencyError except exceptions.MissingDependencyError: _import_structure["utils.dummy_pt_objects"] = [ - name for name in 
dir(utils.dummy_pt_objects) - if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations") + name for name in dir(utils.dummy_pt_objects) if not name.startswith("_") and name not in ("ChatGLM", "Baichuan", "MPT", "Falcon", "annotations") ] else: _import_structure["models.flan_t5"].extend(["FlanT5"]) diff --git a/openllm-python/src/openllm/_assign.py b/openllm-python/src/openllm/_assign.py index 8026204e..8113800e 100644 --- a/openllm-python/src/openllm/_assign.py +++ b/openllm-python/src/openllm/_assign.py @@ -36,7 +36,6 @@ else: vllm = LazyLoader('vllm', globals(), 'vllm') def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[LLM[M, T]], bentoml.Model]: - @functools.wraps(fn) def inner(self: LLM[M, T], *decls: t.Any, trust_remote_code: bool | None = None, **attrs: t.Any) -> bentoml.Model: trust_remote_code = first_not_none(trust_remote_code, default=self.trust_remote_code) @@ -48,7 +47,6 @@ def import_model(fn: import_model_protocol[bentoml.Model, M, T]) -> t.Callable[[ return inner def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vllm.LLMEngine]: - @functools.wraps(fn) def inner(self: LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M | vllm.LLMEngine: if self.__llm_backend__ == 'vllm': @@ -71,7 +69,6 @@ def load_model(fn: load_model_protocol[M, T]) -> t.Callable[[LLM[M, T]], M | vll return inner def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], T]: - @functools.wraps(fn) def inner(self: LLM[M, T], **tokenizer_attrs: t.Any) -> T: return fn(self, **{**self.llm_parameters[-1], **tokenizer_attrs}) @@ -79,7 +76,6 @@ def load_tokenizer(fn: load_tokenizer_protocol[M, T]) -> t.Callable[[LLM[M, T]], return inner def llm_post_init(fn: llm_post_init_protocol[M, T]) -> t.Callable[[LLM[M, T]], None]: - @functools.wraps(fn) def inner(self: LLM[M, T]) -> None: if self.__llm_backend__ == 'pt' and is_torch_available(): @@ -98,8 +94,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N args: ListStr = [] globs: DictStrAny = {'cls': cls, '__wrapped_llm_post_init': llm_post_init, 'LLM': LLM} # _cached_LLMFunction_get and _ccached_LLMSerialisation_get - globs.update( - {f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}}) + globs.update({f'_cached_{cl_.__name__}_get': _object_getattribute.__get__(cl_) for cl_ in {LLMSerialisation, LLMFunction}}) # llm_post_init implementation lines: ListStr = [ f'_impl_{cls.__name__}_func=cls.llm_post_init', @@ -112,17 +107,13 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N globs.update({f'__serialisation_{func}': getattr(openllm.serialisation, func, None), impl_name: impl}) cached_func_name = f'_cached_{cls.__name__}_func' func_call = f"_impl_{cls.__name__}_{func}={cached_func_name} if {cached_func_name} is not _cached_LLMSerialisation_get('{func}') else __serialisation_{func}" - lines.extend([ - f'{cached_func_name}=cls.{func}', func_call, - _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})') - ]) + lines.extend([f'{cached_func_name}=cls.{func}', func_call, _setattr_class(func, f'{impl_name}(_impl_{cls.__name__}_{func})')]) # assign vLLM implementation if cls.__llm_backend__ == 'vllm': vllm_func = { f'_vllm_{it}': fn - for it, fn in zip(('generate', 'generate_iterator', - 'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate)) + for it, fn in zip(('generate', 'generate_iterator', 
'postprocess_generate'), (vllm_generate, vllm_generate_iterator, vllm_postprocess_generate)) } globs.update(vllm_func) lines.extend([_setattr_class(it[6:], it) for it in vllm_func]) @@ -141,8 +132,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N return f'__llm_supports_{key}__' bool_attr = {it[15:-2] for it in interface_anns if it.startswith('__llm_supports_')} - lines.extend( - [_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr]) + lines.extend([_setattr_class(dunder_support(fn), f"cls.{fn} is not _cached_LLMFunction_get('{fn}')") for fn in bool_attr]) return codegen.generate_function(cls, '__assign_llm_attr', @@ -154,8 +144,7 @@ def make_llm_attributes(cls: type[LLM[M, T]]) -> t.Callable[[type[LLM[M, T]]], N 'return': None }) -def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], - **_: t.Any) -> str: +def vllm_postprocess_generate(self: LLM['vllm.LLMEngine', T], prompt: str, generation_result: list[dict[str, t.Any]], **_: t.Any) -> str: return generation_result[0]['outputs'][0]['text'] def vllm_generate_iterator(self: LLM['vllm.LLMEngine', T], @@ -193,9 +182,7 @@ def vllm_generate(self: LLM['vllm.LLMEngine', T], prompt: str, **attrs: t.Any) - if request_id is None: raise ValueError('request_id must not be None.') outputs: list[vllm.RequestOutput] = [] # TODO: support prompt_token_ids - self.model.add_request(request_id=request_id, - prompt=prompt, - sampling_params=self.config.model_construct_env(**attrs).to_sampling_config()) + self.model.add_request(request_id=request_id, prompt=prompt, sampling_params=self.config.model_construct_env(**attrs).to_sampling_config()) while self.model.has_unfinished_requests(): outputs.extend([r for r in self.model.step() if r.finished]) return [unmarshal_vllm_outputs(i) for i in outputs] diff --git a/openllm-python/src/openllm/_embeddings.py b/openllm-python/src/openllm/_embeddings.py index 8a0cda9c..18392443 100644 --- a/openllm-python/src/openllm/_embeddings.py +++ b/openllm-python/src/openllm/_embeddings.py @@ -25,9 +25,8 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: return bentoml.transformers.get(ids) except bentoml.exceptions.NotFound: model_signatures = { - k: ModelSignature(batchable=False) - for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', - 'group_beam_search', 'constrained_beam_search', '__call__') + k: ModelSignature(batchable=False) for k in ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', + 'beam_sample', 'group_beam_search', 'constrained_beam_search', '__call__') } with bentoml.models.create(ids, module=MODULE_NAME, @@ -39,11 +38,10 @@ def get_or_download(ids: str = _BENTOMODEL_ID) -> bentoml.Model: 'framework': 'openllm' }, signatures=model_signatures) as bentomodel: - snapshot_download( - _GENERIC_EMBEDDING_ID, - local_dir=bentomodel.path, - local_dir_use_symlinks=False, - ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt']) + snapshot_download(_GENERIC_EMBEDDING_ID, + local_dir=bentomodel.path, + local_dir_use_symlinks=False, + ignore_patterns=['*.safetensors', '*.h5', '*.ot', '*.pdf', '*.md', '.gitattributes', 'LICENSE.txt']) return bentomodel class GenericEmbeddingRunnable(bentoml.Runnable): @@ -68,10 +66,7 @@ class GenericEmbeddingRunnable(bentoml.Runnable): model_output = self.model(**encoded_input) # Perform 
pooling and normalize sentence_embeddings = F.normalize(self.mean_pooling(model_output, attention_mask), p=2, dim=1) - return [ - openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(), - num_tokens=int(torch.sum(attention_mask).item())) - ] + return [openllm.EmbeddingsOutput(embeddings=sentence_embeddings.cpu().numpy(), num_tokens=int(torch.sum(attention_mask).item()))] @staticmethod def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: diff --git a/openllm-python/src/openllm/_generation.py b/openllm-python/src/openllm/_generation.py index 6bb97f4c..ee192bf7 100644 --- a/openllm-python/src/openllm/_generation.py +++ b/openllm-python/src/openllm/_generation.py @@ -14,18 +14,15 @@ LogitsProcessorList = transformers.LogitsProcessorList StoppingCriteriaList = transformers.StoppingCriteriaList class StopSequenceCriteria(transformers.StoppingCriteria): - - def __init__(self, stop_sequences: str | list[str], tokenizer: transformers.PreTrainedTokenizer | - transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast): + def __init__(self, stop_sequences: str | list[str], + tokenizer: transformers.PreTrainedTokenizer | transformers.PreTrainedTokenizerBase | transformers.PreTrainedTokenizerFast): if isinstance(stop_sequences, str): stop_sequences = [stop_sequences] self.stop_sequences, self.tokenizer = stop_sequences, tokenizer def __call__(self, input_ids: torch.Tensor, scores: t.Any, **_: t.Any) -> bool: - return any( - self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences) + return any(self.tokenizer.decode(input_ids.tolist()[0]).endswith(stop_sequence) for stop_sequence in self.stop_sequences) class StopOnTokens(transformers.StoppingCriteria): - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **_: t.Any) -> bool: return input_ids[0][-1] in {50278, 50279, 50277, 1, 0} diff --git a/openllm-python/src/openllm/_llm.py b/openllm-python/src/openllm/_llm.py index e7bccd8b..acb7cc9b 100644 --- a/openllm-python/src/openllm/_llm.py +++ b/openllm-python/src/openllm/_llm.py @@ -122,7 +122,6 @@ def resolve_peft_config_type(adapter_map: dict[str, str | None]) -> AdaptersMapp _reserved_namespace = {'config_class', 'model', 'tokenizer', 'import_kwargs'} class LLMFunction(abc.ABC): - @abc.abstractmethod def postprocess_generate(self, prompt: str, generation_result: t.Any, **attrs: t.Any) -> t.Any: '''This handler will postprocess generation results from LLM.generate and then output nicely formatted results (if the LLM decide to do so.). @@ -158,8 +157,7 @@ class LLMFunction(abc.ABC): ''' raise NotImplementedError - def generate_one(self, prompt: str, stop: list[str], - **preprocess_generate_kwds: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]: + def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]: '''The entrypoint for generating one prompt. This provides additional stop tokens for generating per token level. This is useful when running with agents, or initial streaming support. @@ -177,7 +175,6 @@ class LLMFunction(abc.ABC): raise NotImplementedError class LLMSerialisation(abc.ABC, t.Generic[M, T]): - def import_model(self, *args: t.Any, trust_remote_code: bool, **attrs: t.Any) -> bentoml.Model: '''Import both model and tokenizer weights into as a BentoML models. 
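_generation.py above only rejoins the StopSequenceCriteria signature; the stopping rule itself (decode the running sequence and stop once it ends with any configured stop string) is unchanged. A dependency-free sketch of that rule, with a plain callable standing in for the transformers.StoppingCriteria subclass and a toy decode function:

from __future__ import annotations
import typing as t

def make_stop_check(stop_sequences: str | list[str], decode: t.Callable[[list[int]], str]) -> t.Callable[[list[int]], bool]:
  if isinstance(stop_sequences, str): stop_sequences = [stop_sequences]
  def should_stop(token_ids: list[int]) -> bool:
    text = decode(token_ids)
    return any(text.endswith(stop) for stop in stop_sequences)
  return should_stop

# toy vocabulary: token id -> string piece
vocab = {0: 'Hello', 1: ' world', 2: '###'}
should_stop = make_stop_check('###', decode=lambda ids: ''.join(vocab[i] for i in ids))
assert not should_stop([0, 1])
assert should_stop([0, 1, 2])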
@@ -206,7 +203,6 @@ class LLMSerialisation(abc.ABC, t.Generic[M, T]): raise NotImplementedError class LLMInterface(LLMFunction, LLMSerialisation[M, T], abc.ABC): - def llm_post_init(self) -> None: '''This function can be implemented if you need to initialized any additional variables that doesn't concern OpenLLM internals. By default, this will add `self.device` if the implementation is PyTorch. @@ -282,12 +278,12 @@ class LLM(LLMInterface[M, T], ReprMixin): if t.TYPE_CHECKING: __name__: str if t.TYPE_CHECKING and not MYPY: - def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, - autogptq.BaseQuantizeConfig]], - model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, - tag: bentoml.Tag, adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str], - quantize_method: t.Optional[t.Literal['int8', 'int4', 'gptq']], - serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any) -> None: + def __attrs_init__(self, config: LLMConfig, quantization_config: t.Optional[t.Union[transformers.BitsAndBytesConfig, autogptq.BaseQuantizeConfig]], + model_id: str, model_decls: TupleAny, model_attrs: DictStrAny, tokenizer_attrs: DictStrAny, tag: bentoml.Tag, + adapters_mapping: t.Optional[AdaptersMapping], model_version: t.Optional[str], + quantize_method: t.Optional[t.Literal['int8', 'int4', + 'gptq']], serialisation_format: t.Literal['safetensors', + 'legacy'], _local: bool, **attrs: t.Any) -> None: '''Generated __attrs_init__ for openllm.LLM.''' config: LLMConfig @@ -434,20 +430,16 @@ class LLM(LLMInterface[M, T], ReprMixin): ''' cfg_cls = cls.config_class _local = False - _model_id: str = first_not_none(model_id, - os.environ.get(cfg_cls.__openllm_env__['model_id']), - default=cfg_cls.__openllm_default_id__) + _model_id: str = first_not_none(model_id, os.environ.get(cfg_cls.__openllm_env__['model_id']), default=cfg_cls.__openllm_default_id__) if validate_is_path(_model_id): _model_id, _local = resolve_filepath(_model_id), True quantize = first_not_none(quantize, - t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], - os.environ.get(cfg_cls.__openllm_env__['quantize'])), + t.cast(t.Optional[t.Literal['int8', 'int4', 'gptq']], os.environ.get(cfg_cls.__openllm_env__['quantize'])), default=None) # quantization setup if quantization_config and quantize: raise ValueError( - "'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument." - ) + "'quantization_config' and 'quantize' are mutually exclusive. Either customise your quantization_config or use the 'quantize' argument.") if quantization_config is None and quantize is not None: quantization_config, attrs = infer_quantisation_config(cls, quantize, **attrs) if quantize == 'gptq': serialisation = 'safetensors' @@ -460,9 +452,7 @@ class LLM(LLMInterface[M, T], ReprMixin): ) if adapter_map is None and adapter_id is not None: adapter_map = {adapter_id: adapter_name} if adapter_map is not None and not is_peft_available(): - raise RuntimeError( - "LoRA adapter requires 'peft' to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'" - ) + raise RuntimeError("LoRA adapter requires 'peft' to be installed. 
Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'") if adapter_map: logger.debug('OpenLLM will apply the following adapters layers: %s', list(adapter_map)) if llm_config is None: @@ -517,16 +507,14 @@ class LLM(LLMInterface[M, T], ReprMixin): model_id, *maybe_revision = model_id.rsplit(':') if len(maybe_revision) > 0: if model_version is not None: - logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.", - maybe_revision[0], model_version) + logger.warning("revision is specified within 'model_id' (%s), and 'model_version=%s' will be ignored.", maybe_revision[0], model_version) return f'{cls.__llm_backend__}-{model_name}:{maybe_revision[0]}' tag_name = f'{cls.__llm_backend__}-{model_name}' if openllm_core.utils.check_bool_env('OPENLLM_USE_LOCAL_LATEST', False): return str(bentoml.models.get(f"{tag_name}{':'+model_version if model_version is not None else ''}").tag) if validate_is_path(model_id): - model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, - default=generate_hash_from_file(model_id)) + model_id, model_version = resolve_filepath(model_id), first_not_none(model_version, default=generate_hash_from_file(model_id)) else: from .serialisation.transformers._helpers import process_config model_version = getattr( @@ -542,11 +530,10 @@ class LLM(LLMInterface[M, T], ReprMixin): return bentoml.Tag.from_taglike(cls._generate_tag_str(*param_decls, **attrs)) def __init__(self, *args: t.Any, model_id: str, llm_config: LLMConfig, - quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, - _adapters_mapping: AdaptersMapping | None, _tag: bentoml.Tag, - _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str, + quantization_config: transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig | None, _adapters_mapping: AdaptersMapping | None, + _tag: bentoml.Tag, _quantize_method: t.Literal['int8', 'int4', 'gptq'] | None, _model_version: str, _serialisation_format: t.Literal['safetensors', 'legacy'], _local: bool, **attrs: t.Any, - ): + ): '''Initialize the LLM with given pretrained model. > [!WARNING] @@ -662,8 +649,7 @@ class LLM(LLMInterface[M, T], ReprMixin): @property def trust_remote_code(self) -> bool: - return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'), - default=self.config['trust_remote_code']) + return first_not_none(openllm_core.utils.check_bool_env('TRUST_REMOTE_CODE'), default=self.config['trust_remote_code']) @property def adapters_mapping(self) -> AdaptersMapping | None: @@ -698,10 +684,7 @@ class LLM(LLMInterface[M, T], ReprMixin): @property def identifying_params(self) -> DictStrAny: - return { - 'configuration': self.config.model_dump_json().decode(), - 'model_ids': orjson.dumps(self.config['model_ids']).decode() - } + return {'configuration': self.config.model_dump_json().decode(), 'model_ids': orjson.dumps(self.config['model_ids']).decode()} @property def llm_parameters(self) -> tuple[tuple[tuple[t.Any, ...], DictStrAny], DictStrAny]: @@ -755,8 +738,8 @@ class LLM(LLMInterface[M, T], ReprMixin): model = self.load_model(*self._model_decls, **self._model_attrs) # If OOM, then it is probably you don't have enough VRAM to run this model. 
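# [Editor's sketch, not part of this diff] The guard that follows only moves the model to GPU when exactly one CUDA
# device is visible and the weights were not already loaded quantized (8-bit/4-bit via bitsandbytes), since such
# models are already dispatched to devices and do not support `.to('cuda')`. A minimal standalone version, assuming
# only `torch` and a transformers-style model object; the helper name `maybe_move_to_cuda` is hypothetical:
import torch
def maybe_move_to_cuda(model):
  loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False)
  if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit:
    try:
      model = model.to('cuda')  # full-precision weights: place them on the single visible GPU
    except RuntimeError:
      pass  # most likely CUDA OOM: not enough VRAM, keep the model where it is
  return model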
if self.__llm_backend__ == 'pt' and is_torch_available(): - loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr( - model, 'is_loaded_in_4bit', False) or getattr(model, 'is_quantized', False) + loaded_in_kbit = getattr(model, 'is_loaded_in_8bit', False) or getattr(model, 'is_loaded_in_4bit', False) or getattr( + model, 'is_quantized', False) if torch.cuda.is_available() and torch.cuda.device_count() == 1 and not loaded_in_kbit: try: model = model.to('cuda') @@ -785,24 +768,20 @@ class LLM(LLMInterface[M, T], ReprMixin): _converted_first_none = False for _adapter_type, _adapters_tuples in self._adapters_mapping.items(): strategy = first_not_none(self.config['fine_tune_strategies'].get(_adapter_type), - default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), - llm_config_class=self.config_class)) + default=FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), llm_config_class=self.config_class)) default_config = strategy.eval() if inference_mode else strategy.train() for adapter in _adapters_tuples: if not adapter.name and _converted_first_none: - raise ValueError( - f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}" - ) + raise ValueError(f"{self.__class__.__name__} doesn't know how to resolve adapter_name None mapping: {adapter.adapter_id, adapter.config}") name = adapter.name if name is None: _converted_first_none = True name = 'default' peft_config = default_config.with_config( - **adapter.config).to_peft_config() if name == 'default' else FineTuneConfig( - adapter_type=t.cast('PeftType', _adapter_type), - adapter_config=adapter.config, - inference_mode=inference_mode, - llm_config_class=self.config_class).to_peft_config() + **adapter.config).to_peft_config() if name == 'default' else FineTuneConfig(adapter_type=t.cast('PeftType', _adapter_type), + adapter_config=adapter.config, + inference_mode=inference_mode, + llm_config_class=self.config_class).to_peft_config() adapter_map[_adapter_type][name] = (peft_config, adapter.adapter_id) if self.__llm_adapter_map__ is None and use_cache: self.__llm_adapter_map__ = adapter_map return adapter_map @@ -834,8 +813,7 @@ class LLM(LLMInterface[M, T], ReprMixin): _mapping = self._transpose_adapter_mapping(inference_mode=inference_mode, use_cache=use_cache) if adapter_type not in _mapping: - raise ValueError( - f'Given adapter type {adapter_type} is not supported. Please choose from {list(_mapping.keys())}') + raise ValueError(f'Given adapter type {adapter_type} is not supported. Please choose from {list(_mapping.keys())}') adapter_mapping = _mapping[adapter_type] self.__llm_model__ = self._wrap_default_peft_model(adapter_mapping, inference_mode=inference_mode) @@ -857,25 +835,21 @@ class LLM(LLMInterface[M, T], ReprMixin): return self.__llm_model__ - def _wrap_default_peft_model(self, adapter_mapping: dict[str, tuple[peft.PeftConfig, str]], - inference_mode: bool) -> M: + def _wrap_default_peft_model(self, adapter_mapping: dict[str, tuple[peft.PeftConfig, str]], inference_mode: bool) -> M: if self.__llm_model__ is None: raise ValueError('Error: Model is not loaded correctly') if isinstance(self.__llm_model__, peft.PeftModel): return self.__llm_model__ if not isinstance(self.__llm_model__, transformers.PreTrainedModel): raise ValueError('Loading LoRA layers currently only runs on PyTorch models.') if 'default' not in adapter_mapping: - raise ValueError( - "There is no 'default' mapping. 
Please check the adapter mapping and report this bug to the OpenLLM team.") + raise ValueError("There is no 'default' mapping. Please check the adapter mapping and report this bug to the OpenLLM team.") default_config, peft_model_id = adapter_mapping.pop('default') # the below shares similar logic with `get_peft_model` # TODO: Support PromptLearningConfig - if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance( - default_config, peft.PromptLearningConfig): - logger.debug( - "Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.", - default_config.task_type) + if default_config.task_type not in peft.MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys() and not isinstance(default_config, peft.PromptLearningConfig): + logger.debug("Given task type '%s' is not supported by peft. Make sure the adapter is loaded manually before running inference.", + default_config.task_type) model = peft.PeftModel(self.__llm_model__, default_config) else: # XXX: this is not ideal to serialize like this, maybe for fine-tune we will only support 0.4.0 @@ -894,12 +868,11 @@ class LLM(LLMInterface[M, T], ReprMixin): # order of these fields matters here, make sure to sync it with # openllm.models.auto.factory.BaseAutoLLMClass.for_model - def to_runner( - self, - models: list[bentoml.Model] | None = None, - max_batch_size: int | None = None, - max_latency_ms: int | None = None, - scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]: + def to_runner(self, + models: list[bentoml.Model] | None = None, + max_batch_size: int | None = None, + max_latency_ms: int | None = None, + scheduling_strategy: type[bentoml.Strategy] = openllm_core.CascadingResourceStrategy) -> LLMRunner[M, T]: '''Convert this LLM into a Runner. 
Args: @@ -1047,10 +1020,7 @@ class LLM(LLMInterface[M, T], ReprMixin): else: tmp_output_ids = output_ids[input_echo_len:] rfind_start = 0 - output = self.tokenizer.decode(tmp_output_ids, - skip_special_tokens=True, - spaces_between_special_tokens=False, - clean_up_tokenization_spaces=True) + output = self.tokenizer.decode(tmp_output_ids, skip_special_tokens=True, spaces_between_special_tokens=False, clean_up_tokenization_spaces=True) partially_stopped = False if stop: @@ -1183,25 +1153,17 @@ def Runner(model_name: str, ''' if llm_config is not None: attrs.update({ - 'model_id': - llm_config['env']['model_id_value'], - 'quantize': - llm_config['env']['quantize_value'], - 'serialisation': - first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors') + 'model_id': llm_config['env']['model_id_value'], + 'quantize': llm_config['env']['quantize_value'], + 'serialisation': first_not_none(os.environ.get('OPENLLM_SERIALIZATION'), attrs.get('serialisation'), default='safetensors') }) backend = t.cast( LiteralBackend, first_not_none(backend, - default=EnvVarMixin( - model_name, - backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value'])) + default=EnvVarMixin(model_name, backend=llm_config.default_backend() if llm_config is not None else 'pt')['backend_value'])) if init_local: ensure_available = True - runner = infer_auto_class(backend).create_runner(model_name, - llm_config=llm_config, - ensure_available=ensure_available, - **attrs) + runner = infer_auto_class(backend).create_runner(model_name, llm_config=llm_config, ensure_available=ensure_available, **attrs) if init_local: runner.init_local(quiet=True) return runner @@ -1214,7 +1176,6 @@ class SetAdapterOutput(t.TypedDict): def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate_sig: ModelSignature, generate_iterator_sig: ModelSignature) -> type[LLMRunnable[M, T]]: - class _Runnable(bentoml.Runnable): SUPPORTED_RESOURCES = ('nvidia.com/gpu', 'amd.com/gpu', 'cpu') SUPPORTS_CPU_MULTI_THREADING = True @@ -1252,8 +1213,7 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate return self.generate(prompt, **attrs) @bentoml.Runnable.method(**method_signature(generate_sig)) # type: ignore - def generate_one(__self: _Runnable, prompt: str, stop: list[str], - **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]: + def generate_one(__self: _Runnable, prompt: str, stop: list[str], **attrs: t.Any) -> t.Sequence[dict[t.Literal['generated_text'], str]]: adapter_name = attrs.pop('adapter_name', None) if adapter_name is not None: __self.set_adapter(adapter_name) return self.generate_one(prompt, stop, **attrs) @@ -1275,22 +1235,15 @@ def llm_runnable_class(self: LLM[M, T], embeddings_sig: ModelSignature, generate return types.new_class( self.__class__.__name__ + 'Runnable', (_Runnable,), {}, lambda ns: ns.update({ - 'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') - if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'), - '__module__': - self.__module__, - '__doc__': - self.config['env'].start_docstring + 'SUPPORTED_RESOURCES': ('nvidia.com/gpu', 'amd.com/gpu') if self.config['requires_gpu'] else ('nvidia.com/gpu', 'amd.com/gpu', 'cpu'), + '__module__': self.__module__, + '__doc__': self.config['env'].start_docstring })) def llm_runner_class(self: LLM[M, T]) -> type[LLMRunner[M, T]]: - def available_adapters(_: LLMRunner[M, T]) -> PeftAdapterOutput: if not is_peft_available(): - return 
PeftAdapterOutput( - success=False, - result={}, - error_msg="peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'") + return PeftAdapterOutput(success=False, result={}, error_msg="peft is not available. Make sure to install: 'pip install \"openllm[fine-tune]\"'") if self.__llm_adapter_map__ is None: return PeftAdapterOutput(success=False, result={}, error_msg='No adapters available for current running server.') if not isinstance(self.model, peft.PeftModel): diff --git a/openllm-python/src/openllm/_quantisation.py b/openllm-python/src/openllm/_quantisation.py index a0a6f4cf..2eade60f 100644 --- a/openllm-python/src/openllm/_quantisation.py +++ b/openllm-python/src/openllm/_quantisation.py @@ -16,8 +16,7 @@ if t.TYPE_CHECKING: from ._llm import LLM autogptq, torch, transformers = LazyLoader('autogptq', globals(), - 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader( - 'transformers', globals(), 'transformers') + 'auto_gptq'), LazyLoader('torch', globals(), 'torch'), LazyLoader('transformers', globals(), 'transformers') logger = logging.getLogger(__name__) @@ -33,9 +32,8 @@ def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: t.Literal[ **attrs: t.Any) -> tuple[autogptq.BaseQuantizeConfig, DictStrAny]: ... -def infer_quantisation_config( - cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, - **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: +def infer_quantisation_config(cls: type[LLM[t.Any, t.Any]], quantise: QuantiseMode, + **attrs: t.Any) -> tuple[transformers.BitsAndBytesConfig | autogptq.BaseQuantizeConfig, DictStrAny]: # 8 bit configuration int8_threshold = attrs.pop('llm_int8_threshhold', 6.0) int8_enable_fp32_cpu_offload = attrs.pop('llm_int8_enable_fp32_cpu_offload', False) @@ -61,7 +59,7 @@ def infer_quantisation_config( llm_int8_threshhold=int8_threshold, llm_int8_skip_modules=int8_skip_modules, llm_int8_has_fp16_weight=int8_has_fp16_weight, - ) + ) # 4 bit configuration int4_compute_dtype = attrs.pop('bnb_4bit_compute_dtype', torch.bfloat16) @@ -72,9 +70,7 @@ def infer_quantisation_config( # quantize is a openllm.LLM feature, where we can quantize the model # with bitsandbytes or quantization aware training. if not is_bitsandbytes_available(): - raise RuntimeError( - "Quantization requires bitsandbytes to be installed. Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'" - ) + raise RuntimeError("Quantization requires bitsandbytes to be installed. 
Make sure to install OpenLLM with 'pip install \"openllm[fine-tune]\"'") if quantise == 'int8': quantisation_config = create_int8_config(int8_skip_modules) elif quantise == 'int4': if is_transformers_supports_kbit(): diff --git a/openllm-python/src/openllm/_service.py b/openllm-python/src/openllm/_service.py index 8d910ad3..69c5bcfe 100644 --- a/openllm-python/src/openllm/_service.py +++ b/openllm-python/src/openllm/_service.py @@ -21,35 +21,28 @@ if t.TYPE_CHECKING: from bentoml._internal.runner.runner import AbstractRunner from bentoml._internal.runner.runner import RunnerMethod from openllm_core._typing_compat import TypeAlias - _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], - [t.List[str]], t.Sequence[openllm.EmbeddingsOutput]] + _EmbeddingMethod: TypeAlias = RunnerMethod[t.Union[bentoml.Runnable, openllm.LLMRunnable[t.Any, t.Any]], [t.List[str]], + t.Sequence[openllm.EmbeddingsOutput]] # The following warnings from bitsandbytes, and probably not that important for users to see -warnings.filterwarnings('ignore', - message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization') -warnings.filterwarnings('ignore', - message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization') +warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization') +warnings.filterwarnings('ignore', message='MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization') warnings.filterwarnings('ignore', message='The installed version of bitsandbytes was compiled without GPU support.') model = os.environ.get('OPENLLM_MODEL', '{__model_name__}') # openllm: model name adapter_map = os.environ.get('OPENLLM_ADAPTER_MAP', '''{__model_adapter_map__}''') # openllm: model adapter map llm_config = openllm.AutoConfig.for_model(model) runner = openllm.Runner(model, llm_config=llm_config, ensure_available=False, adapter_map=orjson.loads(adapter_map)) -generic_embedding_runner = bentoml.Runner( - openllm.GenericEmbeddingRunnable, # XXX: remove arg-type once bentoml.Runner is correct set with type - name='llm-generic-embedding', - scheduling_strategy=openllm_core.CascadingResourceStrategy, - max_batch_size=32, - max_latency_ms=300) +generic_embedding_runner = bentoml.Runner(openllm.GenericEmbeddingRunnable, # XXX: remove arg-type once bentoml.Runner is correct set with type + name='llm-generic-embedding', + scheduling_strategy=openllm_core.CascadingResourceStrategy, + max_batch_size=32, + max_latency_ms=300) runners: list[AbstractRunner] = [runner] if not runner.supports_embeddings: runners.append(generic_embedding_runner) svc = bentoml.Service(name=f"llm-{llm_config['start_name']}-service", runners=runners) -_JsonInput = bentoml.io.JSON.from_sample({ - 'prompt': '', - 'llm_config': llm_config.model_dump(flatten=True), - 'adapter_name': None -}) +_JsonInput = bentoml.io.JSON.from_sample({'prompt': '', 'llm_config': llm_config.model_dump(flatten=True), 'adapter_name': None}) @svc.api(route='/v1/generate', input=_JsonInput, @@ -67,10 +60,7 @@ async def generate_v1(input_dict: dict[str, t.Any]) -> openllm.GenerationOutput: async def generate_stream_v1(input_dict: dict[str, t.Any]) -> t.AsyncGenerator[str, None]: echo = input_dict.pop('echo', False) qa_inputs = openllm.GenerationInput.from_llm_config(llm_config)(**input_dict) - return runner.generate_iterator.async_stream(qa_inputs.prompt, - 
adapter_name=qa_inputs.adapter_name, - echo=echo, - **qa_inputs.llm_config.model_dump()) + return runner.generate_iterator.async_stream(qa_inputs.prompt, adapter_name=qa_inputs.adapter_name, echo=echo, **qa_inputs.llm_config.model_dump()) @svc.api(route='/v1/metadata', input=bentoml.io.Text(), @@ -96,12 +86,10 @@ def metadata_v1(_: str) -> openllm.MetadataOutput: input=bentoml.io.JSON.from_sample(['Hey Jude, welcome to the jungle!', 'What is the meaning of life?']), output=bentoml.io.JSON.from_sample({ 'embeddings': [ - 0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, - -0.0066398633643984795, 0.00945580005645752, 0.0087016262114048, -0.010709521360695362, - 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, - 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, - 0.0076906150206923485, 0.0009032754460349679, -0.010024012066423893, 0.01090280432254076, - -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, + 0.007917795330286026, -0.014421648345887661, 0.00481307040899992, 0.007331526838243008, -0.0066398633643984795, 0.00945580005645752, + 0.0087016262114048, -0.010709521360695362, 0.012635177001357079, 0.010541186667978764, -0.00730888033285737, -0.001783102168701589, + 0.02339819073677063, -0.010825827717781067, -0.015888236463069916, 0.01876218430697918, 0.0076906150206923485, 0.0009032754460349679, + -0.010024012066423893, 0.01090280432254076, -0.008668390102684498, 0.02070549875497818, 0.0014594447566196322, -0.018775740638375282, -0.014814382418990135, 0.01796768605709076 ], 'num_tokens': 20 @@ -121,8 +109,7 @@ if runner.supports_hf_agent and openllm.utils.is_transformers_supports_agent(): raise openllm.exceptions.OpenLLMException(f'Invalid JSON input received: {err}') from None stop = input_data.parameters.pop('stop', ['\n']) try: - return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters), - status_code=200) + return JSONResponse(await runner.generate_one.async_run(input_data.inputs, stop, **input_data.parameters), status_code=200) except NotImplementedError: return JSONResponse(f"'{model}' is currently not supported with HuggingFace agents.", status_code=500) diff --git a/openllm-python/src/openllm/bundle/__init__.py b/openllm-python/src/openllm/bundle/__init__.py index b93a906b..61d75f2c 100644 --- a/openllm-python/src/openllm/bundle/__init__.py +++ b/openllm-python/src/openllm/bundle/__init__.py @@ -10,10 +10,7 @@ from openllm_core.utils import LazyModule _import_structure: dict[str, list[str]] = { '_package': ['create_bento', 'build_editable', 'construct_python_options', 'construct_docker_options'], - 'oci': [ - 'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', - 'supported_registries', 'RefResolver' - ] + 'oci': ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver'] } if t.TYPE_CHECKING: diff --git a/openllm-python/src/openllm/bundle/_package.py b/openllm-python/src/openllm/bundle/_package.py index f56dfc56..2e2b763b 100644 --- a/openllm-python/src/openllm/bundle/_package.py +++ b/openllm-python/src/openllm/bundle/_package.py @@ -43,8 +43,7 @@ logger = logging.getLogger(__name__) OPENLLM_DEV_BUILD = 'OPENLLM_DEV_BUILD' -def build_editable(path: str, - package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None: +def build_editable(path: 
str, package: t.Literal['openllm', 'openllm_core', 'openllm_client'] = 'openllm') -> str | None: '''Build OpenLLM if the OPENLLM_DEV_BUILD environment variable is set.''' if str(os.environ.get(OPENLLM_DEV_BUILD, False)).lower() != 'true': return None # We need to build the package in editable mode, so that we can import it @@ -52,9 +51,7 @@ def build_editable(path: str, from build.env import IsolatedEnvBuilder module_location = openllm_core.utils.pkg.source_locations(package) if not module_location: - raise RuntimeError( - 'Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.' - ) + raise RuntimeError('Could not find the source location of OpenLLM. Make sure to unset OPENLLM_DEV_BUILD if you are developing OpenLLM.') pyproject_path = Path(module_location).parent.parent / 'pyproject.toml' if os.path.isfile(pyproject_path.__fspath__()): logger.info('Generating built wheels for package %s...', package) @@ -64,14 +61,13 @@ def build_editable(path: str, builder.scripts_dir = env.scripts_dir env.install(builder.build_system_requires) return builder.build('wheel', path, config_settings={'--global-option': '--quiet'}) - raise RuntimeError( - 'Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.') + raise RuntimeError('Custom OpenLLM build is currently not supported. Please install OpenLLM from PyPI or built it from Git source.') def construct_python_options(llm: openllm.LLM[t.Any, t.Any], llm_fs: FS, extra_dependencies: tuple[str, ...] | None = None, adapter_map: dict[str, str | None] | None = None, - ) -> PythonOptions: + ) -> PythonOptions: packages = ['openllm', 'scipy'] # apparently bnb misses this one if adapter_map is not None: packages += ['openllm[fine-tune]'] # NOTE: add openllm to the default dependencies @@ -90,16 +86,13 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], if backend_envvar == 'flax': if not openllm_core.utils.is_flax_available(): raise ValueError(f"Flax is not available, while {env.backend} is set to 'flax'") - packages.extend( - [importlib.metadata.version('flax'), - importlib.metadata.version('jax'), - importlib.metadata.version('jaxlib')]) + packages.extend([importlib.metadata.version('flax'), importlib.metadata.version('jax'), importlib.metadata.version('jaxlib')]) elif backend_envvar == 'tf': if not openllm_core.utils.is_tf_available(): raise ValueError(f"TensorFlow is not available, while {env.backend} is set to 'tf'") - candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', - 'intel-tensorflow', 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos', - ) + candidates = ('tensorflow', 'tensorflow-cpu', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-cpu', 'tf-nightly-gpu', 'intel-tensorflow', + 'intel-tensorflow-avx512', 'tensorflow-rocm', 'tensorflow-macos', + ) # For the metadata, we have to look for both tensorflow and tensorflow-cpu for candidate in candidates: try: @@ -125,15 +118,11 @@ def construct_python_options(llm: openllm.LLM[t.Any, t.Any], return PythonOptions(packages=packages, wheels=wheels, lock_packages=False, - extra_index_url=[ - 'https://download.pytorch.org/whl/cu118', - 'https://huggingface.github.io/autogptq-index/whl/cu118/' - ]) + extra_index_url=['https://download.pytorch.org/whl/cu118', 'https://huggingface.github.io/autogptq-index/whl/cu118/']) -def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, - 
quantize: LiteralString | None, adapter_map: dict[str, str | None] | None, - dockerfile_template: str | None, serialisation_format: t.Literal['safetensors', 'legacy'], - container_registry: LiteralContainerRegistry, +def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_resource: float, quantize: LiteralString | None, + adapter_map: dict[str, str | None] | None, dockerfile_template: str | None, + serialisation_format: t.Literal['safetensors', 'legacy'], container_registry: LiteralContainerRegistry, container_version_strategy: LiteralContainerVersionStrategy) -> DockerOptions: from openllm.cli._factory import parse_config_options environ = parse_config_options(llm.config, llm.config['timeout'], workers_per_resource, None, True, os.environ.copy()) @@ -156,10 +145,9 @@ def construct_docker_options(llm: openllm.LLM[t.Any, t.Any], _: FS, workers_per_ _env = openllm_core.utils.EnvVarMixin(llm.config['model_name'], quantize=quantize) if _env['quantize_value'] is not None: env_dict[_env.quantize] = t.cast(str, _env['quantize_value']) - return DockerOptions( - base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', - env=env_dict, - dockerfile_template=dockerfile_template) + return DockerOptions(base_image=f'{oci.CONTAINER_NAMES[container_registry]}:{oci.get_base_container_tag(container_version_strategy)}', + env=env_dict, + dockerfile_template=dockerfile_template) OPENLLM_MODEL_NAME = '# openllm: model name' OPENLLM_MODEL_ADAPTER_MAP = '# openllm: model adapter map' @@ -193,17 +181,15 @@ _service_file = Path(os.path.abspath(__file__)).parent.parent / '_service.py' def write_service(llm: openllm.LLM[t.Any, t.Any], adapter_map: dict[str, str | None] | None, llm_fs: FS) -> None: from openllm_core.utils import DEBUG model_name = llm.config['model_name'] - logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], - llm_fs.getsyspath('/')) + logger.debug('Generating service file for %s at %s (dir=%s)', model_name, llm.config['service_name'], llm_fs.getsyspath('/')) with open(_service_file.__fspath__(), 'r') as f: src_contents = f.readlines() for it in src_contents: if OPENLLM_MODEL_NAME in it: - src_contents[src_contents.index(it)] = ( - ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n') + src_contents[src_contents.index(it)] = (ModelNameFormatter(model_name).vformat(it)[:-(len(OPENLLM_MODEL_NAME) + 3)] + '\n') elif OPENLLM_MODEL_ADAPTER_MAP in it: - src_contents[src_contents.index(it)] = (ModelAdapterMapFormatter( - orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n') + src_contents[src_contents.index(it)] = ( + ModelAdapterMapFormatter(orjson.dumps(adapter_map).decode()).vformat(it)[:-(len(OPENLLM_MODEL_ADAPTER_MAP) + 3)] + '\n') script = f"# GENERATED BY 'openllm build {model_name}'. 
DO NOT EDIT\n\n" + ''.join(src_contents) if DEBUG: logger.info('Generated script:\n%s', script) llm_fs.writetext(llm.config['service_name'], script) @@ -235,14 +221,12 @@ def create_bento(bento_tag: bentoml.Tag, if isinstance(workers_per_resource, str): if workers_per_resource == 'round_robin': workers_per_resource = 1.0 elif workers_per_resource == 'conserved': - workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / - openllm_core.utils.device_count()) + workers_per_resource = 1.0 if openllm_core.utils.device_count() == 0 else float(1 / openllm_core.utils.device_count()) else: try: workers_per_resource = float(workers_per_resource) except ValueError: - raise ValueError( - "'workers_per_resource' only accept ['round_robin', 'conserved'] as possible strategies.") from None + raise ValueError("'workers_per_resource' only accepts ['round_robin', 'conserved'] as possible strategies.") from None elif isinstance(workers_per_resource, int): workers_per_resource = float(workers_per_resource) logger.info("Building Bento for '%s'", llm.config['start_name']) @@ -258,10 +242,8 @@ def create_bento(bento_tag: bentoml.Tag, exclude=['/venv', '/.venv', '__pycache__/', '*.py[cod]', '*$py.class'], python=construct_python_options(llm, llm_fs, extra_dependencies, adapter_map), models=[llm_spec], - docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, - adapter_map, dockerfile_template, - serialisation_format, container_registry, - container_version_strategy)) + docker=construct_docker_options(llm, llm_fs, workers_per_resource, quantize, adapter_map, dockerfile_template, + serialisation_format, container_registry, container_version_strategy)) bento = bentoml.Bento.create(build_config=build_config, version=bento_tag.version, build_ctx=llm_fs.getsyspath('/')) # NOTE: the model_id_path here is only used for setting this environment variable within the container built for BentoLLM. 
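# [Editor's sketch, not part of this diff] How the 'workers_per_resource' strategies handled in create_bento above
# normalise to a float: 'round_robin' maps to one worker per resource, 'conserved' shares a single worker across all
# visible GPUs, and numeric strings pass through. `device_count` stands in for openllm_core.utils.device_count(), and
# the function name `normalise_workers_per_resource` is hypothetical:
from __future__ import annotations

def normalise_workers_per_resource(value: str | int | float, device_count: int) -> float:
  if isinstance(value, str):
    if value == 'round_robin': return 1.0
    if value == 'conserved': return 1.0 if device_count == 0 else float(1 / device_count)
    try:
      return float(value)
    except ValueError:
      raise ValueError("'workers_per_resource' only accepts ['round_robin', 'conserved'] as possible strategies.") from None
  return float(value)

# e.g. normalise_workers_per_resource('conserved', device_count=4) == 0.25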
diff --git a/openllm-python/src/openllm/bundle/oci/__init__.py b/openllm-python/src/openllm/bundle/oci/__init__.py index 10a8dd97..33d94740 100644 --- a/openllm-python/src/openllm/bundle/oci/__init__.py +++ b/openllm-python/src/openllm/bundle/oci/__init__.py @@ -68,8 +68,7 @@ def _commit_time_range(r: int = 5) -> str: class VersionNotSupported(openllm.exceptions.OpenLLMException): """Raised when the stable release is too low that it doesn't include OpenLLM base container.""" -_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple', - ['git_hash', 'version', 'strategy']) +_RefTuple: type[RefTuple] = openllm_core.utils.codegen.make_attr_tuple_class('_RefTuple', ['git_hash', 'version', 'strategy']) def nightly_resolver(cls: type[RefResolver]) -> str: # NOTE: all openllm container will have sha- @@ -84,10 +83,8 @@ def nightly_resolver(cls: type[RefResolver]) -> str: return next(f'sha-{it["sha"][:7]}' for it in commits if '[skip ci]' not in it['commit']['message']) # now is the correct behaviour return orjson.loads( - subprocess.check_output([ - docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', - 'docker://ghcr.io/bentoml/openllm' - ]).decode().strip())['Tags'][-2] + subprocess.check_output([docker_bin, 'run', '--rm', '-it', 'quay.io/skopeo/stable:latest', 'list-tags', + 'docker://ghcr.io/bentoml/openllm']).decode().strip())['Tags'][-2] @attr.attrs(eq=False, order=False, slots=True, frozen=True) class RefResolver: @@ -107,20 +104,16 @@ class RefResolver: # NOTE: This strategy will only support openllm>0.2.12 meta: dict[str, t.Any] = cls._ghapi.repos.get_latest_release() version_str = meta['name'].lstrip('v') - version: tuple[str, - str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str) + version: tuple[str, str | None] = (cls._ghapi.git.get_ref(ref=f"tags/{meta['name']}")['object']['sha'], version_str) else: version = ('', version_str) if openllm_core.utils.VersionInfo.from_version_string(t.cast(str, version_str)) < (0, 2, 12): - raise VersionNotSupported( - f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'" - ) + raise VersionNotSupported(f"Version {version_str} doesn't support OpenLLM base container. Consider using 'nightly' or upgrade 'openllm>=0.2.12'") return _RefTuple((*version, 'release' if _use_base_strategy else 'custom')) @classmethod @functools.lru_cache(maxsize=64) - def from_strategy(cls, - strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver: + def from_strategy(cls, strategy_or_version: t.Literal['release', 'nightly'] | LiteralString | None = None) -> RefResolver: # using default strategy if strategy_or_version is None or strategy_or_version == 'release': return cls(*cls._release_ref()) elif strategy_or_version == 'latest': return cls('latest', '0.0.0', 'latest') @@ -128,8 +121,7 @@ class RefResolver: _ref = cls._nightly_ref() return cls(_ref[0], '0.0.0', _ref[-1]) else: - logger.warning('Using custom %s. Make sure that it is at lease 0.2.12 for base container support.', - strategy_or_version) + logger.warning('Using custom %s. 
Make sure that it is at least 0.2.12 for base container support.', strategy_or_version) return cls(*cls._release_ref(version_str=strategy_or_version)) @property @@ -162,8 +154,7 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon pyproject_path = pathlib.Path(_module_location).parent.parent / 'pyproject.toml' if not pyproject_path.exists(): raise ValueError( - "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'" - ) + "This utility can only be run within OpenLLM git repository. Clone it first with 'git clone https://github.com/bentoml/OpenLLM.git'") if not registries: tags: dict[str | LiteralContainerRegistry, str] = { alias: f'{value}:{get_base_container_tag(version_strategy)}' for alias, value in _CONTAINER_REGISTRY.items() } @@ -181,18 +172,14 @@ def build_container(registries: LiteralContainerRegistry | t.Sequence[LiteralCon if machine and outputs is not None: tags['image_sha'] = outputs.decode('utf-8').strip() except Exception as err: raise openllm.exceptions.OpenLLMException( - f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}' - ) from err + f'Failed to containerize base container images (Scroll up to see error above, or set OPENLLMDEVDEBUG=True for more traceback):\n{err}') from err return tags if t.TYPE_CHECKING: CONTAINER_NAMES: dict[LiteralContainerRegistry, str] supported_registries: list[str] -__all__ = [ 'CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver' ] +__all__ = ['CONTAINER_NAMES', 'get_base_container_tag', 'build_container', 'get_base_container_name', 'supported_registries', 'RefResolver'] def __dir__() -> list[str]: return sorted(__all__) diff --git a/openllm-python/src/openllm/cli/_factory.py b/openllm-python/src/openllm/cli/_factory.py index 195a6143..5f67aa3b 100644 --- a/openllm-python/src/openllm/cli/_factory.py +++ b/openllm-python/src/openllm/cli/_factory.py @@ -50,14 +50,10 @@ def bento_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete ] def model_complete_envvar(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[sc.CompletionItem]: - return [ - sc.CompletionItem(inflection.dasherize(it), help='Model') - for it in openllm.CONFIG_MAPPING - if it.startswith(incomplete) - ] + return [sc.CompletionItem(inflection.dasherize(it), help='Model') for it in openllm.CONFIG_MAPPING if it.startswith(incomplete)] -def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, - device: t.Tuple[str, ...] | None, cors: bool, environ: DictStrAny) -> DictStrAny: +def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_resource: float, device: t.Tuple[str, ...] 
| None, cors: bool, + environ: DictStrAny) -> DictStrAny: # TODO: Support amd.com/gpu on k8s _bentoml_config_options_env = environ.pop('BENTOML_CONFIG_OPTIONS', '') _bentoml_config_options_opts = [ @@ -67,22 +63,15 @@ def parse_config_options(config: LLMConfig, server_timeout: int, workers_per_res ] if device: if len(device) > 1: - _bentoml_config_options_opts.extend([ - f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' - for idx, dev in enumerate(device) - ]) + _bentoml_config_options_opts.extend( + [f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"[{idx}]={dev}' for idx, dev in enumerate(device)]) else: - _bentoml_config_options_opts.append( - f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]') - _bentoml_config_options_opts.append( - f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}') + _bentoml_config_options_opts.append(f'runners."llm-{config["start_name"]}-runner".resources."nvidia.com/gpu"=[{device[0]}]') + _bentoml_config_options_opts.append(f'runners."llm-generic-embedding".resources.cpu={openllm.get_resource({"cpu":"system"},"cpu")}') if cors: + _bentoml_config_options_opts.extend(['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"']) _bentoml_config_options_opts.extend( - ['api_server.http.cors.enabled=true', 'api_server.http.cors.access_control_allow_origins="*"']) - _bentoml_config_options_opts.extend([ - f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' - for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT']) - ]) + [f'api_server.http.cors.access_control_allow_methods[{idx}]="{it}"' for idx, it in enumerate(['GET', 'OPTIONS', 'POST', 'HEAD', 'PUT'])]) _bentoml_config_options_env += ' ' if _bentoml_config_options_env else '' + ' '.join(_bentoml_config_options_opts) environ['BENTOML_CONFIG_OPTIONS'] = _bentoml_config_options_env if DEBUG: logger.debug('Setting BENTOML_CONFIG_OPTIONS=%s', _bentoml_config_options_env) @@ -104,17 +93,13 @@ def _id_callback(ctx: click.Context, _: click.Parameter, value: t.Tuple[str, ... 
ctx.params[_adapter_mapping_key][adapter_id] = adapter_name[0] if len(adapter_name) > 0 else None return None -def start_command_factory(group: click.Group, - model: str, - _context_settings: DictStrAny | None = None, - _serve_grpc: bool = False) -> click.Command: +def start_command_factory(group: click.Group, model: str, _context_settings: DictStrAny | None = None, _serve_grpc: bool = False) -> click.Command: llm_config = openllm.AutoConfig.for_model(model) - command_attrs: DictStrAny = dict( - name=llm_config['model_name'], - context_settings=_context_settings or termui.CONTEXT_SETTINGS, - short_help=f"Start a LLMServer for '{model}'", - aliases=[llm_config['start_name']] if llm_config['name_type'] == 'dasherize' else None, - help=f'''\ + command_attrs: DictStrAny = dict(name=llm_config['model_name'], + context_settings=_context_settings or termui.CONTEXT_SETTINGS, + short_help=f"Start a LLMServer for '{model}'", + aliases=[llm_config['start_name']] if llm_config['name_type'] == 'dasherize' else None, + help=f'''\ {llm_config['env'].start_docstring} \b @@ -133,15 +118,13 @@ Available official model_id(s): [default: {llm_config['default_id']}] \b {orjson.dumps(llm_config['model_ids'], option=orjson.OPT_INDENT_2).decode()} ''', - ) + ) if llm_config['requires_gpu'] and openllm.utils.device_count() < 1: # NOTE: The model requires GPU, therefore we will return a dummy command command_attrs.update({ - 'short_help': - '(Disabled because there is no GPU available)', - 'help': - f'{model} is currently not available to run on your local machine because it requires GPU for inference.' + 'short_help': '(Disabled because there is no GPU available)', + 'help': f'{model} is currently not available to run on your local machine because it requires GPU for inference.' }) return noop_command(group, llm_config, _serve_grpc, **command_attrs) @@ -150,12 +133,10 @@ Available official model_id(s): [default: {llm_config['default_id']}] @click.pass_context def start_cmd(ctx: click.Context, /, server_timeout: int, model_id: str | None, model_version: str | None, workers_per_resource: t.Literal['conserved', 'round_robin'] | LiteralString, device: t.Tuple[str, ...], - quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, - serialisation_format: t.Literal['safetensors', 'legacy'], cors: bool, adapter_id: str | None, - return_process: bool, **attrs: t.Any, - ) -> LLMConfig | subprocess.Popen[bytes]: - if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env( - 'OPENLLM_SERIALIZATION_WARNING'): + quantize: t.Literal['int8', 'int4', 'gptq'] | None, backend: LiteralBackend, serialisation_format: t.Literal['safetensors', 'legacy'], + cors: bool, adapter_id: str | None, return_process: bool, **attrs: t.Any, + ) -> LLMConfig | subprocess.Popen[bytes]: + if serialisation_format == 'safetensors' and quantize is not None and openllm_core.utils.check_bool_env('OPENLLM_SERIALIZATION_WARNING'): termui.echo( f"'--quantize={quantize}' might not work with 'safetensors' serialisation format. Use with caution!. 
To silence this warning, set \"OPENLLM_SERIALIZATION_WARNING=False\"\nNote: You can always fallback to '--serialisation legacy' when running quantisation.", fg='yellow') @@ -184,10 +165,7 @@ Available official model_id(s): [default: {llm_config['default_id']}] wpr = float(wpr) # Create a new model env to work with the envvar during CLI invocation - env = openllm.utils.EnvVarMixin(config['model_name'], - backend, - model_id=model_id or config['default_id'], - quantize=quantize) + env = openllm.utils.EnvVarMixin(config['model_name'], backend, model_id=model_id or config['default_id'], quantize=quantize) requirements = llm_config['requirements'] if requirements is not None and len(requirements) > 0: missing_requirements = [i for i in requirements if importlib.util.find_spec(inflection.underscore(i)) is None] @@ -218,17 +196,14 @@ Available official model_id(s): [default: {llm_config['default_id']}] serialisation=serialisation_format) start_env.update({env.config: llm.config.model_dump_json().decode()}) - server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer( - '_service:svc', **server_attrs) + server = bentoml.GrpcServer('_service:svc', **server_attrs) if _serve_grpc else bentoml.HTTPServer('_service:svc', **server_attrs) openllm.utils.analytics.track_start_init(llm.config) def next_step(model_name: str, adapter_map: DictStrAny | None) -> None: cmd_name = f'openllm build {model_name}' if adapter_map is not None: - cmd_name += ' ' + ' '.join([ - f'--adapter-id {s}' - for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()] - ]) + cmd_name += ' ' + ' '.join( + [f'--adapter-id {s}' for s in [f'{p}:{name}' if name not in (None, 'default') else p for p, name in adapter_map.items()]]) if not openllm.utils.get_quiet_mode(): termui.echo(f"\n🚀 Next step: run '{cmd_name}' to create a Bento for {model_name}", fg='blue') @@ -265,17 +240,13 @@ def noop_command(group: click.Group, llm_config: LLMConfig, _serve_grpc: bool, * return noop def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callable[[FC], t.Callable[[FC], FC]]: - def wrapper(fn: FC) -> t.Callable[[FC], FC]: composed = openllm.utils.compose( llm_config.to_click_options, _http_server_args if not serve_grpc else _grpc_server_args, - cog.optgroup.group( - 'General LLM Options', - help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."), + cog.optgroup.group('General LLM Options', help=f"The following options are related to running '{llm_config['start_name']}' LLM Server."), model_id_option(factory=cog.optgroup), model_version_option(factory=cog.optgroup), cog.optgroup.option('--server-timeout', type=int, default=None, help='Server timeout in seconds'), - workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), - backend_option(factory=cog.optgroup), + workers_per_resource_option(factory=cog.optgroup), cors_option(factory=cog.optgroup), backend_option(factory=cog.optgroup), cog.optgroup.group('LLM Optimization Options', help='''Optimization related options. 
@@ -286,7 +257,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab - DeepSpeed Inference: [link](https://www.deepspeed.ai/inference/) - GGML: Fast inference on [bare metal](https://github.com/ggerganov/ggml) ''', - ), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup), + ), quantize_option(factory=cog.optgroup), serialisation_option(factory=cog.optgroup), cog.optgroup.option('--device', type=openllm.utils.dantic.CUDA, multiple=True, @@ -312,8 +283,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab '''), cog.optgroup.option('--adapter-id', default=None, - help='Optional name or path for given LoRA adapter' + - f" to wrap '{llm_config['model_name']}'", + help='Optional name or path for given LoRA adapter' + f" to wrap '{llm_config['model_name']}'", multiple=True, callback=_id_callback, metavar='[PATH | [remote/][adapter_name:]adapter_id][, ...]'), @@ -323,8 +293,7 @@ def start_decorator(llm_config: LLMConfig, serve_grpc: bool = False) -> t.Callab return wrapper -def parse_device_callback(ctx: click.Context, param: click.Parameter, - value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None: +def parse_device_callback(ctx: click.Context, param: click.Parameter, value: tuple[tuple[str], ...] | None) -> t.Tuple[str, ...] | None: if value is None: return value if not isinstance(value, tuple): ctx.fail(f'{param} only accept multiple values, not {type(value)} (value: {value})') el: t.Tuple[str, ...] = tuple(i for k in value for i in k) @@ -342,19 +311,15 @@ def parse_serve_args(serve_grpc: bool) -> t.Callable[[t.Callable[..., LLMConfig] from bentoml_cli.cli import cli command = 'serve' if not serve_grpc else 'serve-grpc' - group = cog.optgroup.group( - f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", - help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]", - ) + group = cog.optgroup.group(f"Start a {'HTTP' if not serve_grpc else 'gRPC'} server options", + help=f"Related to serving the model [synonymous to `bentoml {'serve-http' if not serve_grpc else command }`]", + ) def decorator(f: t.Callable[Concatenate[int, t.Optional[str], P], LLMConfig]) -> t.Callable[[FC], FC]: serve_command = cli.commands[command] # The first variable is the argument bento # The last five is from BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS - serve_options = [ - p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] - if p.name not in _IGNORED_OPTIONS - ] + serve_options = [p for p in serve_command.params[1:-BentoMLCommandGroup.NUMBER_OF_COMMON_PARAMS] if p.name not in _IGNORED_OPTIONS] for options in reversed(serve_options): attrs = options.to_info_dict() # we don't need param_type_name, since it should all be options @@ -391,10 +356,7 @@ def _click_factory_type(*param_decls: t.Any, **attrs: t.Any) -> t.Callable[[FC | cli_option = functools.partial(_click_factory_type, attr='option') cli_argument = functools.partial(_click_factory_type, attr='argument') -def output_option(f: _AnyCallable | None = None, - *, - default_value: LiteralOutput = 'pretty', - **attrs: t.Any) -> t.Callable[[FC], FC]: +def output_option(f: _AnyCallable | None = None, *, default_value: LiteralOutput = 'pretty', **attrs: t.Any) -> t.Callable[[FC], FC]: output = ['json', 'pretty', 'porcelain'] def complete_output_var(ctx: click.Context, param: click.Parameter, incomplete: str) -> list[CompletionItem]: @@ -434,12 +396,11 @@ def model_id_option(f: 
_AnyCallable | None = None, **attrs: t.Any) -> t.Callable **attrs)(f) def model_version_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option( - '--model-version', - type=click.STRING, - default=None, - help='Optional model version to save for this model. It will be inferred automatically from model-id.', - **attrs)(f) + return cli_option('--model-version', + type=click.STRING, + default=None, + help='Optional model version to save for this model. It will be inferred automatically from model-id.', + **attrs)(f) def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: # NOTE: LiteralBackend needs to remove the last two item as ggml and mlc is wip @@ -453,10 +414,7 @@ def backend_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[ **attrs)(f) def model_name_argument(f: _AnyCallable | None = None, required: bool = True, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_argument('model_name', - type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), - required=required, - **attrs)(f) + return cli_argument('model_name', type=click.Choice([inflection.dasherize(name) for name in openllm.CONFIG_MAPPING]), required=required, **attrs)(f) def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--quantise', @@ -482,10 +440,7 @@ def quantize_option(f: _AnyCallable | None = None, *, build: bool = False, **att > [!NOTE] that quantization are currently only available in *PyTorch* models.''', **attrs)(f) -def workers_per_resource_option(f: _AnyCallable | None = None, - *, - build: bool = False, - **attrs: t.Any) -> t.Callable[[FC], FC]: +def workers_per_resource_option(f: _AnyCallable | None = None, *, build: bool = False, **attrs: t.Any) -> t.Callable[[FC], FC]: return cli_option('--workers-per-resource', default=None, callback=workers_per_resource_callback, @@ -536,18 +491,16 @@ def serialisation_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Cal **attrs)(f) def container_registry_option(f: _AnyCallable | None = None, **attrs: t.Any) -> t.Callable[[FC], FC]: - return cli_option( - '--container-registry', - 'container_registry', - type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), - default='ecr', - show_default=True, - show_envvar=True, - envvar='OPENLLM_CONTAINER_REGISTRY', - callback=container_registry_callback, - help= - 'The default container registry to get the base image for building BentoLLM. Currently, it supports ecr, ghcr, docker', - **attrs)(f) + return cli_option('--container-registry', + 'container_registry', + type=click.Choice(list(openllm.bundle.CONTAINER_NAMES)), + default='ecr', + show_default=True, + show_envvar=True, + envvar='OPENLLM_CONTAINER_REGISTRY', + callback=container_registry_callback, + help='The default container registry to get the base image for building BentoLLM. 
Currently, it supports ecr, ghcr, docker', + **attrs)(f) _wpr_strategies = {'round_robin', 'conserved'} @@ -559,9 +512,8 @@ def workers_per_resource_callback(ctx: click.Context, param: click.Parameter, va try: float(value) # type: ignore[arg-type] except ValueError: - raise click.BadParameter( - f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", - ctx, param) from None + raise click.BadParameter(f"'workers_per_resource' only accept '{_wpr_strategies}' as possible strategies, otherwise pass in float.", ctx, + param) from None else: return value diff --git a/openllm-python/src/openllm/cli/_sdk.py b/openllm-python/src/openllm/cli/_sdk.py index 3ac7a11a..98f1c7eb 100644 --- a/openllm-python/src/openllm/cli/_sdk.py +++ b/openllm-python/src/openllm/cli/_sdk.py @@ -84,8 +84,7 @@ def _start(model_name: str, from .entrypoint import start_grpc_command llm_config = openllm.AutoConfig.for_model(model_name) _ModelEnv = openllm_core.utils.EnvVarMixin(model_name, - backend=openllm_core.utils.first_not_none( - backend, default=llm_config.default_backend()), + backend=openllm_core.utils.first_not_none(backend, default=llm_config.default_backend()), model_id=model_id, quantize=quantize) os.environ[_ModelEnv.backend] = _ModelEnv['backend_value'] @@ -94,26 +93,19 @@ def _start(model_name: str, if model_id: args.extend(['--model-id', model_id]) if timeout: args.extend(['--server-timeout', str(timeout)]) if workers_per_resource: - args.extend([ - '--workers-per-resource', - str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource - ]) + args.extend(['--workers-per-resource', str(workers_per_resource) if not isinstance(workers_per_resource, str) else workers_per_resource]) if device and not os.environ.get('CUDA_VISIBLE_DEVICES'): args.extend(['--device', ','.join(device)]) if quantize: args.extend(['--quantize', str(quantize)]) if cors: args.append('--cors') if adapter_map: - args.extend( - list( - itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items() - ]))) + args.extend(list(itertools.chain.from_iterable([['--adapter-id', f"{k}{':'+v if v else ''}"] for k, v in adapter_map.items()]))) if additional_args: args.extend(additional_args) if __test__: args.append('--return-process') return start_command_factory(start_command if not _serve_grpc else start_grpc_command, model_name, _context_settings=termui.CONTEXT_SETTINGS, - _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, - standalone_mode=False) + _serve_grpc=_serve_grpc).main(args=args if len(args) > 0 else None, standalone_mode=False) @inject def _build(model_name: str, @@ -180,9 +172,7 @@ def _build(model_name: str, Returns: ``bentoml.Bento | str``: BentoLLM instance. This can be used to serve the LLM or can be pushed to BentoCloud. 
""" - args: list[str] = [ - sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format - ] + args: list[str] = [sys.executable, '-m', 'openllm', 'build', model_name, '--machine', '--serialisation', serialisation_format] if quantize: args.extend(['--quantize', quantize]) if containerize and push: raise OpenLLMException("'containerize' and 'push' are currently mutually exclusive.") if push: args.extend(['--push']) @@ -265,8 +255,7 @@ def _list_models() -> dict[str, t.Any]: from .entrypoint import models_command return models_command.main(args=['-o', 'json', '--show-available', '--machine'], standalone_mode=False) -start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk( - _start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk( - _start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk( - _import_model), openllm_core.utils.codegen.gen_sdk(_list_models) +start, start_grpc, build, import_model, list_models = openllm_core.utils.codegen.gen_sdk(_start, _serve_grpc=False), openllm_core.utils.codegen.gen_sdk( + _start, _serve_grpc=True), openllm_core.utils.codegen.gen_sdk(_build), openllm_core.utils.codegen.gen_sdk( + _import_model), openllm_core.utils.codegen.gen_sdk(_list_models) __all__ = ['start', 'start_grpc', 'build', 'import_model', 'list_models'] diff --git a/openllm-python/src/openllm/cli/extension/build_base_container.py b/openllm-python/src/openllm/cli/extension/build_base_container.py index 05cc7827..16d3c50f 100644 --- a/openllm-python/src/openllm/cli/extension/build_base_container.py +++ b/openllm-python/src/openllm/cli/extension/build_base_container.py @@ -34,8 +34,8 @@ if t.TYPE_CHECKING: help='Version strategy to use for tagging the image.') @click.option('--push/--no-push', help='Whether to push to remote repository', is_flag=True, default=False) @machine_option -def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, - version_strategy: LiteralContainerVersionStrategy, push: bool, machine: bool) -> dict[str, str]: +def cli(container_registry: tuple[LiteralContainerRegistry, ...] | None, version_strategy: LiteralContainerVersionStrategy, push: bool, + machine: bool) -> dict[str, str]: mapping = openllm.bundle.build_container(container_registry, version_strategy, push, machine) if machine: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white') return mapping diff --git a/openllm-python/src/openllm/cli/extension/dive_bentos.py b/openllm-python/src/openllm/cli/extension/dive_bentos.py index 44bf9fbc..4126000b 100644 --- a/openllm-python/src/openllm/cli/extension/dive_bentos.py +++ b/openllm-python/src/openllm/cli/extension/dive_bentos.py @@ -24,10 +24,7 @@ if t.TYPE_CHECKING: @machine_option @click.pass_context @inject -def cli(ctx: click.Context, - bento: str, - machine: bool, - _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None: +def cli(ctx: click.Context, bento: str, machine: bool, _bento_store: BentoStore = Provide[BentoMLContainer.bento_store]) -> str | None: '''Dive into a BentoLLM. 
This is synonymous to cd $(b get : -o path).''' try: bentomodel = _bento_store.get(bento) diff --git a/openllm-python/src/openllm/cli/extension/get_containerfile.py b/openllm-python/src/openllm/cli/extension/get_containerfile.py index 8585d370..df3c8f0b 100644 --- a/openllm-python/src/openllm/cli/extension/get_containerfile.py +++ b/openllm-python/src/openllm/cli/extension/get_containerfile.py @@ -19,9 +19,7 @@ from openllm_core.utils import bentoml_cattr if t.TYPE_CHECKING: from bentoml._internal.bento import BentoStore -@click.command('get_containerfile', - context_settings=termui.CONTEXT_SETTINGS, - help='Return Containerfile of any given Bento.') +@click.command('get_containerfile', context_settings=termui.CONTEXT_SETTINGS, help='Return Containerfile of any given Bento.') @click.argument('bento', type=str, shell_complete=bento_complete_envvar) @click.pass_context @inject diff --git a/openllm-python/src/openllm/cli/extension/get_prompt.py b/openllm-python/src/openllm/cli/extension/get_prompt.py index 7ea4b7f1..a3b66bc6 100644 --- a/openllm-python/src/openllm/cli/extension/get_prompt.py +++ b/openllm-python/src/openllm/cli/extension/get_prompt.py @@ -32,8 +32,8 @@ LiteralOutput = t.Literal['json', 'pretty', 'porcelain'] callback=opt_callback, metavar='ARG=VALUE[,ARG=VALUE]') @click.pass_context -def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, - _memoized: dict[str, t.Any], **_: t.Any) -> str | None: +def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, output: LiteralOutput, machine: bool, _memoized: dict[str, t.Any], + **_: t.Any) -> str | None: '''Get the default prompt used by OpenLLM.''' module = openllm.utils.EnvVarMixin(model_name).module _memoized = {k: v[0] for k, v in _memoized.items() if v} @@ -46,15 +46,11 @@ def cli(ctx: click.Context, /, model_name: str, prompt: str, format: str | None, if format is None: if not hasattr(module, 'PROMPT_MAPPING') or module.PROMPT_MAPPING is None: raise RuntimeError('Failed to find prompt mapping while DEFAULT_PROMPT_TEMPLATE is a function.') - raise click.BadOptionUsage( - 'format', - f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})") + raise click.BadOptionUsage('format', f"{model_name} prompt requires passing '--format' (available format: {list(module.PROMPT_MAPPING)})") if prompt_mapping is None: - raise click.BadArgumentUsage( - f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None + raise click.BadArgumentUsage(f'Failed to fine prompt mapping while the default prompt for {model_name} is a callable.') from None if format not in prompt_mapping: - raise click.BadOptionUsage( - 'format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})') + raise click.BadOptionUsage('format', f'Given format {format} is not valid for {model_name} (available format: {list(prompt_mapping)})') _prompt_template = template(format) else: _prompt_template = template diff --git a/openllm-python/src/openllm/cli/extension/list_bentos.py b/openllm-python/src/openllm/cli/extension/list_bentos.py index 67e77d89..3169c878 100644 --- a/openllm-python/src/openllm/cli/extension/list_bentos.py +++ b/openllm-python/src/openllm/cli/extension/list_bentos.py @@ -19,26 +19,28 @@ def cli(ctx: click.Context, output: LiteralOutput) -> None: '''List available bentos built by OpenLLM.''' mapping = { k: [{ - 'tag': - str(b.tag), - 'size': - 
human_readable_size(openllm.utils.calc_dir_size(b.path)), + 'tag': str(b.tag), + 'size': human_readable_size(openllm.utils.calc_dir_size(b.path)), 'models': [{ 'tag': str(m.tag), 'size': human_readable_size(openllm.utils.calc_dir_size(m.path)) - } for m in (bentoml.models.get(_.tag) for _ in b.info.models)] - } for b in tuple(i for i in bentoml.list() if all( - k in i.info.labels for k in {'start_name', 'bundler'})) if b.info.labels['start_name'] == k - ] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) + } + for m in (bentoml.models.get(_.tag) + for _ in b.info.models)] + } + for b in tuple(i + for i in bentoml.list() + if all(k in i.info.labels + for k in {'start_name', 'bundler'})) + if b.info.labels['start_name'] == k] for k in tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) } mapping = {k: v for k, v in mapping.items() if v} if output == 'pretty': import tabulate tabulate.PRESERVE_WHITESPACE = True - termui.echo(tabulate.tabulate( - [(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v], - tablefmt='fancy_grid', - headers=['LLM', 'Tag', 'Size', 'Models']), + termui.echo(tabulate.tabulate([(k, i['tag'], i['size'], [_['tag'] for _ in i['models']]) for k, v in mapping.items() for i in v], + tablefmt='fancy_grid', + headers=['LLM', 'Tag', 'Size', 'Models']), fg='white') else: termui.echo(orjson.dumps(mapping, option=orjson.OPT_INDENT_2).decode(), fg='white') diff --git a/openllm-python/src/openllm/cli/extension/list_models.py b/openllm-python/src/openllm/cli/extension/list_models.py index 36bc9ce6..2d87560e 100644 --- a/openllm-python/src/openllm/cli/extension/list_models.py +++ b/openllm-python/src/openllm/cli/extension/list_models.py @@ -26,17 +26,14 @@ def cli(model_name: str | None, output: LiteralOutput) -> DictStrAny: models = tuple(inflection.dasherize(key) for key in openllm.CONFIG_MAPPING.keys()) ids_in_local_store = { k: [ - i for i in bentoml.models.list() if 'framework' in i.info.labels and - i.info.labels['framework'] == 'openllm' and 'model_name' in i.info.labels and i.info.labels['model_name'] == k + i for i in bentoml.models.list() if 'framework' in i.info.labels and i.info.labels['framework'] == 'openllm' and + 'model_name' in i.info.labels and i.info.labels['model_name'] == k ] for k in models } if model_name is not None: ids_in_local_store = { - k: [ - i - for i in v - if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name) - ] for k, v in ids_in_local_store.items() + k: [i for i in v if 'model_name' in i.info.labels and i.info.labels['model_name'] == inflection.dasherize(model_name)] + for k, v in ids_in_local_store.items() } ids_in_local_store = {k: v for k, v in ids_in_local_store.items() if v} local_models = { diff --git a/openllm-python/src/openllm/cli/extension/playground.py b/openllm-python/src/openllm/cli/extension/playground.py index e06756c0..1fd2c0fa 100644 --- a/openllm-python/src/openllm/cli/extension/playground.py +++ b/openllm-python/src/openllm/cli/extension/playground.py @@ -34,12 +34,7 @@ def load_notebook_metadata() -> DictStrAny: @click.command('playground', context_settings=termui.CONTEXT_SETTINGS) @click.argument('output-dir', default=None, required=False) -@click.option('--port', - envvar='JUPYTER_PORT', - show_envvar=True, - show_default=True, - default=8888, - help='Default port for Jupyter server') +@click.option('--port', envvar='JUPYTER_PORT', show_envvar=True, show_default=True, default=8888, 
help='Default port for Jupyter server') @click.pass_context def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: """OpenLLM Playground. @@ -60,9 +55,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: > This command requires Jupyter to be installed. Install it with 'pip install "openllm[playground]"' """ if not is_jupyter_available() or not is_jupytext_available() or not is_notebook_available(): - raise RuntimeError( - "Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'" - ) + raise RuntimeError("Playground requires 'jupyter', 'jupytext', and 'notebook'. Install it with 'pip install \"openllm[playground]\"'") metadata = load_notebook_metadata() _temp_dir = False if output_dir is None: @@ -74,8 +67,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: termui.echo('The playground notebooks will be saved to: ' + os.path.abspath(output_dir), fg='blue') for module in pkgutil.iter_modules(playground.__path__): if module.ispkg or os.path.exists(os.path.join(output_dir, module.name + '.ipynb')): - logger.debug('Skipping: %s (%s)', module.name, - 'File already exists' if not module.ispkg else f'{module.name} is a module') + logger.debug('Skipping: %s (%s)', module.name, 'File already exists' if not module.ispkg else f'{module.name} is a module') continue if not isinstance(module.module_finder, importlib.machinery.FileFinder): continue termui.echo('Generating notebook for: ' + module.name, fg='magenta') @@ -84,10 +76,7 @@ def cli(ctx: click.Context, output_dir: str | None, port: int) -> None: f.cells.insert(0, markdown_cell) jupytext.write(f, os.path.join(output_dir, module.name + '.ipynb'), fmt='notebook') try: - subprocess.check_output([ - sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port', - str(port), '--no-browser', '--debug' - ]) + subprocess.check_output([sys.executable, '-m', 'jupyter', 'notebook', '--notebook-dir', output_dir, '--port', str(port), '--no-browser', '--debug']) except subprocess.CalledProcessError as e: termui.echo(e.output, fg='red') raise click.ClickException(f'Failed to start a jupyter server:\n{e}') from None diff --git a/openllm-python/src/openllm/cli/termui.py b/openllm-python/src/openllm/cli/termui.py index 78f0d90a..c82d3c95 100644 --- a/openllm-python/src/openllm/cli/termui.py +++ b/openllm-python/src/openllm/cli/termui.py @@ -16,9 +16,5 @@ def echo(text: t.Any, fg: str = 'green', _with_style: bool = True, **attrs: t.An t.cast(t.Callable[..., None], click.echo if not _with_style else click.secho)(text, **attrs) COLUMNS: int = int(os.environ.get('COLUMNS', str(120))) -CONTEXT_SETTINGS: DictStrAny = { - 'help_option_names': ['-h', '--help'], - 'max_content_width': COLUMNS, - 'token_normalize_func': inflection.underscore -} +CONTEXT_SETTINGS: DictStrAny = {'help_option_names': ['-h', '--help'], 'max_content_width': COLUMNS, 'token_normalize_func': inflection.underscore} __all__ = ['echo', 'COLUMNS', 'CONTEXT_SETTINGS'] diff --git a/openllm-python/src/openllm/models/auto/factory.py b/openllm-python/src/openllm/models/auto/factory.py index 67650f23..fb73cf56 100644 --- a/openllm-python/src/openllm/models/auto/factory.py +++ b/openllm-python/src/openllm/models/auto/factory.py @@ -30,9 +30,7 @@ class BaseAutoLLMClass: _model_mapping: t.ClassVar[_LazyAutoMapping] def __init__(self, *args: t.Any, **attrs: t.Any): - raise EnvironmentError( - f"Cannot instantiate {self.__class__.__name__} directly. 
Please use '{self.__class__.__name__}.Runner(model_name)' instead." - ) + raise EnvironmentError(f"Cannot instantiate {self.__class__.__name__} directly. Please use '{self.__class__.__name__}.Runner(model_name)' instead.") @classmethod def for_model(cls, @@ -50,10 +48,7 @@ class BaseAutoLLMClass: >>> llm = openllm.AutoLLM.for_model("flan-t5") ``` ''' - llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, - model_version=model_version, - llm_config=llm_config, - **attrs) + llm = cls.infer_class_from_name(model).from_pretrained(model_id=model_id, model_version=model_version, llm_config=llm_config, **attrs) if ensure_available: llm.ensure_model_id_exists() return llm @@ -116,9 +111,7 @@ class _LazyAutoMapping(OrderedDict, ReprMixin): This OrderedDict values() and keys() returns the list instead, so you don't have to do list(mapping.values()) to get the list of values. """ - - def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], - model_mapping: OrderedDict[LiteralString, LiteralString]): + def __init__(self, config_mapping: OrderedDict[LiteralString, LiteralString], model_mapping: OrderedDict[LiteralString, LiteralString]): self._config_mapping = config_mapping self._reverse_config_mapping = {v: k for k, v in config_mapping.items()} self._model_mapping = model_mapping @@ -153,32 +146,26 @@ class _LazyAutoMapping(OrderedDict, ReprMixin): return ReprMixin.__repr__(self) def __repr_args__(self) -> t.Generator[tuple[str, tuple[str, str]], t.Any, t.Any]: - yield from ((key, (value, self._model_mapping[key])) - for key, value in self._config_mapping.items() - if key in self._model_mapping) + yield from ((key, (value, self._model_mapping[key])) for key, value in self._config_mapping.items() if key in self._model_mapping) def __bool__(self) -> bool: return bool(self.keys()) def keys(self) -> ConfigModelKeysView: - return t.cast('ConfigModelKeysView', [ - self._load_attr_from_module(key, name) - for key, name in self._config_mapping.items() - if key in self._model_mapping.keys() - ] + list(self._extra_content.keys())) + return t.cast('ConfigModelKeysView', + [self._load_attr_from_module(key, name) for key, name in self._config_mapping.items() if key in self._model_mapping.keys()] + + list(self._extra_content.keys())) def values(self) -> ConfigModelValuesView: - return t.cast('ConfigModelValuesView', [ - self._load_attr_from_module(key, name) - for key, name in self._model_mapping.items() - if key in self._config_mapping.keys() - ] + list(self._extra_content.values())) + return t.cast('ConfigModelValuesView', + [self._load_attr_from_module(key, name) for key, name in self._model_mapping.items() if key in self._config_mapping.keys()] + + list(self._extra_content.values())) def items(self) -> ConfigModelItemsView: - return t.cast('ConfigModelItemsView', [(self._load_attr_from_module( - key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key])) - for key in self._model_mapping.keys() - if key in self._config_mapping.keys()] + list(self._extra_content.items())) + return t.cast('ConfigModelItemsView', + [(self._load_attr_from_module(key, self._config_mapping[key]), self._load_attr_from_module(key, self._model_mapping[key])) + for key in self._model_mapping.keys() + if key in self._config_mapping.keys()] + list(self._extra_content.items())) def __iter__(self) -> t.Iterator[type[openllm.LLMConfig]]: return iter(t.cast('SupportsIter[t.Iterator[type[openllm.LLMConfig]]]', self.keys())) diff --git 
a/openllm-python/src/openllm/models/auto/modeling_auto.py b/openllm-python/src/openllm/models/auto/modeling_auto.py
index bbd17b1b..dfcd3b55 100644
--- a/openllm-python/src/openllm/models/auto/modeling_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_auto.py
@@ -7,10 +7,9 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
-MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'),
- ('flan_t5', 'FlanT5'), ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'),
- ('opt', 'OPT'), ('stablelm', 'StableLM'), ('starcoder', 'StarCoder'),
- ('baichuan', 'Baichuan')])
+MODEL_MAPPING_NAMES = OrderedDict([('chatglm', 'ChatGLM'), ('dolly_v2', 'DollyV2'), ('falcon', 'Falcon'), ('flan_t5', 'FlanT5'),
+ ('gpt_neox', 'GPTNeoX'), ('llama', 'Llama'), ('mpt', 'MPT'), ('opt', 'OPT'), ('stablelm', 'StableLM'),
+ ('starcoder', 'StarCoder'), ('baichuan', 'Baichuan')])
MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES)
class AutoLLM(BaseAutoLLMClass):
diff --git a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
index eac69df5..e7538975 100644
--- a/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
+++ b/openllm-python/src/openllm/models/auto/modeling_vllm_auto.py
@@ -7,9 +7,8 @@ from openllm_core.config import CONFIG_MAPPING_NAMES
from .factory import BaseAutoLLMClass
from .factory import _LazyAutoMapping
-MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'),
- ('falcon', 'VLLMFalcon'), ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'),
- ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
+MODEL_VLLM_MAPPING_NAMES = OrderedDict([('baichuan', 'VLLMBaichuan'), ('dolly_v2', 'VLLMDollyV2'), ('falcon', 'VLLMFalcon'),
+ ('gpt_neox', 'VLLMGPTNeoX'), ('mpt', 'VLLMMPT'), ('opt', 'VLLMOPT'), ('stablelm', 'VLLMStableLM'),
 ('starcoder', 'VLLMStarCoder'), ('llama', 'VLLMLlama')])
MODEL_VLLM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_VLLM_MAPPING_NAMES)
diff --git a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
index 78476403..75a52794 100644
--- a/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
+++ b/openllm-python/src/openllm/models/baichuan/modeling_baichuan.py
@@ -11,6 +11,5 @@ class Baichuan(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrai
 import torch
 inputs = self.tokenizer(prompt, return_tensors='pt').to(self.device)
 with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined]
- outputs = self.model.generate(**inputs,
- generation_config=self.config.model_construct_env(**attrs).to_generation_config())
+ outputs = self.model.generate(**inputs, generation_config=self.config.model_construct_env(**attrs).to_generation_config())
 return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
diff --git a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
index 93f286e7..0bed146c 100644
--- a/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
+++ b/openllm-python/src/openllm/models/chatglm/modeling_chatglm.py
@@ -14,9 +14,7 @@ class ChatGLM(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrain
 self.model.eval()
 # Only use half precision if the model is not yet
quantized if self.config.use_half_precision: self.model.half() - return self.model.chat(self.tokenizer, - prompt, - generation_config=self.config.model_construct_env(**attrs).to_generation_config()) + return self.model.chat(self.tokenizer, prompt, generation_config=self.config.model_construct_env(**attrs).to_generation_config()) def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput: import torch diff --git a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py index 7af31f25..9ad1be40 100644 --- a/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py +++ b/openllm-python/src/openllm/models/dolly_v2/modeling_dolly_v2.py @@ -11,8 +11,9 @@ from openllm_core.config.configuration_dolly_v2 import RESPONSE_KEY from openllm_core.config.configuration_dolly_v2 import get_special_token_id if t.TYPE_CHECKING: import torch, transformers, tensorflow as tf else: - torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader( - 'transformers', globals(), 'transformers'), openllm.utils.LazyLoader('tf', globals(), 'tensorflow') + torch, transformers, tf = openllm.utils.LazyLoader('torch', globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), + 'transformers'), openllm.utils.LazyLoader( + 'tf', globals(), 'tensorflow') logger = logging.getLogger(__name__) @overload @@ -35,22 +36,8 @@ def get_pipeline(model: transformers.PreTrainedModel, **attrs: t.Any) -> type[transformers.Pipeline] | transformers.Pipeline: # Lazy loading the pipeline. See databricks' implementation on HuggingFace for more information. class InstructionTextGenerationPipeline(transformers.Pipeline): - - def __init__(self, - *args: t.Any, - do_sample: bool = True, - max_new_tokens: int = 256, - top_p: float = 0.92, - top_k: int = 0, - **kwargs: t.Any): - super().__init__(*args, - model=model, - tokenizer=tokenizer, - do_sample=do_sample, - max_new_tokens=max_new_tokens, - top_p=top_p, - top_k=top_k, - **kwargs) + def __init__(self, *args: t.Any, do_sample: bool = True, max_new_tokens: int = 256, top_p: float = 0.92, top_k: int = 0, **kwargs: t.Any): + super().__init__(*args, model=model, tokenizer=tokenizer, do_sample=do_sample, max_new_tokens=max_new_tokens, top_p=top_p, top_k=top_k, **kwargs) def _sanitize_parameters(self, return_full_text: bool | None = None, @@ -59,8 +46,7 @@ def get_pipeline(model: transformers.PreTrainedModel, preprocess_params: dict[str, t.Any] = {} # newer versions of the tokenizer configure the response key as a special token. newer versions still may # append a newline to yield a single token. find whatever token is configured for the response key. 
- tokenizer_response_key = next( - (token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None) + tokenizer_response_key = next((token for token in self.tokenizer.additional_special_tokens if token.startswith(RESPONSE_KEY)), None) response_key_token_id = None end_key_token_id = None if tokenizer_response_key: @@ -84,17 +70,15 @@ def get_pipeline(model: transformers.PreTrainedModel, inputs['instruction_text'] = input_ return t.cast(t.Dict[str, t.Any], inputs) - def _forward(self, input_tensors: dict[str, t.Any], - **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput: + def _forward(self, input_tensors: dict[str, t.Any], **generate_kwargs: t.Any) -> transformers.utils.generic.ModelOutput: if t.TYPE_CHECKING: assert self.tokenizer is not None input_ids, attention_mask = input_tensors['input_ids'], input_tensors.get('attention_mask', None) if input_ids.shape[1] == 0: input_ids, attention_mask, in_b = None, None, 1 else: in_b = input_ids.shape[0] - generated_sequence = self.model.generate( - input_ids=input_ids.to(self.model.device) if input_ids is not None else None, - attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, - pad_token_id=self.tokenizer.pad_token_id, - **generate_kwargs) + generated_sequence = self.model.generate(input_ids=input_ids.to(self.model.device) if input_ids is not None else None, + attention_mask=attention_mask.to(self.model.device) if attention_mask is not None else None, + pad_token_id=self.tokenizer.pad_token_id, + **generate_kwargs) out_b = generated_sequence.shape[0] if self.framework == 'pt': generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:]) @@ -162,10 +146,7 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: - return { - 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, - 'torch_dtype': torch.bfloat16 - }, {} + return {'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None, 'torch_dtype': torch.bfloat16}, {} def load_model(self, *args: t.Any, **attrs: t.Any) -> transformers.Pipeline: return get_pipeline(transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, *args, **attrs), @@ -176,6 +157,4 @@ class DollyV2(openllm.LLM['transformers.Pipeline', 'transformers.PreTrainedToken def generate(self, prompt: str, **attrs: t.Any) -> list[dict[t.Literal['generated_text'], str]]: llm_config = self.config.model_construct_env(**attrs) with torch.inference_mode(): - return self.model(prompt, - return_full_text=llm_config.return_full_text, - generation_config=llm_config.to_generation_config()) + return self.model(prompt, return_full_text=llm_config.return_full_text, generation_config=llm_config.to_generation_config()) diff --git a/openllm-python/src/openllm/models/falcon/modeling_falcon.py b/openllm-python/src/openllm/models/falcon/modeling_falcon.py index 866b6c16..d0cf4837 100644 --- a/openllm-python/src/openllm/models/falcon/modeling_falcon.py +++ b/openllm-python/src/openllm/models/falcon/modeling_falcon.py @@ -4,42 +4,31 @@ import typing as t import openllm if t.TYPE_CHECKING: import torch, transformers else: - torch, transformers = openllm.utils.LazyLoader('torch', globals(), - 'torch'), openllm.utils.LazyLoader('transformers', globals(), - 'transformers') + torch, transformers = openllm.utils.LazyLoader('torch', 
globals(), 'torch'), openllm.utils.LazyLoader('transformers', globals(), 'transformers') class Falcon(openllm.LLM['transformers.PreTrainedModel', 'transformers.PreTrainedTokenizerBase']): __openllm_internal__ = True @property def import_kwargs(self) -> tuple[dict[str, t.Any], dict[str, t.Any]]: - return { - 'torch_dtype': torch.bfloat16, - 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None - }, {} + return {'torch_dtype': torch.bfloat16, 'device_map': 'auto' if torch.cuda.is_available() and torch.cuda.device_count() > 1 else None}, {} def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - eos_token_id, inputs = attrs.pop('eos_token_id', - self.tokenizer.eos_token_id), self.tokenizer(prompt, - return_tensors='pt').to(self.device) + eos_token_id, inputs = attrs.pop('eos_token_id', self.tokenizer.eos_token_id), self.tokenizer(prompt, return_tensors='pt').to(self.device) with torch.inference_mode(), torch.autocast('cuda', dtype=torch.float16): # type: ignore[attr-defined] - return self.tokenizer.batch_decode(self.model.generate( - input_ids=inputs['input_ids'], - attention_mask=inputs['attention_mask'], - generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, **attrs).to_generation_config()), + return self.tokenizer.batch_decode(self.model.generate(input_ids=inputs['input_ids'], + attention_mask=inputs['attention_mask'], + generation_config=self.config.model_construct_env(eos_token_id=eos_token_id, + **attrs).to_generation_config()), skip_special_tokens=True) - def generate_one(self, prompt: str, stop: list[str], - **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: - max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer( - prompt, return_tensors='pt').to(self.device) - src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop( - 'stopping_criteria', openllm.StoppingCriteriaList([])) + def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: + max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device) + src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', + openllm.StoppingCriteriaList([])) stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer)) result = self.tokenizer.decode( - self.model.generate(encoded_inputs['input_ids'], - max_new_tokens=max_new_tokens, - stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) + self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) # Inference API returns the stop sequence for stop_seq in stop: if result.endswith(stop_seq): result = result[:-len(stop_seq)] diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py index 51a76400..601e0749 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flan_t5.py @@ -11,11 +11,10 @@ class FlanT5(openllm.LLM['transformers.T5ForConditionalGeneration', 'transformer def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch with torch.inference_mode(): - return self.tokenizer.batch_decode( - 
self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config()), - skip_special_tokens=True) + return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config()), + skip_special_tokens=True) def embeddings(self, prompts: list[str]) -> openllm.EmbeddingsOutput: import torch diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py index f8459b77..e49a447d 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_flax_flan_t5.py @@ -32,10 +32,9 @@ class FlaxFlanT5(openllm.LLM['transformers.FlaxT5ForConditionalGeneration', 'tra def generate(self, prompt: str, **attrs: t.Any) -> list[str]: # NOTE: decoder_start_token_id is extracted from https://huggingface.co/google/flan-t5-small/tree/main as it is required for encoder-decoder generation. decoder_start_token_id = attrs.pop('decoder_start_token_id', 0) - return self.tokenizer.batch_decode(self.model.generate( - self.tokenizer(prompt, return_tensors='np')['input_ids'], - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config(), - decoder_start_token_id=decoder_start_token_id).sequences, + return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors='np')['input_ids'], + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + decoder_start_token_id=decoder_start_token_id).sequences, skip_special_tokens=True, clean_up_tokenization_spaces=True) diff --git a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py index 09efcae1..9645af10 100644 --- a/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py +++ b/openllm-python/src/openllm/models/flan_t5/modeling_tf_flan_t5.py @@ -8,8 +8,7 @@ class TFFlanT5(openllm.LLM['transformers.TFT5ForConditionalGeneration', 'transfo __openllm_internal__ = True def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - return self.tokenizer.batch_decode(self.model.generate( - self.tokenizer(prompt, return_tensors='tf').input_ids, - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config()), + return self.tokenizer.batch_decode(self.model.generate(self.tokenizer(prompt, return_tensors='tf').input_ids, + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config()), skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/llama/modeling_llama.py b/openllm-python/src/openllm/models/llama/modeling_llama.py index b259fba8..54cf394e 100644 --- a/openllm-python/src/openllm/models/llama/modeling_llama.py +++ b/openllm-python/src/openllm/models/llama/modeling_llama.py @@ -26,17 +26,13 @@ class Llama(openllm.LLM['transformers.LlamaForCausalLM', 'transformers.LlamaToke return openllm.EmbeddingsOutput(embeddings=F.normalize(sum_embeddings / seq_length, p=2, dim=1).tolist(), num_tokens=int(torch.sum(attention_mask).item())) - def generate_one(self, prompt: str, stop: list[str], - **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: - max_new_tokens, 
encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer( - prompt, return_tensors='pt').to(self.device) - src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop( - 'stopping_criteria', openllm.StoppingCriteriaList([])) + def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: + max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device) + src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', + openllm.StoppingCriteriaList([])) stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer)) result = self.tokenizer.decode( - self.model.generate(encoded_inputs['input_ids'], - max_new_tokens=max_new_tokens, - stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) + self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) # Inference API returns the stop sequence for stop_seq in stop: if result.endswith(stop_seq): result = result[:-len(stop_seq)] diff --git a/openllm-python/src/openllm/models/mpt/modeling_mpt.py b/openllm-python/src/openllm/models/mpt/modeling_mpt.py index 33553246..d79532f8 100644 --- a/openllm-python/src/openllm/models/mpt/modeling_mpt.py +++ b/openllm-python/src/openllm/models/mpt/modeling_mpt.py @@ -48,11 +48,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken torch_dtype = attrs.pop('torch_dtype', torch.bfloat16 if torch.cuda.is_available() else torch.float32) device_map = attrs.pop('device_map', None) attrs.pop('low_cpu_mem_usage', None) - config = get_mpt_config(self.model_id, - self.config.max_sequence_length, - self.device, - device_map=device_map, - trust_remote_code=trust_remote_code) + config = get_mpt_config(self.model_id, self.config.max_sequence_length, self.device, device_map=device_map, trust_remote_code=trust_remote_code) tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **tokenizer_attrs) if tokenizer.pad_token_id is None: tokenizer.pad_token = tokenizer.eos_token model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, @@ -62,10 +58,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken device_map=device_map, **attrs) try: - return bentoml.transformers.save_model(self.tag, - model, - custom_objects={'tokenizer': tokenizer}, - labels=generate_labels(self)) + return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)) finally: torch.cuda.empty_cache() @@ -79,7 +72,7 @@ class MPT(openllm.LLM['transformers.PreTrainedModel', 'transformers.GPTNeoXToken self.device, device_map=device_map, trust_remote_code=trust_remote_code, - ) + ) model = transformers.AutoModelForCausalLM.from_pretrained(self._bentomodel.path, config=config, trust_remote_code=trust_remote_code, diff --git a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py index d20a6a3e..19239321 100644 --- a/openllm-python/src/openllm/models/opt/modeling_flax_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_flax_opt.py @@ -16,12 +16,11 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok __openllm_internal__ = True def import_model(self, 
*args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: - config, tokenizer = transformers.AutoConfig.from_pretrained( - self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) + config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained( + self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id return bentoml.transformers.save_model(self.tag, - transformers.FlaxAutoModelForCausalLM.from_pretrained( - self.model_id, **attrs), + transformers.FlaxAutoModelForCausalLM.from_pretrained(self.model_id, **attrs), custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)) @@ -45,6 +44,5 @@ class FlaxOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Tok def generate(self, prompt: str, **attrs: t.Any) -> list[str]: return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='np'), do_sample=True, - generation_config=self.config.model_construct_env( - **attrs).to_generation_config()).sequences, + generation_config=self.config.model_construct_env(**attrs).to_generation_config()).sequences, skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/opt/modeling_opt.py b/openllm-python/src/openllm/models/opt/modeling_opt.py index 80538b26..e555a03f 100644 --- a/openllm-python/src/openllm/models/opt/modeling_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_opt.py @@ -18,8 +18,7 @@ class OPT(openllm.LLM['transformers.OPTForCausalLM', 'transformers.GPT2Tokenizer def generate(self, prompt: str, **attrs: t.Any) -> list[str]: import torch with torch.inference_mode(): - return self.tokenizer.batch_decode( - self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config()), - skip_special_tokens=True) + return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config()), + skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py index 522d6c8e..d00fa7ea 100644 --- a/openllm-python/src/openllm/models/opt/modeling_tf_opt.py +++ b/openllm-python/src/openllm/models/opt/modeling_tf_opt.py @@ -11,18 +11,16 @@ class TFOPT(openllm.LLM['transformers.TFOPTForCausalLM', 'transformers.GPT2Token def import_model(self, *args: t.Any, trust_remote_code: bool = False, **attrs: t.Any) -> bentoml.Model: import transformers - config, tokenizer = transformers.AutoConfig.from_pretrained( - self.model_id), transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) + config, tokenizer = transformers.AutoConfig.from_pretrained(self.model_id), transformers.AutoTokenizer.from_pretrained( + self.model_id, **self.llm_parameters[-1]) tokenizer.pad_token_id = config.pad_token_id return bentoml.transformers.save_model(self.tag, - transformers.TFOPTForCausalLM.from_pretrained( - self.model_id, trust_remote_code=trust_remote_code, **attrs), + transformers.TFOPTForCausalLM.from_pretrained(self.model_id, trust_remote_code=trust_remote_code, **attrs), custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)) def generate(self, prompt: str, **attrs: t.Any) -> list[str]: - return 
self.tokenizer.batch_decode( - self.model.generate(**self.tokenizer(prompt, return_tensors='tf'), - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config()), - skip_special_tokens=True) + return self.tokenizer.batch_decode(self.model.generate(**self.tokenizer(prompt, return_tensors='tf'), + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config()), + skip_special_tokens=True) diff --git a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py index 0a8c9b82..1e0cdb02 100644 --- a/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py +++ b/openllm-python/src/openllm/models/stablelm/modeling_stablelm.py @@ -17,11 +17,10 @@ class StableLM(openllm.LLM['transformers.GPTNeoXForCausalLM', 'transformers.GPTN import torch with torch.inference_mode(): return [ - self.tokenizer.decode( - self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), - do_sample=True, - generation_config=self.config.model_construct_env(**attrs).to_generation_config(), - pad_token_id=self.tokenizer.eos_token_id, - stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0], - skip_special_tokens=True) + self.tokenizer.decode(self.model.generate(**self.tokenizer(prompt, return_tensors='pt').to(self.device), + do_sample=True, + generation_config=self.config.model_construct_env(**attrs).to_generation_config(), + pad_token_id=self.tokenizer.eos_token_id, + stopping_criteria=openllm.StoppingCriteriaList([openllm.StopOnTokens()]))[0], + skip_special_tokens=True) ] diff --git a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py index 67502aa4..89de3061 100644 --- a/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py +++ b/openllm-python/src/openllm/models/starcoder/modeling_starcoder.py @@ -28,19 +28,10 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers. import transformers torch_dtype, device_map = attrs.pop('torch_dtype', torch.float16), attrs.pop('device_map', 'auto') tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_id, **self.llm_parameters[-1]) - tokenizer.add_special_tokens({ - 'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], - 'pad_token': EOD - }) - model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, - torch_dtype=torch_dtype, - device_map=device_map, - **attrs) + tokenizer.add_special_tokens({'additional_special_tokens': [EOD, FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD], 'pad_token': EOD}) + model = transformers.AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch_dtype, device_map=device_map, **attrs) try: - return bentoml.transformers.save_model(self.tag, - model, - custom_objects={'tokenizer': tokenizer}, - labels=generate_labels(self)) + return bentoml.transformers.save_model(self.tag, model, custom_objects={'tokenizer': tokenizer}, labels=generate_labels(self)) finally: torch.cuda.empty_cache() @@ -49,26 +40,21 @@ class StarCoder(openllm.LLM['transformers.GPTBigCodeForCausalLM', 'transformers. 
with torch.inference_mode(): # eos_token_id=self.tokenizer.convert_tokens_to_ids("<|end|>"), # NOTE: this is for finetuning starcoder # NOTE: support fine-tuning starcoder - result_tensor = self.model.generate( - self.tokenizer.encode(prompt, return_tensors='pt').to(self.device), - do_sample=True, - pad_token_id=self.tokenizer.eos_token_id, - generation_config=self.config.model_construct_env(**attrs).to_generation_config()) + result_tensor = self.model.generate(self.tokenizer.encode(prompt, return_tensors='pt').to(self.device), + do_sample=True, + pad_token_id=self.tokenizer.eos_token_id, + generation_config=self.config.model_construct_env(**attrs).to_generation_config()) # TODO: We will probably want to return the tokenizer here so that we can manually process this # return (skip_special_tokens=False, clean_up_tokenization_spaces=False)) return self.tokenizer.batch_decode(result_tensor[0], skip_special_tokens=True, clean_up_tokenization_spaces=True) - def generate_one(self, prompt: str, stop: list[str], - **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: - max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer( - prompt, return_tensors='pt').to(self.device) - src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop( - 'stopping_criteria', openllm.StoppingCriteriaList([])) + def generate_one(self, prompt: str, stop: list[str], **preprocess_generate_kwds: t.Any) -> list[dict[t.Literal['generated_text'], str]]: + max_new_tokens, encoded_inputs = preprocess_generate_kwds.pop('max_new_tokens', 200), self.tokenizer(prompt, return_tensors='pt').to(self.device) + src_len, stopping_criteria = encoded_inputs['input_ids'].shape[1], preprocess_generate_kwds.pop('stopping_criteria', + openllm.StoppingCriteriaList([])) stopping_criteria.append(openllm.StopSequenceCriteria(stop, self.tokenizer)) result = self.tokenizer.decode( - self.model.generate(encoded_inputs['input_ids'], - max_new_tokens=max_new_tokens, - stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) + self.model.generate(encoded_inputs['input_ids'], max_new_tokens=max_new_tokens, stopping_criteria=stopping_criteria)[0].tolist()[src_len:]) # Inference API returns the stop sequence for stop_seq in stop: if result.endswith(stop_seq): result = result[:-len(stop_seq)] diff --git a/openllm-python/src/openllm/playground/falcon_tuned.py b/openllm-python/src/openllm/playground/falcon_tuned.py index 527aff0c..3776c3c1 100644 --- a/openllm-python/src/openllm/playground/falcon_tuned.py +++ b/openllm-python/src/openllm/playground/falcon_tuned.py @@ -61,16 +61,13 @@ model, tokenizer = openllm.AutoLLM.for_model("falcon", quantize="int4", bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, - ensure_available=True).prepare_for_training(adapter_type="lora", - lora_alpha=16, - lora_dropout=0.1, - r=16, - bias="none", - target_modules=[ - "query_key_value", "dense", - "dense_h_to_4h", - "dense_4h_to_h" - ]) + ensure_available=True).prepare_for_training( + adapter_type="lora", + lora_alpha=16, + lora_dropout=0.1, + r=16, + bias="none", + target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]) model.config.use_cache = False tokenizer.pad_token = tokenizer.eos_token @@ -81,9 +78,8 @@ trainer = SFTTrainer(model=model, dataset_text_field="text", max_seq_length=model_args.max_sequence_length, tokenizer=tokenizer, - args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), - 
**dataclasses.asdict(training_args)), - ) + args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)), + ) # upcast layernorm in float32 for more stable training for name, module in trainer.model.named_modules(): diff --git a/openllm-python/src/openllm/playground/llama2_qlora.py b/openllm-python/src/openllm/playground/llama2_qlora.py index 28c8339f..c9bb9630 100644 --- a/openllm-python/src/openllm/playground/llama2_qlora.py +++ b/openllm-python/src/openllm/playground/llama2_qlora.py @@ -78,10 +78,7 @@ def chunk(sample, chunk_length=2048): batch_chunk_length = (batch_total_length // chunk_length) * chunk_length # Split by chunks of max_len. - result = { - k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)] - for k, t in concatenated_examples.items() - } + result = {k: [t[i:i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)] for k, t in concatenated_examples.items()} # add remainder to global variable for next batch remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()} # prepare labels @@ -101,8 +98,7 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME): print("Sample from dolly-v2 ds:", dataset[randint(0, len(dataset))]["text"]) # tokenize and chunk dataset - lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), - batched=True, + lm_dataset = dataset.map(lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)).map(partial(chunk, chunk_length=2048), batched=True) # Print total number of samples @@ -113,7 +109,7 @@ def prepare_for_int4_training(model_id: str, model_version: str | None = None, gradient_checkpointing: bool = True, bf16: bool = True, - ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]: + ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]: from peft.tuners.lora import LoraLayer llm = openllm.AutoLLM.for_model("llama", @@ -124,16 +120,14 @@ def prepare_for_int4_training(model_id: str, bnb_4bit_compute_dtype=torch.bfloat16, use_cache=not gradient_checkpointing, device_map="auto", - ) + ) print("Model summary:", llm.model) # get lora target modules modules = find_all_linear_names(llm.model) print(f"Found {len(modules)} modules to quantize: {modules}") - model, tokenizer = llm.prepare_for_training(adapter_type="lora", - use_gradient_checkpointing=gradient_checkpointing, - target_modules=modules) + model, tokenizer = llm.prepare_for_training(adapter_type="lora", use_gradient_checkpointing=gradient_checkpointing, target_modules=modules) # pre-process the model by upcasting the layer norms in float 32 for for name, module in model.named_modules(): @@ -189,7 +183,7 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments): model, tokenizer = prepare_for_int4_training(model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16, - ) + ) datasets = prepare_datasets(tokenizer) trainer = transformers.Trainer(model=model, @@ -197,7 +191,7 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments): **dataclasses.asdict(training_args)), train_dataset=datasets, data_collator=transformers.default_data_collator, - ) + ) trainer.train() @@ -212,14 +206,10 @@ def train_loop(model_args: ModelArguments, training_args: TrainingArguments): del model, trainer torch.cuda.empty_cache() - model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir, - 
low_cpu_mem_usage=True, - torch_dtype=torch.float16) + model = peft.AutoPeftModelForCausalLM.from_pretrained(training_args.output_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16) # merge lora with base weights and save model = model.merge_and_unload() - model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), - safe_serialization=True, - max_shard_size="2GB") + model.save_pretrained(os.path.join(os.getcwd(), "outputs", "merged_llama_lora"), safe_serialization=True, max_shard_size="2GB") else: trainer.model.save_pretrained(os.path.join(training_args.output_dir, "lora")) diff --git a/openllm-python/src/openllm/playground/opt_tuned.py b/openllm-python/src/openllm/playground/opt_tuned.py index 322ddd0a..5b21e600 100644 --- a/openllm-python/src/openllm/playground/opt_tuned.py +++ b/openllm-python/src/openllm/playground/opt_tuned.py @@ -26,14 +26,12 @@ if t.TYPE_CHECKING: DEFAULT_MODEL_ID = "facebook/opt-6.7b" -def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, - training_args: TrainingArguments): +def load_trainer(model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments): return transformers.Trainer(model=model, train_dataset=dataset_dict["train"], - args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), - **dataclasses.asdict(training_args)), + args=dataclasses.replace(transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)), data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False), - ) + ) @dataclasses.dataclass class TrainingArguments: @@ -58,16 +56,13 @@ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): else: model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses()) -model, tokenizer = openllm.AutoLLM.for_model("opt", - model_id=model_args.model_id, - quantize="int8", - ensure_available=True).prepare_for_training( - adapter_type="lora", - r=16, - lora_alpha=32, - target_modules=["q_proj", "v_proj"], - lora_dropout=0.05, - bias="none") +model, tokenizer = openllm.AutoLLM.for_model("opt", model_id=model_args.model_id, quantize="int8", + ensure_available=True).prepare_for_training(adapter_type="lora", + r=16, + lora_alpha=32, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.05, + bias="none") # ft on english_quotes data = load_dataset("Abirate/english_quotes") diff --git a/openllm-python/src/openllm/serialisation/__init__.py b/openllm-python/src/openllm/serialisation/__init__.py index 1714d65b..e336bca8 100644 --- a/openllm-python/src/openllm/serialisation/__init__.py +++ b/openllm-python/src/openllm/serialisation/__init__.py @@ -59,14 +59,12 @@ def load_tokenizer(llm: openllm.LLM[t.Any, T], **tokenizer_attrs: t.Any) -> T: return tokenizer class _Caller(t.Protocol[P]): - def __call__(self, llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: ... _extras = ['get', 'import_model', 'load_model'] def _make_dispatch_function(fn: str) -> _Caller[P]: - def caller(llm: openllm.LLM[M, T], *args: P.args, **kwargs: P.kwargs) -> t.Any: """Generic function dispatch to correct serialisation submodules based on LLM runtime. 
diff --git a/openllm-python/src/openllm/serialisation/constants.py b/openllm-python/src/openllm/serialisation/constants.py
index f6b8265c..f90116ec 100644
--- a/openllm-python/src/openllm/serialisation/constants.py
+++ b/openllm-python/src/openllm/serialisation/constants.py
@@ -7,6 +7,5 @@ FRAMEWORK_TO_AUTOCLASS_MAPPING = {
 'vllm': ('AutoModelForCausalLM', 'AutoModelForSeq2SeqLM')
}
HUB_ATTRS = [
- 'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision',
- 'subfolder', 'use_auth_token'
+ 'cache_dir', 'code_revision', 'force_download', 'local_files_only', 'proxies', 'resume_download', 'revision', 'subfolder', 'use_auth_token'
]
diff --git a/openllm-python/src/openllm/serialisation/ggml.py b/openllm-python/src/openllm/serialisation/ggml.py
index fd4397cc..babb0c23 100644
--- a/openllm-python/src/openllm/serialisation/ggml.py
+++ b/openllm-python/src/openllm/serialisation/ggml.py
@@ -13,11 +13,7 @@ if t.TYPE_CHECKING:
_conversion_strategy = {'pt': 'ggml'}
-def import_model(llm: openllm.LLM[t.Any, t.Any],
- *decls: t.Any,
- trust_remote_code: bool = True,
- **attrs: t.Any,
- ) -> bentoml.Model:
+def import_model(llm: openllm.LLM[t.Any, t.Any], *decls: t.Any, trust_remote_code: bool = True, **attrs: t.Any,) -> bentoml.Model:
 raise NotImplementedError('Currently work in progress.')
def get(llm: openllm.LLM[t.Any, t.Any], auto_import: bool = False) -> bentoml.Model:
diff --git a/openllm-python/src/openllm/serialisation/transformers/__init__.py b/openllm-python/src/openllm/serialisation/transformers/__init__.py
index c75e3636..40062be0 100644
--- a/openllm-python/src/openllm/serialisation/transformers/__init__.py
+++ b/openllm-python/src/openllm/serialisation/transformers/__init__.py
@@ -68,24 +68,18 @@ def import_model(llm: openllm.LLM[M, T],
 config, hub_attrs, attrs = process_config(llm.model_id, trust_remote_code, **attrs)
 _, tokenizer_attrs = llm.llm_parameters
 quantize_method = llm._quantize_method
- safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'),
- default=llm._serialisation_format == 'safetensors')
+ safe_serialisation = openllm.utils.first_not_none(attrs.get('safe_serialization'), default=llm._serialisation_format == 'safetensors')
 # Disable safe serialization with vLLM
 if llm.__llm_backend__ == 'vllm': safe_serialisation = False
- metadata: DictStrAny = {
- 'safe_serialisation': safe_serialisation,
- '_quantize': quantize_method is not None and quantize_method
- }
+ metadata: DictStrAny = {'safe_serialisation': safe_serialisation, '_quantize': quantize_method is not None and quantize_method}
 signatures: DictStrAny = {}
 if quantize_method == 'gptq':
 if not openllm.utils.is_autogptq_available():
 raise openllm.exceptions.OpenLLMException(
- "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'"
- )
+ "GPTQ quantisation requires 'auto-gptq' (Not found in local environment).
Install it with 'pip install \"openllm[gptq]\"'") if llm.config['model_type'] != 'causal_lm': - raise openllm.exceptions.OpenLLMException( - f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") + raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") signatures['generate'] = {'batchable': False} else: # this model might be called with --quantize int4, therefore we need to pop this out @@ -95,10 +89,7 @@ def import_model(llm: openllm.LLM[M, T], if llm.__llm_backend__ != 'flax': attrs['use_safetensors'] = safe_serialisation metadata['_framework'] = 'pt' if llm.__llm_backend__ == 'vllm' else llm.__llm_backend__ - tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, - trust_remote_code=trust_remote_code, - **hub_attrs, - **tokenizer_attrs) + tokenizer = infer_tokenizers_from_llm(llm).from_pretrained(llm.model_id, trust_remote_code=trust_remote_code, **hub_attrs, **tokenizer_attrs) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token external_modules: list[types.ModuleType] = [importlib.import_module(tokenizer.__module__)] @@ -117,25 +108,18 @@ def import_model(llm: openllm.LLM[M, T], if quantize_method == 'gptq': if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException( - "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'" - ) + "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'") if llm.config['model_type'] != 'causal_lm': - raise openllm.exceptions.OpenLLMException( - f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") + raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") logger.debug('Saving model with GPTQ quantisation will require loading model into memory.') model = autogptq.AutoGPTQForCausalLM.from_quantized(llm.model_id, *decls, - quantize_config=t.cast('autogptq.BaseQuantizeConfig', - llm.quantization_config), + quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config), trust_remote_code=trust_remote_code, use_safetensors=safe_serialisation, **hub_attrs, **attrs) - update_model(bentomodel, - metadata={ - '_pretrained_class': model.__class__.__name__, - '_framework': model.model.framework - }) + update_model(bentomodel, metadata={'_pretrained_class': model.__class__.__name__, '_framework': model.model.framework}) model.save_quantized(bentomodel.path, use_safetensors=safe_serialisation) else: architectures = getattr(config, 'architectures', []) @@ -159,18 +143,14 @@ def import_model(llm: openllm.LLM[M, T], model.save_pretrained(bentomodel.path, max_shard_size='5GB', safe_serialization=safe_serialisation) else: # we will clone the all tings into the bentomodel path without loading model into memory - snapshot_download(llm.model_id, - local_dir=bentomodel.path, - local_dir_use_symlinks=False, - ignore_patterns=HfIgnore.ignore_patterns(llm)) + snapshot_download(llm.model_id, local_dir=bentomodel.path, local_dir_use_symlinks=False, ignore_patterns=HfIgnore.ignore_patterns(llm)) except Exception: raise else: bentomodel.flush() # type: ignore[no-untyped-call] bentomodel.save(_model_store) openllm.utils.analytics.track( - openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, - model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 
1024)) + openllm.utils.analytics.ModelSaveEvent(module=bentomodel.info.module, model_size_in_kb=openllm.utils.calc_dir_size(bentomodel.path) / 1024)) finally: bentomodel.exit_cloudpickle_context(imported_modules) # NOTE: We need to free up the cache after importing the model @@ -189,36 +169,29 @@ def get(llm: openllm.LLM[M, T], auto_import: bool = False) -> bentoml.Model: try: model = bentoml.models.get(llm.tag) if Version(model.info.api_version) < Version('v2'): - raise openllm.exceptions.OpenLLMException( - 'Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.') + raise openllm.exceptions.OpenLLMException('Please run "openllm prune -y --include-bentos" and upgrade all saved model to latest release.') if model.info.labels['backend'] != llm.__llm_backend__: raise openllm.exceptions.OpenLLMException( - f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}." - ) + f"Model {model.tag} was saved with backend {model.info.labels['backend']}, while loading with {llm.__llm_backend__}.") return model except Exception as err: if auto_import: return import_model(llm, trust_remote_code=llm.trust_remote_code) - raise openllm.exceptions.OpenLLMException( - f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err + raise openllm.exceptions.OpenLLMException(f'Failed while getting stored artefact (lookup for traceback):\n{err}') from err def load_model(llm: openllm.LLM[M, T], *decls: t.Any, **attrs: t.Any) -> M: config, hub_attrs, attrs = process_config(llm.model_id, llm.trust_remote_code, **attrs) - safe_serialization = openllm.utils.first_not_none(t.cast( - t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)), + safe_serialization = openllm.utils.first_not_none(t.cast(t.Optional[bool], llm._bentomodel.info.metadata.get('safe_serialisation', None)), attrs.pop('safe_serialization', None), default=llm._serialisation_format == 'safetensors') if '_quantize' in llm._bentomodel.info.metadata and llm._bentomodel.info.metadata['_quantize'] == 'gptq': if not openllm.utils.is_autogptq_available(): raise openllm.exceptions.OpenLLMException( - "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). Install it with 'pip install \"openllm[gptq]\"'" - ) + "GPTQ quantisation requires 'auto-gptq' (Not found in local environment). 
Install it with 'pip install \"openllm[gptq]\"'") if llm.config['model_type'] != 'causal_lm': - raise openllm.exceptions.OpenLLMException( - f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") + raise openllm.exceptions.OpenLLMException(f"GPTQ only support Causal LM (got {llm.__class__} of {llm.config['model_type']})") return autogptq.AutoGPTQForCausalLM.from_quantized(llm._bentomodel.path, *decls, - quantize_config=t.cast('autogptq.BaseQuantizeConfig', - llm.quantization_config), + quantize_config=t.cast('autogptq.BaseQuantizeConfig', llm.quantization_config), trust_remote_code=llm.trust_remote_code, use_safetensors=safe_serialization, **hub_attrs, diff --git a/openllm-python/src/openllm/serialisation/transformers/_helpers.py b/openllm-python/src/openllm/serialisation/transformers/_helpers.py index b325fd85..469b9bb2 100644 --- a/openllm-python/src/openllm/serialisation/transformers/_helpers.py +++ b/openllm-python/src/openllm/serialisation/transformers/_helpers.py @@ -24,13 +24,11 @@ if t.TYPE_CHECKING: from openllm_core._typing_compat import T else: transformers, torch = openllm_core.utils.LazyLoader('transformers', globals(), - 'transformers'), openllm_core.utils.LazyLoader( - 'torch', globals(), 'torch') + 'transformers'), openllm_core.utils.LazyLoader('torch', globals(), 'torch') _object_setattr = object.__setattr__ -def process_config(model_id: str, trust_remote_code: bool, - **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]: +def process_config(model_id: str, trust_remote_code: bool, **attrs: t.Any) -> tuple[transformers.PretrainedConfig, DictStrAny, DictStrAny]: '''A helper function that correctly parse config and attributes for transformers.PretrainedConfig. Args: @@ -55,8 +53,7 @@ def process_config(model_id: str, trust_remote_code: bool, return config, hub_attrs, attrs def infer_tokenizers_from_llm(__llm: openllm.LLM[t.Any, T], /) -> T: - __cls = getattr(transformers, - openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None) + __cls = getattr(transformers, openllm_core.utils.first_not_none(__llm.config['tokenizer_class'], default='AutoTokenizer'), None) if __cls is None: raise ValueError(f'Cannot infer correct tokenizer class for {__llm}. Make sure to unset `tokenizer_class`') return __cls @@ -105,13 +102,11 @@ def make_model_signatures(llm: openllm.LLM[M, T]) -> ModelSignaturesType: infer_fn: tuple[str, ...] 
= ('__call__',) default_config = ModelSignature(batchable=False) if llm.__llm_backend__ in {'pt', 'vllm'}: - infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', - 'group_beam_search', 'constrained_beam_search', - ) + infer_fn += ('forward', 'generate', 'contrastive_search', 'greedy_search', 'sample', 'beam_search', 'beam_sample', 'group_beam_search', + 'constrained_beam_search', + ) elif llm.__llm_backend__ == 'tf': - infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', - 'contrastive_search', - ) + infer_fn += ('predict', 'call', 'generate', 'compute_transition_scores', 'greedy_search', 'sample', 'beam_search', 'contrastive_search',) else: infer_fn += ('generate',) return {k: default_config for k in infer_fn} diff --git a/openllm-python/src/openllm/testing.py b/openllm-python/src/openllm/testing.py index 5736d1da..36f55972 100644 --- a/openllm-python/src/openllm/testing.py +++ b/openllm-python/src/openllm/testing.py @@ -27,10 +27,7 @@ def build_bento(model: str, bentoml.bentos.delete(bento.tag) @contextlib.contextmanager -def build_container(bento: bentoml.Bento | str | bentoml.Tag, - image_tag: str | None = None, - cleanup: bool = False, - **attrs: t.Any) -> t.Iterator[str]: +def build_container(bento: bentoml.Bento | str | bentoml.Tag, image_tag: str | None = None, cleanup: bool = False, **attrs: t.Any) -> t.Iterator[str]: if isinstance(bento, bentoml.Bento): bento_tag = bento.tag else: bento_tag = bentoml.Tag.from_taglike(bento) if image_tag is None: image_tag = str(bento_tag) diff --git a/openllm-python/src/openllm/utils/__init__.py b/openllm-python/src/openllm/utils/__init__.py index 4033d3fb..75cf83e8 100644 --- a/openllm-python/src/openllm/utils/__init__.py +++ b/openllm-python/src/openllm/utils/__init__.py @@ -27,8 +27,7 @@ def generate_labels(llm: openllm.LLM[t.Any, t.Any]) -> dict[str, t.Any]: 'serialisation_format': llm._serialisation_format } -def infer_auto_class( - backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: +def infer_auto_class(backend: LiteralBackend) -> type[openllm.AutoLLM | openllm.AutoTFLLM | openllm.AutoFlaxLLM | openllm.AutoVLLM]: import openllm if backend == 'tf': return openllm.AutoTFLLM elif backend == 'flax': return openllm.AutoFlaxLLM @@ -36,10 +35,7 @@ def infer_auto_class( elif backend == 'vllm': return openllm.AutoVLLM else: raise RuntimeError(f"Unknown backend: {backend} (supported: 'pt', 'flax', 'tf', 'vllm')") -__all__ = [ - 'generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', - 'dummy_vllm_objects' -] +__all__ = ['generate_labels', 'infer_auto_class', 'dummy_flax_objects', 'dummy_pt_objects', 'dummy_tf_objects', 'dummy_vllm_objects'] def __dir__() -> t.Sequence[str]: return sorted(__all__) diff --git a/openllm-python/tests/_strategies/_configuration.py b/openllm-python/tests/_strategies/_configuration.py index 813df70d..5ad3a60c 100644 --- a/openllm-python/tests/_strategies/_configuration.py +++ b/openllm-python/tests/_strategies/_configuration.py @@ -16,39 +16,26 @@ env_strats = st.sampled_from([openllm.utils.EnvVarMixin(model_name) for model_na def model_settings(draw: st.DrawFn): '''Strategy for generating ModelSettings objects.''' kwargs: dict[str, t.Any] = { - 'default_id': - st.text(min_size=1), - 'model_ids': - st.lists(st.text(), min_size=1), - 'architecture': - st.text(min_size=1), - 'url': - st.text(), - 
'requires_gpu': - st.booleans(), - 'trust_remote_code': - st.booleans(), - 'requirements': - st.none() | st.lists(st.text(), min_size=1), - 'default_backend': - st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])), - 'model_type': - st.sampled_from(['causal_lm', 'seq2seq_lm']), - 'name_type': - st.sampled_from(['dasherize', 'lowercase']), - 'timeout': - st.integers(min_value=3600), - 'workers_per_resource': - st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)), + 'default_id': st.text(min_size=1), + 'model_ids': st.lists(st.text(), min_size=1), + 'architecture': st.text(min_size=1), + 'url': st.text(), + 'requires_gpu': st.booleans(), + 'trust_remote_code': st.booleans(), + 'requirements': st.none() | st.lists(st.text(), min_size=1), + 'default_backend': st.dictionaries(st.sampled_from(['cpu', 'nvidia.com/gpu']), st.sampled_from(['vllm', 'pt', 'tf', 'flax'])), + 'model_type': st.sampled_from(['causal_lm', 'seq2seq_lm']), + 'name_type': st.sampled_from(['dasherize', 'lowercase']), + 'timeout': st.integers(min_value=3600), + 'workers_per_resource': st.one_of(st.integers(min_value=1), st.floats(min_value=0.1, max_value=1.0)), } return draw(st.builds(ModelSettings, **kwargs)) -def make_llm_config( - cls_name: str, - dunder_config: dict[str, t.Any] | ModelSettings, - fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None, - generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] | None = None, -) -> type[openllm.LLMConfig]: +def make_llm_config(cls_name: str, + dunder_config: dict[str, t.Any] | ModelSettings, + fields: tuple[tuple[t.LiteralString, str, t.Any], ...] | None = None, + generation_fields: tuple[tuple[t.LiteralString, t.Any], ...] | None = None, + ) -> type[openllm.LLMConfig]: globs: dict[str, t.Any] = {'openllm': openllm} _config_args: list[str] = [] lines: list[str] = [f'class {cls_name}Config(openllm.LLMConfig):'] diff --git a/openllm-python/tests/configuration_test.py b/openllm-python/tests/configuration_test.py index 147ebc66..43dda04a 100644 --- a/openllm-python/tests/configuration_test.py +++ b/openllm-python/tests/configuration_test.py @@ -24,21 +24,19 @@ from ._strategies._configuration import make_llm_config from ._strategies._configuration import model_settings # XXX: @aarnphm fixes TypedDict behaviour in 3.11 -@pytest.mark.skipif(sys.version_info[:2] == (3, 11), - reason='TypedDict in 3.11 behaves differently, so we need to fix this') +@pytest.mark.skipif(sys.version_info[:2] == (3, 11), reason='TypedDict in 3.11 behaves differently, so we need to fix this') def test_missing_default(): with pytest.raises(ValueError, match='Missing required fields *'): make_llm_config('MissingDefaultId', {'name_type': 'lowercase', 'requirements': ['bentoml']}) with pytest.raises(ValueError, match='Missing required fields *'): make_llm_config('MissingModelId', {'default_id': 'huggingface/t5-tiny-testing', 'requirements': ['bentoml']}) with pytest.raises(ValueError, match='Missing required fields *'): - make_llm_config( - 'MissingArchitecture', { - 'default_id': 'huggingface/t5-tiny-testing', - 'model_ids': ['huggingface/t5-tiny-testing'], - 'requirements': ['bentoml'], - }, - ) + make_llm_config('MissingArchitecture', { + 'default_id': 'huggingface/t5-tiny-testing', + 'model_ids': ['huggingface/t5-tiny-testing'], + 'requirements': ['bentoml'], + }, + ) def test_forbidden_access(): cl_ = make_llm_config( @@ -79,16 +77,11 @@ def test_config_derived_follow_attrs_protocol(gen_settings: 
ModelSettings): cl_ = make_llm_config('AttrsProtocolLLM', gen_settings) assert attr.has(cl_) -@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), - st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), - ) -def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, - input_temperature: float): - cl_ = make_llm_config('ComplexLLM', - gen_settings, - fields=(('field1', 'float', field1),), - generation_fields=(('temperature', temperature),), - ) +@given(model_settings(), st.integers(max_value=283473), st.floats(min_value=0.0, max_value=1.0), st.integers(max_value=283473), + st.floats(min_value=0.0, max_value=1.0), + ) +def test_complex_struct_dump(gen_settings: ModelSettings, field1: int, temperature: float, input_field1: int, input_temperature: float): + cl_ = make_llm_config('ComplexLLM', gen_settings, fields=(('field1', 'float', field1),), generation_fields=(('temperature', temperature),),) sent = cl_() assert sent.model_dump()['field1'] == field1 assert sent.model_dump()['generation_config']['temperature'] == temperature @@ -129,7 +122,6 @@ def test_struct_envvar(): assert overwrite_default['temperature'] == 0.2 def test_struct_provided_fields(): - class EnvLLM(openllm.LLMConfig): __config__ = {'default_id': 'asdfasdf', 'model_ids': ['asdf', 'asdfasdfads'], 'architecture': 'PreTrainedModel',} field1: int = 2 @@ -151,7 +143,7 @@ def test_struct_envvar_with_overwrite_provided_env(monkeypatch: pytest.MonkeyPat 'architecture': 'PreTrainedModel' }, fields=(('field1', 'float', 3.0),), - ).model_construct_env(field1=20.0, temperature=0.4) + ).model_construct_env(field1=20.0, temperature=0.4) assert sent.generation_config.temperature == 0.4 assert sent.field1 == 20.0 diff --git a/openllm-python/tests/conftest.py b/openllm-python/tests/conftest.py index 959b6e11..cd207c91 100644 --- a/openllm-python/tests/conftest.py +++ b/openllm-python/tests/conftest.py @@ -10,35 +10,22 @@ import openllm if t.TYPE_CHECKING: from openllm_core._typing_compat import LiteralBackend -_MODELING_MAPPING = { - 'flan_t5': 'google/flan-t5-small', - 'opt': 'facebook/opt-125m', - 'baichuan': 'baichuan-inc/Baichuan-7B', -} -_PROMPT_MAPPING = { - 'qa': - 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?', -} +_MODELING_MAPPING = {'flan_t5': 'google/flan-t5-small', 'opt': 'facebook/opt-125m', 'baichuan': 'baichuan-inc/Baichuan-7B',} +_PROMPT_MAPPING = {'qa': 'Answer the following yes/no question by reasoning step-by-step. Can you write a whole Haiku in a single tweet?',} -def parametrise_local_llm( - model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]: +def parametrise_local_llm(model: str,) -> t.Generator[tuple[str, openllm.LLMRunner[t.Any, t.Any] | openllm.LLM[t.Any, t.Any]], None, None]: if model not in _MODELING_MAPPING: pytest.skip(f"'{model}' is not yet supported in framework testing.") backends: tuple[LiteralBackend, ...] 
= tuple() if model in openllm.MODEL_MAPPING_NAMES: backends += ('pt',) if model in openllm.MODEL_FLAX_MAPPING_NAMES: backends += ('flax',) if model in openllm.MODEL_TF_MAPPING_NAMES: backends += ('tf',) for backend, prompt in itertools.product(backends, _PROMPT_MAPPING.keys()): - yield prompt, openllm.Runner(model, - model_id=_MODELING_MAPPING[model], - ensure_available=True, - backend=backend, - init_local=True) + yield prompt, openllm.Runner(model, model_id=_MODELING_MAPPING[model], ensure_available=True, backend=backend, init_local=True) def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: if os.getenv('GITHUB_ACTIONS') is None: if 'prompt' in metafunc.fixturenames and 'llm' in metafunc.fixturenames: - metafunc.parametrize('prompt,llm', - [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])]) + metafunc.parametrize('prompt,llm', [(p, llm) for p, llm in parametrise_local_llm(metafunc.function.__name__[5:-15])]) def pytest_sessionfinish(session: pytest.Session, exitstatus: int): # If no tests are collected, pytest exists with code 5, which makes the CI fail. diff --git a/openllm-python/tests/models/conftest.py b/openllm-python/tests/models/conftest.py index fb201a40..9d53a778 100644 --- a/openllm-python/tests/models/conftest.py +++ b/openllm-python/tests/models/conftest.py @@ -40,13 +40,7 @@ if t.TYPE_CHECKING: from openllm.client import BaseAsyncClient class ResponseComparator(JSONSnapshotExtension): - - def serialize(self, - data: SerializableData, - *, - exclude: PropertyFilter | None = None, - matcher: PropertyMatcher | None = None, - ) -> SerializedData: + def serialize(self, data: SerializableData, *, exclude: PropertyFilter | None = None, matcher: PropertyMatcher | None = None,) -> SerializedData: if openllm.utils.LazyType(ListAny).isinstance(data): data = [d.unmarshaled for d in data] else: @@ -55,7 +49,6 @@ class ResponseComparator(JSONSnapshotExtension): return orjson.dumps(data, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS).decode() def matches(self, *, serialized_data: SerializableData, snapshot_data: SerializableData) -> bool: - def convert_data(data: SerializableData) -> openllm.GenerationOutput | t.Sequence[openllm.GenerationOutput]: try: data = orjson.loads(data) @@ -83,8 +76,7 @@ class ResponseComparator(JSONSnapshotExtension): return (len(s.responses) == len(t.responses) and all([_s == _t for _s, _t in zip(s.responses, t.responses)]) and eq_config(s.marshaled_config, t.marshaled_config)) - return len(serialized_data) == len(snapshot_data) and all( - [eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)]) + return len(serialized_data) == len(snapshot_data) and all([eq_output(s, t) for s, t in zip(serialized_data, snapshot_data)]) @pytest.fixture() def response_snapshot(snapshot: SnapshotAssertion): @@ -133,14 +125,8 @@ class LocalHandle(_Handle): return self.process.poll() is None class HandleProtocol(t.Protocol): - @contextlib.contextmanager - def __call__(*, - model: str, - model_id: str, - image_tag: str, - quantize: t.AnyStr | None = None, - ) -> t.Generator[_Handle, None, None]: + def __call__(*, model: str, model_id: str, image_tag: str, quantize: t.AnyStr | None = None,) -> t.Generator[_Handle, None, None]: ... 
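# --- Editor's note (illustrative sketch, not part of the patch) -------------
# The conftest hunk above relies on pytest's `pytest_generate_tests` hook to
# parametrise tests at collection time. A minimal, self-contained example of
# the same pattern follows; the backend mapping and the test name are
# hypothetical stand-ins for illustration, not values taken from this diff.
import pytest

_ASSUMED_BACKENDS = {'opt': ['pt'], 'flan_t5': ['pt', 'flax']}  # assumed availability mapping

def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
  # Called once per collected test: if the test requests a 'backend' fixture,
  # emit one test instance per available backend.
  if 'backend' in metafunc.fixturenames:
    metafunc.parametrize('backend', _ASSUMED_BACKENDS.get('opt', ['pt']))

def test_opt_generation(backend: str) -> None:
  assert backend in {'pt', 'tf', 'flax', 'vllm'}
# ---------------------------------------------------------------------------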
@attr.define(init=False) @@ -148,9 +134,7 @@ class DockerHandle(_Handle): container_name: str docker_client: docker.DockerClient - def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int, - deployment_mode: t.Literal['container', 'local'], - ): + def __init__(self, docker_client: docker.DockerClient, container_name: str, port: int, deployment_mode: t.Literal['container', 'local'],): self.__attrs_init__(port, deployment_mode, container_name, docker_client) def status(self) -> bool: @@ -165,22 +149,14 @@ def _local_handle(model: str, quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False, - ): + ): with openllm.utils.reserve_free_port() as port: pass if not _serve_grpc: - proc = openllm.start(model, - model_id=model_id, - quantize=quantize, - additional_args=['--port', str(port)], - __test__=True) + proc = openllm.start(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True) else: - proc = openllm.start_grpc(model, - model_id=model_id, - quantize=quantize, - additional_args=['--port', str(port)], - __test__=True) + proc = openllm.start_grpc(model, model_id=model_id, quantize=quantize, additional_args=['--port', str(port)], __test__=True) yield LocalHandle(proc, port, deployment_mode) proc.terminate() @@ -201,7 +177,7 @@ def _container_handle(model: str, quantize: t.Literal['int8', 'int4', 'gptq'] | None = None, *, _serve_grpc: bool = False, - ): + ): envvar = openllm.utils.EnvVarMixin(model) with openllm.utils.reserve_free_port() as port, openllm.utils.reserve_free_port() as prom_port: @@ -237,7 +213,7 @@ def _container_handle(model: str, '3000/tcp': port, '3001/tcp': prom_port }, - ) + ) yield DockerHandle(client, container.name, port, deployment_mode) diff --git a/openllm-python/tests/models/flan_t5_test.py b/openllm-python/tests/models/flan_t5_test.py index 1fc625d9..2b9d75fd 100644 --- a/openllm-python/tests/models/flan_t5_test.py +++ b/openllm-python/tests/models/flan_t5_test.py @@ -16,11 +16,8 @@ model = 'flan_t5' model_id = 'google/flan-t5-small' @pytest.fixture(scope='module') -def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], - clean_context: contextlib.ExitStack, - ): - with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, - clean_context=clean_context) as image_tag: +def flan_t5_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,): + with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, clean_context=clean_context) as image_tag: with handler(model=model, model_id=model_id, image_tag=image_tag) as handle: yield handle diff --git a/openllm-python/tests/models/opt_test.py b/openllm-python/tests/models/opt_test.py index 4d12711d..ba30f4c7 100644 --- a/openllm-python/tests/models/opt_test.py +++ b/openllm-python/tests/models/opt_test.py @@ -16,11 +16,8 @@ model = 'opt' model_id = 'facebook/opt-125m' @pytest.fixture(scope='module') -def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], - clean_context: contextlib.ExitStack, - ): - with openllm.testing.prepare(model, model_id=model_id, deployment_mode=deployment_mode, - clean_context=clean_context) as image_tag: +def opt_125m_handle(handler: HandleProtocol, deployment_mode: t.Literal['container', 'local'], clean_context: contextlib.ExitStack,): + with openllm.testing.prepare(model, model_id=model_id, 
deployment_mode=deployment_mode, clean_context=clean_context) as image_tag: with handler(model=model, model_id=model_id, image_tag=image_tag) as handle: yield handle diff --git a/openllm-python/tests/package_test.py b/openllm-python/tests/package_test.py index 17c121e6..9fdf7eea 100644 --- a/openllm-python/tests/package_test.py +++ b/openllm-python/tests/package_test.py @@ -15,11 +15,10 @@ if t.TYPE_CHECKING: HF_INTERNAL_T5_TESTING = 'hf-internal-testing/tiny-random-t5' -actions_xfail = functools.partial( - pytest.mark.xfail, - condition=os.getenv('GITHUB_ACTIONS') is not None, - reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.', -) +actions_xfail = functools.partial(pytest.mark.xfail, + condition=os.getenv('GITHUB_ACTIONS') is not None, + reason='Marking GitHub Actions to xfail due to flakiness and building environment not isolated.', + ) @actions_xfail def test_general_build_with_internal_testing(): @@ -51,8 +50,7 @@ def test_general_build_from_local(tmp_path_factory: pytest.TempPathFactory): def dockerfile_template(tmp_path_factory: pytest.TempPathFactory): file = tmp_path_factory.mktemp('dockerfiles') / 'Dockerfile.template' file.write_text( - "{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}" - ) + "{% extends bento_base_template %}\n{% block SETUP_BENTO_ENTRYPOINT %}\n{{ super() }}\nRUN echo 'sanity from custom dockerfile'\n{% endblock %}") return file @pytest.mark.usefixtures('dockerfile_template') diff --git a/openllm-python/tests/strategies_test.py b/openllm-python/tests/strategies_test.py index 9236dacd..97d72038 100644 --- a/openllm-python/tests/strategies_test.py +++ b/openllm-python/tests/strategies_test.py @@ -71,11 +71,9 @@ def test_nvidia_gpu_validate(monkeypatch: pytest.MonkeyPatch): mcls.setenv('CUDA_VISIBLE_DEVICES', '') assert len(NvidiaGpuResource.from_system()) >= 0 # TODO: real from_system tests - assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1], - ).match('Input list should be all string type.') + assert pytest.raises(ValueError, NvidiaGpuResource.validate, [*NvidiaGpuResource.from_system(), 1],).match('Input list should be all string type.') assert pytest.raises(ValueError, NvidiaGpuResource.validate, [-2]).match('Input list should be all string type.') - assert pytest.raises(ValueError, NvidiaGpuResource.validate, - ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID') + assert pytest.raises(ValueError, NvidiaGpuResource.validate, ['GPU-5ebe9f43', 'GPU-ac33420d4628']).match('Failed to parse available GPUs UUID') def test_nvidia_gpu_from_spec(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as mcls: diff --git a/pyproject.toml b/pyproject.toml index fe922a8b..9ee7e1a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -261,51 +261,28 @@ ignore_patterns = [ based_on_style = "google" INDENT_WIDTH = 2 JOIN_MULTIPLE_LINES = true -COLUMN_LIMIT = 120 +COLUMN_LIMIT = 152 USE_TABS = false BLANK_LINES_AROUND_TOP_LEVEL_DEFINITION = 1 BLANK_LINES_BETWEEN_TOP_LEVEL_IMPORTS_AND_VARIABLES = 1 DISABLE_ENDING_COMMA_HEURISTIC = true -# ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true -# ALLOW_MULTILINE_DICTIONARY_KEYS = false -# ALLOW_MULTILINE_LAMBDAS = false -# ALLOW_SPLIT_BEFORE_DEFAULT_OR_NAMED_ASSIGNS = false -# ALLOW_SPLIT_BEFORE_DICT_VALUE = false -# ARITHMETIC_PRECEDENCE_INDICATION = true -# BLANK_LINE_BEFORE_CLASS_DOCSTRING = false -# 
BLANK_LINE_BEFORE_MODULE_DOCSTRING = false -# BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false -# DEDENT_CLOSING_BRACKETS = true -# INDENT_CLOSING_BRACKETS = false -# COALESCE_BRACKETS = true -# EACH_DICT_ENTRY_ON_SEPARATE_LINE = true -# CONTINUATION_ALIGN_STYLE = "SPACE" -# INDENT_BLANK_LINES = false -# NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS = true -# SPACES_AROUND_SUBSCRIPT_COLON = false -# SPACES_AROUND_DICT_DELIMITERS = false -# SPACES_AROUND_LIST_DELIMITERS = false -# SPACES_AROUND_POWER_OPERATOR = false -# SPACES_AROUND_TUPLE_DELIMITERS = false -# SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false -# SPACE_INSIDE_BRACKETS = false -# SPLIT_ALL_COMMA_SEPARATED_VALUES = false -# SPLIT_ALL_TOP_LEVEL_COMMA_SEPARATED_VALUES = true -# SPLIT_ARGUMENTS_WHEN_COMMA_TERMINATED = false -# SPLIT_BEFORE_BITWISE_OPERATOR = false -# SPLIT_BEFORE_CLOSING_BRACKET = false -# SPLIT_BEFORE_DICT_SET_GENERATOR = false -# SPLIT_BEFORE_DOT = true -# SPLIT_BEFORE_EXPRESSION_AFTER_OPENING_PAREN = false -# SPLIT_BEFORE_FIRST_ARGUMENT = false -# SPLIT_BEFORE_LOGICAL_OPERATOR = false -# SPLIT_BEFORE_NAMED_ASSIGNS = false -# SPLIT_COMPLEX_COMPREHENSION = true -# SPLIT_PENALTY_IMPORT_NAMES = 10000 -# SPLIT_PENALTY_AFTER_OPENING_BRACKET = 350 -# SPLIT_PENALTY_BEFORE_IF_EXPR = 10000 -# SPLIT_PENALTY_COMPREHENSION = 2500 -# SPLIT_PENALTY_FOR_ADDED_LINE_SPLIT = 5000 +BLANK_LINE_BEFORE_CLASS_DOCSTRING = false +BLANK_LINE_BEFORE_MODULE_DOCSTRING = false +BLANK_LINE_BEFORE_NESTED_CLASS_OR_DEF = false +ALIGN_CLOSING_BRACKET_WITH_VISUAL_INDENT = true +ALLOW_MULTILINE_DICTIONARY_KEYS = false +ALLOW_SPLIT_BEFORE_DICT_VALUE = false +COALESCE_BRACKETS = true +NO_SPACES_AROUND_SELECTED_BINARY_OPERATORS = true +SPACES_AROUND_SUBSCRIPT_COLON = false +SPACES_AROUND_DICT_DELIMITERS = false +SPACES_AROUND_LIST_DELIMITERS = false +SPACES_AROUND_POWER_OPERATOR = false +SPACES_AROUND_TUPLE_DELIMITERS = false +SPACE_BETWEEN_ENDING_COMMA_AND_CLOSING_BRACKET = false +SPACE_INSIDE_BRACKETS = false +SPLIT_ALL_COMMA_SEPARATED_VALUES = false +SPLIT_BEFORE_DOT = true [tool.pytest.ini_options] addopts = ["-rfEX", "-pno:warnings", "--snapshot-warn-unused"] diff --git a/tools/dependencies.py b/tools/dependencies.py index b27e1412..ed98489d 100755 --- a/tools/dependencies.py +++ b/tools/dependencies.py @@ -29,10 +29,8 @@ class Classifier: @staticmethod def status() -> dict[int, str]: return { - v: status for v, status in zip(range(1, 8), [ - '1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', - '7 - Inactive' - ]) + v: status for v, status in zip(range( + 1, 8), ['1 - Planning', '2 - Pre-Alpha', '3 - Alpha', '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive']) } @staticmethod @@ -47,14 +45,10 @@ class Classifier: return cls_.joiner.join([cls_.identifier[identifier], *decls]) @staticmethod - def create_python_classifier(implementation: list[str] | None = None, - supported_version: list[str] | None = None) -> list[str]: + def create_python_classifier(implementation: list[str] | None = None, supported_version: list[str] | None = None) -> list[str]: if supported_version is None: supported_version = ['3.8', '3.9', '3.10', '3.11', '3.12'] if implementation is None: implementation = ['CPython', 'PyPy'] - base = [ - Classifier.create_classifier('language', 'Python'), - Classifier.create_classifier('language', 'Python', '3'), - ] + base = [Classifier.create_classifier('language', 'Python'), Classifier.create_classifier('language', 'Python', '3'),] 
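# --- Editor's note (illustrative sketch, not part of the patch) -------------
# `Classifier.create_classifier` used above appears to build PyPI trove
# classifier strings by joining an identifier prefix with the remaining
# segments (see `cls_.joiner.join([cls_.identifier[identifier], *decls])`
# earlier in this file's diff). A self-contained approximation, with an
# assumed prefix table:
def create_classifier(identifier: str, *decls: str) -> str:
  identifiers = {'language': 'Programming Language', 'status': 'Development Status'}  # assumed mapping
  return ' :: '.join([identifiers[identifier], *decls])

# e.g. create_classifier('language', 'Python', '3', 'Only')
#   -> 'Programming Language :: Python :: 3 :: Only'
# ---------------------------------------------------------------------------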
base.append(Classifier.create_classifier('language', 'Python', '3', 'Only')) base.extend([Classifier.create_classifier('language', 'Python', version) for version in supported_version]) base.extend([Classifier.create_classifier('language', 'Python', 'Implementation', impl) for impl in implementation]) @@ -153,8 +147,7 @@ _locals = locals().copy() # NOTE: update this table when adding new external dependencies # sync with openllm.utils.OPTIONAL_DEPENDENCIES -_base_requirements.update( - {v: _locals.get(f'{inflection.underscore(v).upper()}_DEPS') for v in openllm.utils.OPTIONAL_DEPENDENCIES}) +_base_requirements.update({v: _locals.get(f'{inflection.underscore(v).upper()}_DEPS') for v in openllm.utils.OPTIONAL_DEPENDENCIES}) _base_requirements = {k: v for k, v in sorted(_base_requirements.items())} @@ -187,10 +180,7 @@ def create_optional_table() -> Table: all_array.append(f"openllm[{','.join(_base_requirements)}]") table = tomlkit.table(is_super_table=True) - _base_requirements.update({ - 'full': correct_style(all_array.multiline(True)), - 'all': tomlkit.array('["openllm[full]"]') - }) + _base_requirements.update({'full': correct_style(all_array.multiline(True)), 'all': tomlkit.array('["openllm[full]"]')}) table.update({k: v for k, v in sorted(_base_requirements.items())}) table.add(tomlkit.nl()) @@ -228,9 +218,8 @@ def authors() -> Array: def keywords() -> Array: arr = correct_style(tomlkit.array()) arr.extend([ - 'MLOps', 'AI', 'BentoML', 'Model Serving', 'Model Deployment', 'LLMOps', 'Falcon', 'Vicuna', 'Llama 2', - 'Fine tuning', 'Serverless', 'Large Language Model', 'Generative AI', 'StableLM', 'Alpaca', 'PyTorch', - 'Transformers' + 'MLOps', 'AI', 'BentoML', 'Model Serving', 'Model Deployment', 'LLMOps', 'Falcon', 'Vicuna', 'Llama 2', 'Fine tuning', 'Serverless', + 'Large Language Model', 'Generative AI', 'StableLM', 'Alpaca', 'PyTorch', 'Transformers' ]) return arr.multiline(True) @@ -240,8 +229,7 @@ def build_cli_extensions() -> Table: ext.update({ f'openllm-{inflection.dasherize(ke)}': f'openllm.cli.extension.{ke}:cli' for ke in sorted([ fname[:-3] - for fname in os.listdir( - os.path.abspath(os.path.join(ROOT, 'openllm-python', 'src', 'openllm', 'cli', 'extension'))) + for fname in os.listdir(os.path.abspath(os.path.join(ROOT, 'openllm-python', 'src', 'openllm', 'cli', 'extension'))) if fname.endswith('.py') and not fname.startswith('__') ]) }) diff --git a/tools/update-brew-tap.py b/tools/update-brew-tap.py index 18c34bf6..7a9b816e 100755 --- a/tools/update-brew-tap.py +++ b/tools/update-brew-tap.py @@ -21,8 +21,7 @@ _gz_strategies: dict[t.Literal['macos_arm', 'macos_intel', 'linux_intel'], str] 'linux_intel': 'x86_64-unknown-linux-musl' } -def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', - 'archive']) -> str: +def determine_release_url(svn_url: str, tag: str, target: t.Literal['macos_arm', 'macos_intel', 'linux_intel', 'archive']) -> str: if target == 'archive': return f'{svn_url}/archive/{tag}.tar.gz' return f"{svn_url}/releases/download/{tag}/openllm-{tag.replace('v', '')}-{_gz_strategies[target]}.tar.gz" @@ -36,11 +35,9 @@ def main() -> int: release_tag = api.repos.get_latest_release().name shadict: dict[str, t.Any] = { - k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip() - for k in _gz_strategies + k: get_release_hash_command(determine_release_url(_info.svn_url, release_tag, k), release_tag)().strip() for k in _gz_strategies } - shadict['archive'] = 
get_release_hash_command(determine_release_url(_info.svn_url, release_tag, 'archive'), - release_tag)().strip() + shadict['archive'] = get_release_hash_command(determine_release_url(_info.svn_url, release_tag, 'archive'), release_tag)().strip() ENVIRONMENT = Environment(extensions=['jinja2.ext.do', 'jinja2.ext.loopcontrols', 'jinja2.ext.debug'], trim_blocks=True, diff --git a/tools/update-config-stubs.py b/tools/update-config-stubs.py index 1daf0ab2..5df89839 100755 --- a/tools/update-config-stubs.py +++ b/tools/update-config-stubs.py @@ -24,14 +24,12 @@ def process_annotations(annotations: str) -> str: else: return annotations _value_docstring = { - 'default_id': - '''Return the default model to use when using 'openllm start '. + 'default_id': '''Return the default model to use when using 'openllm start '. This could be one of the keys in 'self.model_ids' or custom users model. This field is required when defining under '__config__'. ''', - 'model_ids': - '''A list of supported pretrained models tag for this given runnable. + 'model_ids': '''A list of supported pretrained models tag for this given runnable. For example: For FLAN-T5 impl, this would be ["google/flan-t5-small", "google/flan-t5-base", @@ -39,8 +37,7 @@ _value_docstring = { This field is required when defining under '__config__'. ''', - 'architecture': - '''The model architecture that is supported by this LLM. + 'architecture': '''The model architecture that is supported by this LLM. Note that any model weights within this architecture generation can always be run and supported by this LLM. @@ -50,34 +47,21 @@ _value_docstring = { ```bash openllm start gpt-neox --model-id stabilityai/stablelm-tuned-alpha-3b ```''', - 'default_backend': - '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''', - 'url': - 'The resolved url for this LLMConfig.', - 'requires_gpu': - 'Determines if this model is only available on GPU. By default it supports GPU and fallback to CPU.', - 'trust_remote_code': - 'Whether to always trust remote code', - 'service_name': - "Generated service name for this LLMConfig. By default, it is \"generated_{model_name}_service.py\"", - 'requirements': - 'The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.', - 'model_type': - 'The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"', - 'name_type': - '''The default name typed for this model. "dasherize" will convert the name to lowercase and + 'default_backend': '''The default backend to run LLM based on available accelerator. By default, it will be PyTorch (pt) for most models. For some models, such as Llama, it will use `vllm` or `flax`. It is a dictionary of key as the accelerator spec in k8s ('cpu', 'nvidia.com/gpu', 'amd.com/gpu', 'cloud-tpus.google.com/v2', ...) and the values as supported OpenLLM backend ('flax', 'tf', 'pt', 'vllm', 'ggml', 'mlc')''', + 'url': 'The resolved url for this LLMConfig.', + 'requires_gpu': 'Determines if this model is only available on GPU. 
By default it supports GPU and fallback to CPU.', + 'trust_remote_code': 'Whether to always trust remote code', + 'service_name': "Generated service name for this LLMConfig. By default, it is \"generated_{model_name}_service.py\"", + 'requirements': 'The default PyPI requirements needed to run this given LLM. By default, we will depend on bentoml, torch, transformers.', + 'model_type': 'The model type for this given LLM. By default, it should be causal language modeling. Currently supported "causal_lm" or "seq2seq_lm"', + 'name_type': '''The default name typed for this model. "dasherize" will convert the name to lowercase and replace spaces with dashes. "lowercase" will convert the name to lowercase. If this is not set, then both `model_name` and `start_name` must be specified.''', - 'model_name': - 'The normalized version of __openllm_start_name__, determined by __openllm_name_type__', - 'start_name': - 'Default name to be used with `openllm start`', - 'env': - 'A EnvVarMixin instance for this LLMConfig.', - 'timeout': - 'The default timeout to be set for this given LLM.', - 'workers_per_resource': - '''The number of workers per resource. This is used to determine the number of workers to use for this model. + 'model_name': 'The normalized version of __openllm_start_name__, determined by __openllm_name_type__', + 'start_name': 'Default name to be used with `openllm start`', + 'env': 'A EnvVarMixin instance for this LLMConfig.', + 'timeout': 'The default timeout to be set for this given LLM.', + 'workers_per_resource': '''The number of workers per resource. This is used to determine the number of workers to use for this model. For example, if this is set to 0.5, then OpenLLM will use 1 worker per 2 resources. If this is set to 1, then OpenLLM will use 1 worker per resource. If this is set to 2, then OpenLLM will use 2 workers per resource. @@ -86,10 +70,8 @@ _value_docstring = { By default, it is set to 1. ''', - 'fine_tune_strategies': - 'The fine-tune strategies for this given LLM.', - 'tokenizer_class': - 'Optional tokenizer class for this given LLM. See Llama for example.', + 'fine_tune_strategies': 'The fine-tune strategies for this given LLM.', + 'tokenizer_class': 'Optional tokenizer class for this given LLM. 
See Llama for example.', } _transformed = {'fine_tune_strategies': 't.Dict[AdapterType, FineTuneConfig]'} @@ -99,16 +81,13 @@ def main() -> int: processed = f.readlines() start_idx, end_idx = processed.index(' ' * 2 + START_COMMENT), processed.index(' ' * 2 + END_COMMENT) - start_stub_idx, end_stub_idx = processed.index(' ' * 4 + START_SPECIAL_COMMENT), processed.index(' ' * 4 + - END_SPECIAL_COMMENT) - start_attrs_idx, end_attrs_idx = processed.index(' ' * 4 + START_ATTRS_COMMENT), processed.index(' ' * 4 + - END_ATTRS_COMMENT) + start_stub_idx, end_stub_idx = processed.index(' ' * 4 + START_SPECIAL_COMMENT), processed.index(' ' * 4 + END_SPECIAL_COMMENT) + start_attrs_idx, end_attrs_idx = processed.index(' ' * 4 + START_ATTRS_COMMENT), processed.index(' ' * 4 + END_ATTRS_COMMENT) # NOTE: inline stubs __config__ attrs representation special_attrs_lines: list[str] = [] for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): - special_attrs_lines.append( - f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n") + special_attrs_lines.append(f"{' ' * 4}{keys}: {_transformed.get(keys, process_annotations(ForwardRef.__forward_arg__))}\n") # NOTE: inline stubs for _ConfigAttr type stubs config_attr_lines: list[str] = [] for keys, ForwardRef in codegen.get_annotations(ModelSettings).items(): @@ -132,40 +111,28 @@ def main() -> int: lines.append(' ' * 2 + '# NOTE: generation_class, sampling_class and extras arguments\n') lines.extend([ ' ' * 2 + line for line in [ - '@overload\n', - "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n", - '@overload\n', - "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n", - '@overload\n', "def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n", + '@overload\n', "def __getitem__(self, item: t.Literal['generation_class']) -> t.Type[openllm_core.GenerationConfig]: ...\n", '@overload\n', + "def __getitem__(self, item: t.Literal['sampling_class']) -> t.Type[openllm_core.SamplingParams]: ...\n", '@overload\n', + "def __getitem__(self, item: t.Literal['extras']) -> t.Dict[str, t.Any]: ...\n", ] ]) lines.append(' ' * 2 + '# NOTE: GenerationConfig arguments\n') generation_config_anns = codegen.get_annotations(GenerationConfig) for keys, type_pep563 in generation_config_anns.items(): - lines.extend([ - ' ' * 2 + line - for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"] - ]) + lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n"]]) lines.append(' ' * 2 + '# NOTE: SamplingParams arguments\n') for keys, type_pep563 in codegen.get_annotations(SamplingParams).items(): if keys not in generation_config_anns: - lines.extend([ - ' ' * 2 + line - for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n",] - ]) + lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys}']) -> {type_pep563}: ...\n",]]) lines.append(' ' * 2 + '# NOTE: PeftType arguments\n') for keys in PeftType._member_names_: - lines.extend([ - ' ' * 2 + line for line in - ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, t.Any]: ...\n",] - ]) + lines.extend([' ' * 2 + line for line in ['@overload\n', f"def __getitem__(self, item: t.Literal['{keys.lower()}']) -> dict[str, 
t.Any]: ...\n",]]) - processed = processed[:start_attrs_idx] + [ - ' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT - ] + processed[end_attrs_idx + 1:start_stub_idx] + [ - ' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT - ] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT - ] + processed[end_idx + 1:] + processed = processed[:start_attrs_idx] + [' ' * 4 + START_ATTRS_COMMENT, *special_attrs_lines, ' ' * 4 + END_ATTRS_COMMENT + ] + processed[end_attrs_idx + 1:start_stub_idx] + [ + ' ' * 4 + START_SPECIAL_COMMENT, *config_attr_lines, ' ' * 4 + END_SPECIAL_COMMENT + ] + processed[end_stub_idx + 1:start_idx] + [' ' * 2 + START_COMMENT, *lines, ' ' * 2 + END_COMMENT + ] + processed[end_idx + 1:] with _TARGET_FILE.open('w') as f: f.writelines(processed) return 0 diff --git a/tools/update-dummy.py b/tools/update-dummy.py index 104430de..f3aac8d8 100755 --- a/tools/update-dummy.py +++ b/tools/update-dummy.py @@ -14,15 +14,10 @@ from openllm import CONFIG_MAPPING if t.TYPE_CHECKING: from collections import OrderedDict config_requirements = { - k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None - for k, v in CONFIG_MAPPING.items() -} -_dependencies: dict[LiteralBackend, str] = { - k: v for k, v in zip(LiteralBackend.__args__[:-2], ('torch', 'tensorflow', 'flax', 'vllm')) -} -_auto: dict[str, str] = { - k: v for k, v in zip(LiteralBackend.__args__[:-2], ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM')) + k: [_.replace('-', '_') for _ in v.__openllm_requirements__] if v.__openllm_requirements__ else None for k, v in CONFIG_MAPPING.items() } +_dependencies: dict[LiteralBackend, str] = {k: v for k, v in zip(LiteralBackend.__args__[:-2], ('torch', 'tensorflow', 'flax', 'vllm'))} +_auto: dict[str, str] = {k: v for k, v in zip(LiteralBackend.__args__[:-2], ('AutoLLM', 'AutoTFLLM', 'AutoFlaxLLM', 'AutoVLLM'))} def get_target_dummy_file(backend: LiteralBackend) -> Path: return _ROOT / 'openllm-python' / 'src' / 'openllm' / 'utils' / f'dummy_{backend}_objects.py' @@ -36,34 +31,29 @@ def get_mapping(backend: LiteralBackend) -> OrderedDict[t.Any, t.Any]: def make_class_stub(model_name: str, backend: LiteralBackend, indentation: int = 2, auto: bool = False) -> list[str]: _dep_list: list[str] = [ f'"{v}"' for v in [ - _dependencies[backend], *(t.cast(t.List[str], config_requirements[model_name] - ) if model_name != '__default__' and config_requirements[model_name] else []) + _dependencies[backend], *( + t.cast(t.List[str], config_requirements[model_name]) if model_name != '__default__' and config_requirements[model_name] else []) ] ] if auto: cl_ = _auto[backend] else: cl_ = get_mapping(backend)[model_name] lines = [ f'class {cl_}(metaclass=_DummyMetaclass):', ' ' * indentation + f"_backends=[{','.join(_dep_list)}]", - ' ' * indentation + - f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])" + ' ' * indentation + f"def __init__(self,*param_decls:_t.Any,**attrs: _t.Any):_require_backends(self,[{','.join(_dep_list)}])" ] return lines def write_stub(backend: LiteralBackend, _path: str) -> list[str]: base = [ - f'# This file is generated by {_path}. 
DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', - 'from __future__ import annotations', 'import typing as _t', - 'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends', + f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations', + 'import typing as _t', 'from openllm_core.utils import DummyMetaclass as _DummyMetaclass, require_backends as _require_backends', ] base.extend([v for it in [make_class_stub(k, backend) for k in get_mapping(backend)] for v in it]) # autoclass base.extend(make_class_stub('__default__', backend, auto=True)) # mapping and export _imports = [f'"{v}"' for v in get_mapping(backend).values()] - base += [ - f'{mapping_names(backend)}:_t.Any=None', - f"__all__:list[str]=[\"{mapping_names(backend)}\",\"{_auto[backend]}\",{','.join(_imports)}]\n" - ] + base += [f'{mapping_names(backend)}:_t.Any=None', f"__all__:list[str]=[\"{mapping_names(backend)}\",\"{_auto[backend]}\",{','.join(_imports)}]\n"] return base def main() -> int: diff --git a/tools/update-models-import.py b/tools/update-models-import.py index c6e94a3d..f401a6da 100755 --- a/tools/update-models-import.py +++ b/tools/update-models-import.py @@ -6,27 +6,23 @@ from pathlib import Path _TARGET_FILE = Path(__file__).parent.parent / 'openllm-python' / 'src' / 'openllm' / 'models' / '__init__.py' def create_module_import() -> str: - r = [ - f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/') - if p.name not in ['__pycache__', '__init__.py', '.DS_Store'] - ] + r = [f'"{p.name}"' for p in _TARGET_FILE.parent.glob('*/') if p.name not in ['__pycache__', '__init__.py', '.DS_Store']] return f"_MODELS:set[str]={{{', '.join(sorted(r))}}}" def create_stubs_import() -> list[str]: return [ - 'if t.TYPE_CHECKING:from . import ' + ','.join([ - f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) - if p.name not in {'__pycache__', '__init__.py', '.DS_Store'} - ]), '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', '__all__=__lazy.__all__', - '__dir__=__lazy.__dir__', '__getattr__=__lazy.__getattr__\n' + 'if t.TYPE_CHECKING:from . import ' + + ','.join([f'{p.name} as {p.name}' for p in sorted(_TARGET_FILE.parent.glob('*/')) if p.name not in {'__pycache__', '__init__.py', '.DS_Store'}]), + '__lazy=LazyModule(__name__, globals()["__file__"], {k: [] for k in _MODELS})', '__all__=__lazy.__all__', '__dir__=__lazy.__dir__', + '__getattr__=__lazy.__getattr__\n' ] def main() -> int: _path = os.path.join(os.path.basename(os.path.dirname(__file__)), os.path.basename(__file__)) with _TARGET_FILE.open('w') as f: f.writelines('\n'.join([ - f'# This file is generated by {_path}. DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', - 'from __future__ import annotations', 'import typing as t', 'from openllm_core.utils import LazyModule', + f'# This file is generated by {_path}. 
DO NOT EDIT MANUALLY!', f'# To update this, run ./{_path}', 'from __future__ import annotations', + 'import typing as t', 'from openllm_core.utils import LazyModule', create_module_import(), *create_stubs_import(), ])) return 0 diff --git a/tools/update-readme.py b/tools/update-readme.py index 97218fec..0bb2f9de 100755 --- a/tools/update-readme.py +++ b/tools/update-readme.py @@ -42,8 +42,7 @@ def main() -> int: meta.extend([f'{header}\n' for header in formatted.keys() if header not in ('URL',)]) meta += ['\n'] # NOTE: rows - for name, architecture, url, model_ids, installation in t.cast(t.Iterable[t.Tuple[str, str, str, t.List[str], str]], - zip(*formatted.values())): + for name, architecture, url, model_ids, installation in t.cast(t.Iterable[t.Tuple[str, str, str, t.List[str], str]], zip(*formatted.values())): meta += '\n' # configure architecture URL cfg_cls = openllm.CONFIG_MAPPING[name] diff --git a/tools/write-coverage-report.py b/tools/write-coverage-report.py index fd329e32..8ba14449 100755 --- a/tools/write-coverage-report.py +++ b/tools/write-coverage-report.py @@ -31,8 +31,7 @@ def main() -> int: color = 'ok' if float(total_rate) >= 95 else 'critical' lines.insert(0, f'![Code Coverage](https://img.shields.io/badge/coverage-{total_rate}%25-{color}?style=flat)\n') - lines.append( - f'**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n') + lines.append(f'**Summary** | {100 if total_rate == 100 else total_rate}% ({total_statements_covered} / {total_statements})\n') coverage_report = ROOT / 'coverage-report.md' with coverage_report.open('w', encoding='utf-8') as f: