chore(cli): move playground to CLI components (#655)
Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
@@ -15,12 +15,10 @@ import pathlib as _pathlib
import warnings as _warnings

import openllm_cli as _cli
from openllm_cli import _sdk

from . import utils as utils

if utils.DEBUG:
  utils.set_debug_mode(True)
  utils.set_quiet_mode(False)
@@ -51,7 +49,6 @@ __lazy = utils.LazyModule(
    'exceptions': [],
    'client': ['HTTPClient', 'AsyncHTTPClient'],
    'bundle': [],
    'playground': [],
    'testing': [],
    'protocol': [],
    'utils': [],
@@ -38,7 +38,6 @@ from . import (
  bundle as bundle,
  client as client,
  exceptions as exceptions,
  playground as playground,
  serialisation as serialisation,
  testing as testing,
  utils as utils,
@@ -1,14 +0,0 @@
This folder is a playground and the source of truth for feature development around OpenLLM.

```bash
openllm playground
```

## Usage for developing this module

Write a Python script that `jupytext` can convert into a Jupyter notebook via `openllm playground`.

Make sure to add the new file and its documentation to [`_meta.yml`](./_meta.yml).
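For illustration, a minimal sketch of the kind of conversion `jupytext` performs for these scripts; the file names and the `fmt` argument below are illustrative assumptions rather than the exact call `openllm playground` makes:

```python
import jupytext

# Read the plain Python playground script into jupytext's in-memory notebook representation.
notebook = jupytext.read('features.py')

# Write it back out as an .ipynb file that Jupyter can open.
jupytext.write(notebook, 'features.ipynb', fmt='ipynb')
```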
@@ -1,36 +0,0 @@
features:
  description: |
    ## General introduction to OpenLLM.

    This script will demo a few features from OpenLLM:

    - Usage of the Auto class abstraction and running predictions with `generate`
    - Ability to send per-request parameters
    - Runner integration with BentoML
opt_tuned:
  description: |
    ## Fine tuning OPT

    This script demonstrates how one can easily fine-tune OPT
    with [LoRA](https://arxiv.org/abs/2106.09685) and in int8 with bitsandbytes.

    It is based on one of the PEFT example fine-tuning scripts.
    It requires at least one GPU, so make sure one is available.
falcon_tuned:
  description: |
    ## Fine tuning Falcon

    This script demonstrates how one can fine-tune Falcon using [QLoRA](https://arxiv.org/pdf/2305.14314.pdf)
    and [trl](https://github.com/lvwerra/trl).

    It is trained on OpenAssistant's Guanaco [dataset](https://huggingface.co/datasets/timdettmers/openassistant-guanaco).

    It requires at least one GPU, so make sure one is available.
llama2_qlora:
  description: |
    ## Fine tuning LLaMA 2

    This script demonstrates how one can fine-tune LLaMA 2 using QLoRA.

    It is trained on the [Dolly dataset](https://huggingface.co/datasets/databricks/databricks-dolly-15k).

    It requires at least one GPU to be available.
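As a quick illustration of the per-request parameters mentioned in the `features` entry, a minimal sketch (the model id is an arbitrary example; the `generate` calls mirror the features script removed later in this commit):

```python
import asyncio

import openllm

# Arbitrary example model id; any model supported by OpenLLM works the same way.
llm = openllm.LLM('facebook/opt-1.3b')


async def demo() -> None:
  prompt = 'What is the meaning of life?'
  # Uses the model's default generation config.
  print(await llm.generate(prompt))
  # Per-request override: this call alone is capped at 64 new tokens.
  print(await llm.generate(prompt, max_new_tokens=64))


asyncio.run(demo())
```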
@@ -1,95 +0,0 @@
from __future__ import annotations
import dataclasses
import logging
import os
import sys
import typing as t

import torch
import transformers

import openllm

# Make sure to have at least one GPU to run this script

openllm.utils.configure_logging()

logger = logging.getLogger(__name__)

# On notebook, make sure to install the following
# ! pip install -U openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git

from datasets import load_dataset
from trl import SFTTrainer

DEFAULT_MODEL_ID = 'ybelkada/falcon-7b-sharded-bf16'
DATASET_NAME = 'timdettmers/openassistant-guanaco'


@dataclasses.dataclass
class TrainingArguments:
  per_device_train_batch_size: int = dataclasses.field(default=4)
  gradient_accumulation_steps: int = dataclasses.field(default=4)
  optim: str = dataclasses.field(default='paged_adamw_32bit')
  save_steps: int = dataclasses.field(default=10)
  warmup_steps: int = dataclasses.field(default=10)
  max_steps: int = dataclasses.field(default=500)
  logging_steps: int = dataclasses.field(default=10)
  learning_rate: float = dataclasses.field(default=2e-4)
  max_grad_norm: float = dataclasses.field(default=0.3)
  warmup_ratio: float = dataclasses.field(default=0.03)
  fp16: bool = dataclasses.field(default=True)
  group_by_length: bool = dataclasses.field(default=True)
  lr_scheduler_type: str = dataclasses.field(default='constant')
  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), 'outputs', 'falcon'))


@dataclasses.dataclass
class ModelArguments:
  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
  max_sequence_length: int = dataclasses.field(default=512)


parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
  # If we pass only one argument to the script and it's the path to a json file,
  # let's parse it to get our arguments.
  model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())

llm = openllm.LLM(
  model_args.model_id, quantize='int4', bnb_4bit_quant_type='nf4', bnb_4bit_compute_dtype=torch.float16
)
model, tokenizer = llm.prepare_for_training(
  adapter_type='lora',
  lora_alpha=16,
  lora_dropout=0.1,
  r=16,
  bias='none',
  target_modules=['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h'],
)
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset(DATASET_NAME, split='train')

trainer = SFTTrainer(
  model=model,
  train_dataset=dataset,
  dataset_text_field='text',
  max_seq_length=model_args.max_sequence_length,
  tokenizer=tokenizer,
  args=dataclasses.replace(
    transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)
  ),
)

# upcast layernorm in float32 for more stable training
for name, module in trainer.model.named_modules():
  if 'norm' in name:
    module = module.to(torch.float32)

trainer.train()

trainer.model.save_pretrained(os.path.join(training_args.output_dir, 'lora'))
@@ -1,44 +0,0 @@
from __future__ import annotations
import argparse
import logging
import typing as t

import asyncio
import openllm

openllm.utils.configure_logging()

logger = logging.getLogger(__name__)

MAX_NEW_TOKENS = 384

Q = 'Answer the following question, step by step:\n{q}\nA:'
question = 'What is the meaning of life?'


async def main() -> int:
  parser = argparse.ArgumentParser()
  parser.add_argument('question', nargs='?', default=question)

  if openllm.utils.in_notebook():
    args = parser.parse_args(args=[question])
  else:
    args = parser.parse_args()

  llm = openllm.LLM[t.Any, t.Any]('facebook/opt-2.7b')
  prompt = Q.format(q=args.question)

  logger.info("%s Running with 'generate()' %s", '-' * 50, '-' * 50)
  res = await llm.generate(prompt)
  logger.info('%s Response: %s', '=' * 10, res)

  logger.info("%s Running with 'generate()' and per-request arguments %s", '-' * 50, '-' * 50)
  res = await llm.generate(prompt, max_new_tokens=MAX_NEW_TOKENS)
  logger.info('%s Response: %s', '=' * 10, res)

  return 0


def _mp_fn(index: t.Any):  # type: ignore
  # For xla_spawn (TPUs)
  asyncio.run(main())
@@ -1,238 +0,0 @@
from __future__ import annotations
import dataclasses
import logging
import os
import sys
import typing as t

import torch
import transformers

import openllm

if t.TYPE_CHECKING:
  import peft

# Make sure to have at least one GPU to run this script

openllm.utils.configure_logging()

logger = logging.getLogger(__name__)

# On notebook, make sure to install the following
# ! pip install -U openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git

from functools import partial
from itertools import chain
from random import randint, randrange

import bitsandbytes as bnb
from datasets import load_dataset


# COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, bnb.nn.Linear4bit):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])

  if 'lm_head' in lora_module_names:  # needed for 16-bit
    lora_module_names.remove('lm_head')
  return list(lora_module_names)


# Change this to the local converted path if you don't have access to the meta-llama model
DEFAULT_MODEL_ID = 'meta-llama/Llama-2-7b-hf'
# Change this to 'main' if you want to use the latest llama
DEFAULT_MODEL_VERSION = '335a02887eb6684d487240bbc28b5699298c3135'
DATASET_NAME = 'databricks/databricks-dolly-15k'


def format_dolly(sample):
  instruction = f"### Instruction\n{sample['instruction']}"
  context = f"### Context\n{sample['context']}" if len(sample['context']) > 0 else None
  response = f"### Answer\n{sample['response']}"
  # join all the parts together
  prompt = '\n\n'.join([i for i in [instruction, context, response] if i is not None])
  return prompt


# template dataset to add prompt to each sample
def template_dataset(sample, tokenizer):
  sample['text'] = f'{format_dolly(sample)}{tokenizer.eos_token}'
  return sample


# empty list to save remainder from batches to use in next batch
remainder = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}


def chunk(sample, chunk_length=2048):
  # define global remainder variable to save remainder from batches to use in next batch
  global remainder
  # Concatenate all texts and add remainder from previous batch
  concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
  concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
  # get total number of tokens for batch
  batch_total_length = len(concatenated_examples[next(iter(sample.keys()))])

  # get max number of chunks for batch
  if batch_total_length >= chunk_length:
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

  # Split by chunks of max_len.
  result = {
    k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
    for k, t in concatenated_examples.items()
  }
  # add remainder to global variable for next batch
  remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
  # prepare labels
  result['labels'] = result['input_ids'].copy()
  return result


def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
  # Load dataset from the hub
  dataset = load_dataset(dataset_name, split='train')

  print(f'dataset size: {len(dataset)}')
  print(dataset[randrange(len(dataset))])

  # apply prompt template per sample
  dataset = dataset.map(partial(template_dataset, tokenizer=tokenizer), remove_columns=list(dataset.features))
  # print random sample
  print('Sample from dolly-v2 ds:', dataset[randint(0, len(dataset) - 1)]['text'])

  # tokenize and chunk dataset
  lm_dataset = dataset.map(
    lambda sample: tokenizer(sample['text']), batched=True, remove_columns=list(dataset.features)
  ).map(partial(chunk, chunk_length=2048), batched=True)

  # Print total number of samples
  print(f'Total number of samples: {len(lm_dataset)}')
  return lm_dataset


def prepare_for_int4_training(
  model_id: str, model_version: str | None = None, gradient_checkpointing: bool = True, bf16: bool = True
) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
  from peft.tuners.lora import LoraLayer

  llm = openllm.LLM(
    model_id,
    revision=model_version,
    quantize='int4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    use_cache=not gradient_checkpointing,
    device_map='auto',
  )
  print('Model summary:', llm.model)

  # get lora target modules
  modules = find_all_linear_names(llm.model)
  print(f'Found {len(modules)} modules to quantize: {modules}')

  model, tokenizer = llm.prepare_for_training(
    adapter_type='lora', use_gradient_checkpointing=gradient_checkpointing, target_modules=modules
  )

  # pre-process the model by upcasting the layer norms in float32 for more stable training
  for name, module in model.named_modules():
    if isinstance(module, LoraLayer):
      if bf16:
        module = module.to(torch.bfloat16)
    if 'norm' in name:
      module = module.to(torch.float32)
    if 'lm_head' in name or 'embed_tokens' in name:
      if hasattr(module, 'weight'):
        if bf16 and module.weight.dtype == torch.float32:
          module = module.to(torch.bfloat16)
  return model, tokenizer


@dataclasses.dataclass
class TrainingArguments:
  per_device_train_batch_size: int = dataclasses.field(default=1)
  gradient_checkpointing: bool = dataclasses.field(default=True)
  bf16: bool = dataclasses.field(default=torch.cuda.get_device_capability()[0] == 8)
  learning_rate: float = dataclasses.field(default=5e-5)
  num_train_epochs: int = dataclasses.field(default=3)
  logging_steps: int = dataclasses.field(default=1)
  report_to: str = dataclasses.field(default='none')
  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), 'outputs', 'llama'))
  save_strategy: str = dataclasses.field(default='no')


@dataclasses.dataclass
class ModelArguments:
  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
  model_version: str = dataclasses.field(default=DEFAULT_MODEL_VERSION)
  seed: int = dataclasses.field(default=42)
  merge_weights: bool = dataclasses.field(default=False)


if openllm.utils.in_notebook():
  model_args, training_args = ModelArguments(), TrainingArguments()
else:
  parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
  if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
    # If we pass only one argument to the script and it's the path to a json file,
    # let's parse it to get our arguments.
    model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
  else:
    model_args, training_args = t.cast(
      t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses()
    )

# import the model ahead of time
openllm.import_model(model_id=model_args.model_id, model_version=model_args.model_version)


def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
  import peft

  transformers.set_seed(model_args.seed)

  model, tokenizer = prepare_for_int4_training(
    model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16
  )
  datasets = prepare_datasets(tokenizer)

  trainer = transformers.Trainer(
    model=model,
    args=dataclasses.replace(
      transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)
    ),
    train_dataset=datasets,
    data_collator=transformers.default_data_collator,
  )

  trainer.train()

  if model_args.merge_weights:
    # note that this requires a larger GPU, as the whole model will be loaded into memory

    # merge adapter weights with base model and save
    # save int4 model
    trainer.model.save_pretrained(training_args.output_dir, safe_serialization=False)

    # gc mem
    del model, trainer
    torch.cuda.empty_cache()

    model = peft.AutoPeftModelForCausalLM.from_pretrained(
      training_args.output_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )
    # merge lora with base weights and save
    model = model.merge_and_unload()
    model.save_pretrained(
      os.path.join(os.getcwd(), 'outputs', 'merged_llama_lora'), safe_serialization=True, max_shard_size='2GB'
    )
  else:
    trainer.model.save_pretrained(os.path.join(training_args.output_dir, 'lora'))


train_loop(model_args, training_args)
@@ -1,81 +0,0 @@
from __future__ import annotations
import dataclasses
import logging
import os
import sys
import typing as t

import transformers

import openllm

# Make sure to have at least one GPU to run this script

openllm.utils.configure_logging()

logger = logging.getLogger(__name__)

# On notebook, make sure to install the following
# ! pip install -U openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git

from datasets import load_dataset

if t.TYPE_CHECKING:
  from peft import PeftModel

DEFAULT_MODEL_ID = 'facebook/opt-6.7b'


def load_trainer(
  model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments
):
  return transformers.Trainer(
    model=model,
    train_dataset=dataset_dict['train'],
    args=dataclasses.replace(
      transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
  )


@dataclasses.dataclass
class TrainingArguments:
  per_device_train_batch_size: int = dataclasses.field(default=4)
  gradient_accumulation_steps: int = dataclasses.field(default=4)
  warmup_steps: int = dataclasses.field(default=10)
  max_steps: int = dataclasses.field(default=50)
  learning_rate: float = dataclasses.field(default=3e-4)
  fp16: bool = dataclasses.field(default=True)
  logging_steps: int = dataclasses.field(default=1)
  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), 'outputs', 'opt'))


@dataclasses.dataclass
class ModelArguments:
  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)


parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
  # If we pass only one argument to the script and it's the path to a json file,
  # let's parse it to get our arguments.
  model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())

llm = openllm.LLM(model_args.model_id, quantize='int8')
model, tokenizer = llm.prepare_for_training(
  adapter_type='lora', r=16, lora_alpha=32, target_modules=['q_proj', 'v_proj'], lora_dropout=0.05, bias='none'
)

# ft on english_quotes
data = load_dataset('Abirate/english_quotes')
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

trainer = load_trainer(model, tokenizer, data, training_args)
model.config.use_cache = False  # silence the warnings; re-enable for inference later

trainer.train()

trainer.model.save_pretrained(os.path.join(training_args.output_dir, 'lora'))