chore(cli): move playground to CLI components (#655)

Signed-off-by: Aaron <29749331+aarnphm@users.noreply.github.com>
This commit is contained in:
Aaron Pham
2023-11-14 23:20:50 -05:00
committed by GitHub
parent cbdcfc87a2
commit 103156cd71
13 changed files with 70 additions and 82 deletions


@@ -15,12 +15,10 @@ import pathlib as _pathlib
import warnings as _warnings
import openllm_cli as _cli
from openllm_cli import _sdk
from . import utils as utils
if utils.DEBUG:
  utils.set_debug_mode(True)
  utils.set_quiet_mode(False)
@@ -51,7 +49,6 @@ __lazy = utils.LazyModule(
  'exceptions': [],
  'client': ['HTTPClient', 'AsyncHTTPClient'],
  'bundle': [],
  'playground': [],
  'testing': [],
  'protocol': [],
  'utils': [],


@@ -38,7 +38,6 @@ from . import (
  bundle as bundle,
  client as client,
  exceptions as exceptions,
  playground as playground,
  serialisation as serialisation,
  testing as testing,
  utils as utils,


@@ -1,14 +0,0 @@
This folder serves as a playground and the source of truth for feature development around OpenLLM. Run it with:
```bash
openllm playground
```
## Usage for developing this module
Write a Python script that `jupytext` can convert into a Jupyter notebook via `openllm playground`, as sketched below.
Make sure to add the new file and its documentation to [`_meta.yml`](./_meta.yml).
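
For orientation, a minimal sketch of such a script, assuming jupytext's percent cell format and the `openllm.LLM` API used by the scripts below (the file name, model id, and prompt are illustrative only):

```python
# example_feature.py -- hypothetical playground script in jupytext's percent format
# %% [markdown]
# ## Exploring `generate()` with per-request parameters

# %%
import asyncio
import openllm

llm = openllm.LLM('facebook/opt-1.3b')

# %%
# Each `# %%` marker becomes a separate notebook cell after conversion.
# When run as a plain script, drive the async API with asyncio.run;
# inside the converted notebook you would `await llm.generate(...)` directly.
print(asyncio.run(llm.generate('What is the meaning of life?', max_new_tokens=64)))
```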


@@ -1,36 +0,0 @@
features:
  description: |
    ## General introduction to OpenLLM
    This script demos a few features of OpenLLM:
    - Using the Auto class abstraction and running predictions with `generate`
    - The ability to send per-request parameters
    - Runner integration with BentoML
opt_tuned:
  description: |
    ## Fine-tuning OPT
    This script demonstrates how to fine-tune OPT
    with [LoRA](https://arxiv.org/abs/2106.09685) and in int8 with bitsandbytes.
    It is based on one of the PEFT example fine-tuning scripts.
    It requires at least one available GPU, so make sure one is present.
falcon_tuned:
  description: |
    ## Fine-tuning Falcon
    This script demonstrates how to fine-tune Falcon using [QLoRA](https://arxiv.org/pdf/2305.14314.pdf) and
    [trl](https://github.com/lvwerra/trl).
    It is trained on OpenAssistant's Guanaco [dataset](https://huggingface.co/datasets/timdettmers/openassistant-guanaco).
    It requires at least one available GPU, so make sure one is present.
llama2_qlora:
  description: |
    ## Fine-tuning LLaMA 2
    This script demonstrates how to fine-tune LLaMA 2 using LoRA with [trl](https://github.com/lvwerra/trl).
    It is trained on the [Dolly dataset](https://huggingface.co/datasets/databricks/databricks-dolly-15k).
    It requires at least one available GPU.
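
For reference, a rough sketch of how a runner could turn the scripts listed in `_meta.yml` into notebooks with `jupytext`; this is an illustration only, not the actual `openllm playground` implementation, and `scripts_dir`/`output_dir` are made-up names:

```python
import os

import jupytext
import yaml

scripts_dir = os.path.dirname(os.path.abspath(__file__))  # hypothetical: the playground package directory
output_dir = os.path.join(os.getcwd(), 'notebooks')
os.makedirs(output_dir, exist_ok=True)

with open(os.path.join(scripts_dir, '_meta.yml')) as f:
  meta = yaml.safe_load(f)

for name, entry in meta.items():
  # Each key in _meta.yml maps to a sibling <name>.py script.
  notebook = jupytext.read(os.path.join(scripts_dir, f'{name}.py'))
  jupytext.write(notebook, os.path.join(output_dir, f'{name}.ipynb'))
  print(f"converted {name}: {entry['description'].splitlines()[0]}")
```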


@@ -1,95 +0,0 @@
from __future__ import annotations
import dataclasses
import logging
import os
import sys
import typing as t
import torch
import transformers
import openllm
# Make sure to have at least one GPU to run this script
openllm.utils.configure_logging()
logger = logging.getLogger(__name__)
# In a notebook, make sure to install the dependencies first:
# ! pip install -U "openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git"
from datasets import load_dataset
from trl import SFTTrainer
DEFAULT_MODEL_ID = 'ybelkada/falcon-7b-sharded-bf16'
DATASET_NAME = 'timdettmers/openassistant-guanaco'
@dataclasses.dataclass
class TrainingArguments:
  per_device_train_batch_size: int = dataclasses.field(default=4)
  gradient_accumulation_steps: int = dataclasses.field(default=4)
  optim: str = dataclasses.field(default='paged_adamw_32bit')
  save_steps: int = dataclasses.field(default=10)
  warmup_steps: int = dataclasses.field(default=10)
  max_steps: int = dataclasses.field(default=500)
  logging_steps: int = dataclasses.field(default=10)
  learning_rate: float = dataclasses.field(default=2e-4)
  max_grad_norm: float = dataclasses.field(default=0.3)
  warmup_ratio: float = dataclasses.field(default=0.03)
  fp16: bool = dataclasses.field(default=True)
  group_by_length: bool = dataclasses.field(default=True)
  lr_scheduler_type: str = dataclasses.field(default='constant')
  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), 'outputs', 'falcon'))

@dataclasses.dataclass
class ModelArguments:
  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
  max_sequence_length: int = dataclasses.field(default=512)
parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
  # If we pass only one argument to the script and it's the path to a json file,
  # let's parse it to get our arguments.
  model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
llm = openllm.LLM(
  model_args.model_id, quantize='int4', bnb_4bit_quant_type='nf4', bnb_4bit_compute_dtype=torch.float16
)
model, tokenizer = llm.prepare_for_training(
  adapter_type='lora',
  lora_alpha=16,
  lora_dropout=0.1,
  r=16,
  bias='none',
  target_modules=['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h'],
)
model.config.use_cache = False
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset(DATASET_NAME, split='train')
trainer = SFTTrainer(
  model=model,
  train_dataset=dataset,
  dataset_text_field='text',
  max_seq_length=model_args.max_sequence_length,
  tokenizer=tokenizer,
  args=dataclasses.replace(
    transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)
  ),
)
# upcast layer norms to float32 for more stable training
for name, module in trainer.model.named_modules():
  if 'norm' in name:
    module = module.to(torch.float32)
trainer.train()
trainer.model.save_pretrained(os.path.join(training_args.output_dir, 'lora'))
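
Since the arguments go through `transformers.HfArgumentParser`, the script can also be driven by a single JSON file instead of CLI flags. A sketch of such a config, assuming the script is saved as `falcon_tuned.py` (file name and values are illustrative):

```python
# write_falcon_config.py -- illustrative helper, not part of the original playground
import json

config = {
  # ModelArguments
  'model_id': 'ybelkada/falcon-7b-sharded-bf16',
  'max_sequence_length': 512,
  # TrainingArguments; any field defined in the dataclass above can be overridden
  'max_steps': 100,
  'learning_rate': 2e-4,
  'output_dir': './outputs/falcon',
}

with open('falcon_tuned.json', 'w') as f:
  json.dump(config, f, indent=2)

# Assumed invocation afterwards: python falcon_tuned.py falcon_tuned.json
```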


@@ -1,44 +0,0 @@
from __future__ import annotations
import argparse
import logging
import typing as t
import asyncio
import openllm
openllm.utils.configure_logging()
logger = logging.getLogger(__name__)
MAX_NEW_TOKENS = 384
Q = 'Answer the following question, step by step:\n{q}\nA:'
question = 'What is the meaning of life?'
async def main() -> int:
  parser = argparse.ArgumentParser()
  parser.add_argument('question', nargs='?', default=question)
  if openllm.utils.in_notebook():
    args = parser.parse_args(args=[question])
  else:
    args = parser.parse_args()
  llm = openllm.LLM[t.Any, t.Any]('facebook/opt-2.7b')
  prompt = Q.format(q=args.question)
  logger.info("%s Running with 'generate()' %s", '-' * 50, '-' * 50)
  res = await llm.generate(prompt)
  logger.info('%s Response: %s', '=' * 10, res)
  logger.info("%s Running 'generate()' with per-request arguments %s", '-' * 50, '-' * 50)
  res = await llm.generate(prompt, max_new_tokens=MAX_NEW_TOKENS)
  logger.info('%s Response: %s', '=' * 10, res)
  return 0
def _mp_fn(index: t.Any):  # type: ignore
  # For xla_spawn (TPUs)
  asyncio.run(main())
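
When this script is executed from a shell rather than converted into a notebook, it still needs something to drive the coroutine. A minimal guard, assumed here and not shown in the diff above:

```python
# assumed entry point; the original file may have handled this elsewhere
if __name__ == '__main__':
  raise SystemExit(asyncio.run(main()))
```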


@@ -1,238 +0,0 @@
from __future__ import annotations
import dataclasses
import logging
import os
import sys
import typing as t
import torch
import transformers
import openllm
if t.TYPE_CHECKING:
  import peft
# Make sure to have at least one GPU to run this script
openllm.utils.configure_logging()
logger = logging.getLogger(__name__)
# In a notebook, make sure to install the dependencies first:
# ! pip install -U "openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git"
from functools import partial
from itertools import chain
from random import randint, randrange
import bitsandbytes as bnb
from datasets import load_dataset
# COPIED FROM https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, bnb.nn.Linear4bit):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
  if 'lm_head' in lora_module_names:  # needed for 16-bit
    lora_module_names.remove('lm_head')
  return list(lora_module_names)
# Change this to the local converted path if you don't have access to the meta-llama model
DEFAULT_MODEL_ID = 'meta-llama/Llama-2-7b-hf'
# change this to 'main' if you want to use the latest llama
DEFAULT_MODEL_VERSION = '335a02887eb6684d487240bbc28b5699298c3135'
DATASET_NAME = 'databricks/databricks-dolly-15k'
def format_dolly(sample):
  instruction = f"### Instruction\n{sample['instruction']}"
  context = f"### Context\n{sample['context']}" if len(sample['context']) > 0 else None
  response = f"### Answer\n{sample['response']}"
  # join all the parts together
  prompt = '\n\n'.join([i for i in [instruction, context, response] if i is not None])
  return prompt
# template dataset to add prompt to each sample
def template_dataset(sample, tokenizer):
  sample['text'] = f'{format_dolly(sample)}{tokenizer.eos_token}'
  return sample
# empty buffers to carry the remainder of one batch over to the next
remainder = {'input_ids': [], 'attention_mask': [], 'token_type_ids': []}
def chunk(sample, chunk_length=2048):
  # use the global remainder variable to carry leftover tokens into the next batch
  global remainder
  # Concatenate all texts and add remainder from previous batch
  concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
  concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
  # get total number of tokens for batch
  batch_total_length = len(concatenated_examples[next(iter(sample.keys()))])
  # get max number of chunks for batch
  if batch_total_length >= chunk_length:
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length
  # Split by chunks of max_len.
  result = {
    k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
    for k, t in concatenated_examples.items()
  }
  # add remainder to global variable for next batch
  remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
  # prepare labels
  result['labels'] = result['input_ids'].copy()
  return result
def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
  # Load dataset from the hub
  dataset = load_dataset(dataset_name, split='train')
  print(f'dataset size: {len(dataset)}')
  print(dataset[randrange(len(dataset))])
  # apply the prompt template to each sample
  dataset = dataset.map(partial(template_dataset, tokenizer=tokenizer), remove_columns=list(dataset.features))
  # print a random sample
  print('Sample from dolly-v2 ds:', dataset[randint(0, len(dataset))]['text'])
  # tokenize and chunk the dataset
  lm_dataset = dataset.map(
    lambda sample: tokenizer(sample['text']), batched=True, remove_columns=list(dataset.features)
  ).map(partial(chunk, chunk_length=2048), batched=True)
  # Print total number of samples
  print(f'Total number of samples: {len(lm_dataset)}')
  return lm_dataset
def prepare_for_int4_training(
  model_id: str, model_version: str | None = None, gradient_checkpointing: bool = True, bf16: bool = True
) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
  from peft.tuners.lora import LoraLayer

  llm = openllm.LLM(
    model_id,
    revision=model_version,
    quantize='int4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    use_cache=not gradient_checkpointing,
    device_map='auto',
  )
  print('Model summary:', llm.model)
  # get lora target modules
  modules = find_all_linear_names(llm.model)
  print(f'Found {len(modules)} modules to quantize: {modules}')
  model, tokenizer = llm.prepare_for_training(
    adapter_type='lora', use_gradient_checkpointing=gradient_checkpointing, target_modules=modules
  )
  # pre-process the model by upcasting the layer norms to float32 for more stable training
  for name, module in model.named_modules():
    if isinstance(module, LoraLayer):
      if bf16:
        module = module.to(torch.bfloat16)
    if 'norm' in name:
      module = module.to(torch.float32)
    if 'lm_head' in name or 'embed_tokens' in name:
      if hasattr(module, 'weight'):
        if bf16 and module.weight.dtype == torch.float32:
          module = module.to(torch.bfloat16)
  return model, tokenizer
@dataclasses.dataclass
class TrainingArguments:
  per_device_train_batch_size: int = dataclasses.field(default=1)
  gradient_checkpointing: bool = dataclasses.field(default=True)
  bf16: bool = dataclasses.field(default=torch.cuda.get_device_capability()[0] == 8)
  learning_rate: float = dataclasses.field(default=5e-5)
  num_train_epochs: int = dataclasses.field(default=3)
  logging_steps: int = dataclasses.field(default=1)
  report_to: str = dataclasses.field(default='none')
  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), 'outputs', 'llama'))
  save_strategy: str = dataclasses.field(default='no')

@dataclasses.dataclass
class ModelArguments:
  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
  model_version: str = dataclasses.field(default=DEFAULT_MODEL_VERSION)
  seed: int = dataclasses.field(default=42)
  merge_weights: bool = dataclasses.field(default=False)
if openllm.utils.in_notebook():
  model_args, training_args = ModelArguments(), TrainingArguments()
else:
  parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
  if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
    # If we pass only one argument to the script and it's the path to a json file,
    # let's parse it to get our arguments.
    model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
  else:
    model_args, training_args = t.cast(
      t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses()
    )
# import the model ahead of time
openllm.import_model(model_id=model_args.model_id, model_version=model_args.model_version)
def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
  import peft

  transformers.set_seed(model_args.seed)
  model, tokenizer = prepare_for_int4_training(
    model_args.model_id, gradient_checkpointing=training_args.gradient_checkpointing, bf16=training_args.bf16
  )
  datasets = prepare_datasets(tokenizer)
  trainer = transformers.Trainer(
    model=model,
    args=dataclasses.replace(
      transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)
    ),
    train_dataset=datasets,
    data_collator=transformers.default_data_collator,
  )
  trainer.train()
  if model_args.merge_weights:
    # note that this requires a larger GPU, as the whole model will be loaded into memory
    # merge the adapter weights with the base model and save
    # save the int4 model
    trainer.model.save_pretrained(training_args.output_dir, safe_serialization=False)
    # free memory
    del model, trainer
    torch.cuda.empty_cache()
    model = peft.AutoPeftModelForCausalLM.from_pretrained(
      training_args.output_dir, low_cpu_mem_usage=True, torch_dtype=torch.float16
    )
    # merge lora with base weights and save
    model = model.merge_and_unload()
    model.save_pretrained(
      os.path.join(os.getcwd(), 'outputs', 'merged_llama_lora'), safe_serialization=True, max_shard_size='2GB'
    )
  else:
    trainer.model.save_pretrained(os.path.join(training_args.output_dir, 'lora'))

train_loop(model_args, training_args)
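
When `merge_weights` is enabled, the checkpoint written to `outputs/merged_llama_lora` is a plain Transformers model, so it can be reloaded for inference without PEFT. A hedged sketch (prompt and generation settings are illustrative; the tokenizer is reloaded from the base model id because it is not saved with the merged weights):

```python
import os

import torch
import transformers

merged_dir = os.path.join(os.getcwd(), 'outputs', 'merged_llama_lora')
# the merged directory only contains model weights, so reuse the base model's tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')
model = transformers.AutoModelForCausalLM.from_pretrained(merged_dir, torch_dtype=torch.float16, device_map='auto')

prompt = '### Instruction\nExplain what LoRA adapters are.\n\n### Answer\n'
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```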


@@ -1,81 +0,0 @@
from __future__ import annotations
import dataclasses
import logging
import os
import sys
import typing as t
import transformers
import openllm
# Make sure to have at least one GPU to run this script
openllm.utils.configure_logging()
logger = logging.getLogger(__name__)
# In a notebook, make sure to install the dependencies first:
# ! pip install -U "openllm[fine-tune] @ git+https://github.com/bentoml/OpenLLM.git"
from datasets import load_dataset
if t.TYPE_CHECKING:
  from peft import PeftModel
DEFAULT_MODEL_ID = 'facebook/opt-6.7b'
def load_trainer(
  model: PeftModel, tokenizer: transformers.GPT2TokenizerFast, dataset_dict: t.Any, training_args: TrainingArguments
):
  return transformers.Trainer(
    model=model,
    train_dataset=dataset_dict['train'],
    args=dataclasses.replace(
      transformers.TrainingArguments(training_args.output_dir), **dataclasses.asdict(training_args)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
  )
@dataclasses.dataclass
class TrainingArguments:
  per_device_train_batch_size: int = dataclasses.field(default=4)
  gradient_accumulation_steps: int = dataclasses.field(default=4)
  warmup_steps: int = dataclasses.field(default=10)
  max_steps: int = dataclasses.field(default=50)
  learning_rate: float = dataclasses.field(default=3e-4)
  fp16: bool = dataclasses.field(default=True)
  logging_steps: int = dataclasses.field(default=1)
  output_dir: str = dataclasses.field(default=os.path.join(os.getcwd(), 'outputs', 'opt'))

@dataclasses.dataclass
class ModelArguments:
  model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
  # If we pass only one argument to the script and it's the path to a json file,
  # let's parse it to get our arguments.
  model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
  model_args, training_args = t.cast(t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses())
llm = openllm.LLM(model_args.model_id, quantize='int8')
model, tokenizer = llm.prepare_for_training(
  adapter_type='lora', r=16, lora_alpha=32, target_modules=['q_proj', 'v_proj'], lora_dropout=0.05, bias='none'
)
# fine-tune on the english_quotes dataset
data = load_dataset('Abirate/english_quotes')
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)
trainer = load_trainer(model, tokenizer, data, training_args)
model.config.use_cache = False  # silences warnings; re-enable for inference later
trainer.train()
trainer.model.save_pretrained(os.path.join(training_args.output_dir, 'lora'))
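
Only the LoRA adapter is saved under `outputs/opt/lora`, so inference would typically reattach it to the base OPT model with PEFT. A rough sketch under that assumption (paths and the prompt are illustrative):

```python
import os

import peft
import transformers

base_id = 'facebook/opt-6.7b'
adapter_dir = os.path.join(os.getcwd(), 'outputs', 'opt', 'lora')

tokenizer = transformers.AutoTokenizer.from_pretrained(base_id)
base = transformers.AutoModelForCausalLM.from_pretrained(base_id, device_map='auto')
model = peft.PeftModel.from_pretrained(base, adapter_dir)  # attach the trained adapter
model.eval()

inputs = tokenizer('An inspirational quote: ', return_tensors='pt').to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))
```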