diff --git a/.github/actions/create_release_and_archive.sh b/.github/actions/create_release_and_archive.sh
index ae98aa4a..3829b07a 100755
--- a/.github/actions/create_release_and_archive.sh
+++ b/.github/actions/create_release_and_archive.sh
@@ -37,7 +37,7 @@ pip install --upgrade openllm==${TAG}
 
 ## Usage
 
-All available models: \`\`\`python -m openllm.models\`\`\`
+All available models: \`\`\`openllm models\`\`\`
 
 To start a LLM: \`\`\`python -m openllm start opt\`\`\`
 
diff --git a/src/openllm/_llm.py b/src/openllm/_llm.py
index 362fb26b..f7da19e9 100644
--- a/src/openllm/_llm.py
+++ b/src/openllm/_llm.py
@@ -58,7 +58,6 @@ from .utils import is_peft_available
 from .utils import is_torch_available
 from .utils import non_intrusive_setattr
 from .utils import normalize_attrs_to_model_tokenizer_pair
-from .utils import pkg
 from .utils import requires_dependencies
 from .utils import resolve_filepath
 from .utils import validate_is_path
@@ -1144,10 +1143,7 @@ class LLM(LLMInterface[M, T], ReprMixin):
         use_gradient_checkpointing: bool = True,
         **attrs: t.Any,
     ) -> tuple[peft.PeftModel, T]:
-        if pkg.pkg_version_info("peft")[:2] >= (0, 4):
-            from peft import prepare_model_for_kbit_training
-        else:
-            from peft import prepare_model_for_int8_training as prepare_model_for_kbit_training
+        from peft import prepare_model_for_kbit_training
 
         peft_config = (
             self.config["fine_tune_strategies"]
diff --git a/src/openllm/playground/llama2_qlora.py b/src/openllm/playground/llama2_qlora.py
index e0367c96..5f51c9a9 100644
--- a/src/openllm/playground/llama2_qlora.py
+++ b/src/openllm/playground/llama2_qlora.py
@@ -45,6 +45,8 @@ def find_all_linear_names(model):
 
 # Change this to the local converted path if you don't have access to the meta-llama model
 DEFAULT_MODEL_ID = "meta-llama/Llama-2-7b-hf"
+# change this to 'main' if you want to use the latest llama
+DEFAULT_MODEL_VERSION = "335a02887eb6684d487240bbc28b5699298c3135"
 
 DATASET_NAME = "databricks/databricks-dolly-15k"
 
@@ -119,13 +121,17 @@ def prepare_datasets(tokenizer, dataset_name=DATASET_NAME):
 
 @openllm.utils.requires_dependencies("peft", extra="fine-tune")
 def prepare_for_int4_training(
-    model_id: str, gradient_checkpointing: bool = True, bf16: bool = True
+    model_id: str,
+    model_version: str | None = None,
+    gradient_checkpointing: bool = True,
+    bf16: bool = True,
 ) -> tuple[peft.PeftModel, transformers.LlamaTokenizerFast]:
     from peft.tuners.lora import LoraLayer
 
     llm = openllm.AutoLLM.for_model(
         "llama",
         model_id=model_id,
+        model_version=model_version,
         ensure_available=True,
         quantize="int4",
         bnb_4bit_compute_dtype=torch.bfloat16,
@@ -138,7 +144,9 @@ def prepare_for_int4_training(
     modules = find_all_linear_names(llm.model)
     print(f"Found {len(modules)} modules to quantize: {modules}")
 
-    model, tokenizer = llm.prepare_for_training(adapter_type="lora", use_gradient_checkpointing=gradient_checkpointing)
+    model, tokenizer = llm.prepare_for_training(
+        adapter_type="lora", use_gradient_checkpointing=gradient_checkpointing, target_modules=modules
+    )
 
     # pre-process the model by upcasting the layer norms in float 32 for
     for name, module in model.named_modules():
@@ -170,23 +178,27 @@ class TrainingArguments:
 @dataclasses.dataclass
 class ModelArguments:
     model_id: str = dataclasses.field(default=DEFAULT_MODEL_ID)
+    model_version: str = dataclasses.field(default=DEFAULT_MODEL_VERSION)
     seed: int = dataclasses.field(default=42)
     merge_weights: bool = dataclasses.field(default=False)
 
 
-parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
-if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
-    # If we pass only one argument to the script and it's the path to a json file,
-    # let's parse it to get our arguments.
-    model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+if openllm.utils.in_notebook():
+    model_args, training_args = ModelArguments(), TrainingArguments()
 else:
-    model_args, training_args = t.cast(
-        t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses()
-    )
+    parser = transformers.HfArgumentParser((ModelArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, training_args = t.cast(
+            t.Tuple[ModelArguments, TrainingArguments], parser.parse_args_into_dataclasses()
+        )
 
 
 # import the model first hand
-openllm.import_model("llama", model_id=model_args.model_id)
+openllm.import_model("llama", model_id=model_args.model_id, model_version=model_args.model_version)
 
 
 def train_loop(model_args: ModelArguments, training_args: TrainingArguments):
diff --git a/src/openllm/serialisation/transformers.py b/src/openllm/serialisation/transformers.py
index d85526db..158e42b4 100644
--- a/src/openllm/serialisation/transformers.py
+++ b/src/openllm/serialisation/transformers.py
@@ -166,6 +166,10 @@ def import_model(
         metadata["_framework"] = model.model.framework
         signatures["generate"] = {"batchable": False}
     else:
+        if "quantization_config" in attrs and getattr(attrs["quantization_config"], "load_in_4bit", False):
+            # this model might be called with --quantize int4, therefore we need to pop this out
+            # since saving int4 is not yet supported
+            attrs.pop("quantization_config")
         model = t.cast(
             "_transformers.PreTrainedModel",
             infer_autoclass_from_llm_config(llm, config).from_pretrained(