add more versions of llama

2026-08-01 10:38:50 -04:00 · 2024-06-04 19:54:26 +08:00
parent 47fefe30ed
commit 9d667bb46a
2 changed files with 43 additions and 3 deletions
--- a/recipe.yaml
+++ b/recipe.yaml
@@ -1,7 +1,6 @@
 "phi3:3.8b-mini-instruct-4k-fp16":
  alias:
    - 3.8b
-    - mini
  project: vllm-chat
  service_config:
    name: phi3
@@ -30,6 +29,36 @@
    model: meta-llama/Llama-2-7b-chat-hf
    max_model_len: 1024
  chat_template: llama-2-chat
+"llama2:13b-chat-fp16":  # recipe entry for the Llama-2 13B chat model
+  alias:
+    - 13b  # short name for this entry
+  project: vllm-chat
+  service_config:
+    name: llama2
+    traffic:
+      timeout: 300  # presumably seconds; confirm against the consumer's schema
+    resources:
+      gpu: 1
+      gpu_type: nvidia-tesla-a100
+  engine_config:
+    model: meta-llama/Llama-2-13b-chat-hf  # no quantization key is set for this entry
+    max_model_len: 1024
+  chat_template: llama-2-chat
+"llama2:70b-chat-fp16":  # recipe entry for the Llama-2 70B chat model
+  alias:
+    - 70b  # short name for this entry
+  project: vllm-chat
+  service_config:
+    name: llama2
+    traffic:
+      timeout: 300  # presumably seconds; confirm against the consumer's schema
+    resources:
+      gpu: 2  # NOTE(review): engine_config has no parallelism option; confirm both GPUs are used
+      gpu_type: nvidia-a100-80g
+  engine_config:
+    model: meta-llama/Llama-2-70b-chat-hf  # no quantization key is set for this entry
+    max_model_len: 1024
+  chat_template: llama-2-chat
 "llama2:7b-chat-awq-4bit":
  alias:
    - 7b-4bit
@@ -68,7 +97,6 @@
 "mistral:7b-instruct-fp16":
  alias:
    - 7b
-    - 7b-instruct
  project: vllm-chat
  service_config:
    name: mistral
@@ -110,7 +138,7 @@
      gpu: 1
      gpu_type: nvidia-a100-80g
  engine_config:
-    model: meta-llama/Meta-Llama-3-8B
+    model: casperhansen/llama-3-70b-instruct-awq
    max_model_len: 2048
    quantization: awq
 "llama3:8b-instruct-fp16":
--- a/vllm-chat/.gitignore
+++ b/vllm-chat/.gitignore
@@ -0,0 +1,12 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Environments
+venv/
+
+# BentoML
+bentoml/client_id
+
+chattts/ChatTTS/