mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-20 14:46:38 -04:00
Compare commits
2 Commits
llama_cpp/
...
gosec_fix
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
63c5d843b6 | ||
|
|
a9b0e264f2 |
2
.github/workflows/secscan.yaml
vendored
2
.github/workflows/secscan.yaml
vendored
@@ -18,7 +18,7 @@ jobs:
|
|||||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||||
- name: Run Gosec Security Scanner
|
- name: Run Gosec Security Scanner
|
||||||
if: ${{ github.actor != 'dependabot[bot]' }}
|
if: ${{ github.actor != 'dependabot[bot]' }}
|
||||||
uses: securego/gosec@master
|
uses: securego/gosec@v2.21.0
|
||||||
with:
|
with:
|
||||||
# we let the report trigger content trigger a failure using the GitHub Security features.
|
# we let the report trigger content trigger a failure using the GitHub Security features.
|
||||||
args: '-no-fail -fmt sarif -out results.sarif ./...'
|
args: '-no-fail -fmt sarif -out results.sarif ./...'
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ ARG TARGETARCH
|
|||||||
ARG TARGETVARIANT
|
ARG TARGETVARIANT
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
ENV DEBIAN_FRONTEND=noninteractive
|
||||||
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,exllama:/build/backend/python/exllama/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,huggingface-embeddings:/build/backend/python/sentencetransformers/run.sh,transformers:/build/backend/python/transformers/run.sh,sentencetransformers:/build/backend/python/sentencetransformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,openvoice:/build/backend/python/openvoice/run.sh,vall-e-x:/build/backend/python/vall-e-x/run.sh,vllm:/build/backend/python/vllm/run.sh,mamba:/build/backend/python/mamba/run.sh,exllama2:/build/backend/python/exllama2/run.sh,transformers-musicgen:/build/backend/python/transformers-musicgen/run.sh,parler-tts:/build/backend/python/parler-tts/run.sh"
|
||||||
|
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
@@ -418,9 +418,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "coqui" || -z "${EXTRA_BACKENDS}" ) && "$IMAG
|
|||||||
; fi && \
|
; fi && \
|
||||||
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
if [[ ( "${EXTRA_BACKENDS}" =~ "transformers-musicgen" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||||
make -C backend/python/transformers-musicgen \
|
make -C backend/python/transformers-musicgen \
|
||||||
; fi && \
|
|
||||||
if [[ ( "${EXTRA_BACKENDS}" =~ "exllama1" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
|
||||||
make -C backend/python/exllama \
|
|
||||||
; fi
|
; fi
|
||||||
|
|
||||||
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vall-e-x" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
|
||||||
|
|||||||
13
Makefile
13
Makefile
@@ -534,10 +534,10 @@ protogen-go-clean:
|
|||||||
$(RM) bin/*
|
$(RM) bin/*
|
||||||
|
|
||||||
.PHONY: protogen-python
|
.PHONY: protogen-python
|
||||||
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
|
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen mamba-protogen rerankers-protogen sentencetransformers-protogen transformers-protogen parler-tts-protogen transformers-musicgen-protogen vall-e-x-protogen vllm-protogen openvoice-protogen
|
||||||
|
|
||||||
.PHONY: protogen-python-clean
|
.PHONY: protogen-python-clean
|
||||||
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
|
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean mamba-protogen-clean sentencetransformers-protogen-clean rerankers-protogen-clean transformers-protogen-clean transformers-musicgen-protogen-clean parler-tts-protogen-clean vall-e-x-protogen-clean vllm-protogen-clean openvoice-protogen-clean
|
||||||
|
|
||||||
.PHONY: autogptq-protogen
|
.PHONY: autogptq-protogen
|
||||||
autogptq-protogen:
|
autogptq-protogen:
|
||||||
@@ -571,14 +571,6 @@ diffusers-protogen:
|
|||||||
diffusers-protogen-clean:
|
diffusers-protogen-clean:
|
||||||
$(MAKE) -C backend/python/diffusers protogen-clean
|
$(MAKE) -C backend/python/diffusers protogen-clean
|
||||||
|
|
||||||
.PHONY: exllama-protogen
|
|
||||||
exllama-protogen:
|
|
||||||
$(MAKE) -C backend/python/exllama protogen
|
|
||||||
|
|
||||||
.PHONY: exllama-protogen-clean
|
|
||||||
exllama-protogen-clean:
|
|
||||||
$(MAKE) -C backend/python/exllama protogen-clean
|
|
||||||
|
|
||||||
.PHONY: exllama2-protogen
|
.PHONY: exllama2-protogen
|
||||||
exllama2-protogen:
|
exllama2-protogen:
|
||||||
$(MAKE) -C backend/python/exllama2 protogen
|
$(MAKE) -C backend/python/exllama2 protogen
|
||||||
@@ -675,7 +667,6 @@ prepare-extra-conda-environments: protogen-python
|
|||||||
$(MAKE) -C backend/python/parler-tts
|
$(MAKE) -C backend/python/parler-tts
|
||||||
$(MAKE) -C backend/python/vall-e-x
|
$(MAKE) -C backend/python/vall-e-x
|
||||||
$(MAKE) -C backend/python/openvoice
|
$(MAKE) -C backend/python/openvoice
|
||||||
$(MAKE) -C backend/python/exllama
|
|
||||||
$(MAKE) -C backend/python/exllama2
|
$(MAKE) -C backend/python/exllama2
|
||||||
|
|
||||||
prepare-test-extra: protogen-python
|
prepare-test-extra: protogen-python
|
||||||
|
|||||||
1
backend/python/exllama/.gitignore
vendored
1
backend/python/exllama/.gitignore
vendored
@@ -1 +0,0 @@
|
|||||||
source
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
export CONDA_ENV_PATH = "exllama.yml"
|
|
||||||
|
|
||||||
.PHONY: exllama
|
|
||||||
exllama: protogen
|
|
||||||
bash install.sh ${CONDA_ENV_PATH}
|
|
||||||
|
|
||||||
.PHONY: run
|
|
||||||
run: protogen
|
|
||||||
@echo "Running exllama..."
|
|
||||||
bash run.sh
|
|
||||||
@echo "exllama run."
|
|
||||||
|
|
||||||
.PHONY: protogen
|
|
||||||
protogen: backend_pb2_grpc.py backend_pb2.py
|
|
||||||
|
|
||||||
.PHONY: protogen-clean
|
|
||||||
protogen-clean:
|
|
||||||
$(RM) backend_pb2_grpc.py backend_pb2.py
|
|
||||||
|
|
||||||
backend_pb2_grpc.py backend_pb2.py:
|
|
||||||
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
|
|
||||||
|
|
||||||
.PHONY: clean
|
|
||||||
clean: protogen-clean
|
|
||||||
$(RM) -r venv source __pycache__
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
# Creating a separate environment for the exllama project
|
|
||||||
|
|
||||||
```
|
|
||||||
make exllama
|
|
||||||
```
|
|
||||||
@@ -1,159 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
import grpc
|
|
||||||
from concurrent import futures
|
|
||||||
import time
|
|
||||||
import backend_pb2
|
|
||||||
import backend_pb2_grpc
|
|
||||||
import argparse
|
|
||||||
import signal
|
|
||||||
import sys
|
|
||||||
import os, glob
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
import torch
|
|
||||||
import torch.nn.functional as F
|
|
||||||
from torch import version as torch_version
|
|
||||||
|
|
||||||
from source.tokenizer import ExLlamaTokenizer
|
|
||||||
from source.generator import ExLlamaGenerator
|
|
||||||
from source.model import ExLlama, ExLlamaCache, ExLlamaConfig
|
|
||||||
|
|
||||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
|
||||||
|
|
||||||
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
|
|
||||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
|
||||||
|
|
||||||
# Implement the BackendServicer class with the service methods
|
|
||||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
|
|
||||||
def generate(self,prompt, max_new_tokens):
|
|
||||||
self.generator.end_beam_search()
|
|
||||||
|
|
||||||
# Tokenizing the input
|
|
||||||
ids = self.generator.tokenizer.encode(prompt)
|
|
||||||
|
|
||||||
self.generator.gen_begin_reuse(ids)
|
|
||||||
initial_len = self.generator.sequence[0].shape[0]
|
|
||||||
has_leading_space = False
|
|
||||||
decoded_text = ''
|
|
||||||
for i in range(max_new_tokens):
|
|
||||||
token = self.generator.gen_single_token()
|
|
||||||
if i == 0 and self.generator.tokenizer.tokenizer.IdToPiece(int(token)).startswith('▁'):
|
|
||||||
has_leading_space = True
|
|
||||||
|
|
||||||
decoded_text = self.generator.tokenizer.decode(self.generator.sequence[0][initial_len:])
|
|
||||||
if has_leading_space:
|
|
||||||
decoded_text = ' ' + decoded_text
|
|
||||||
|
|
||||||
if token.item() == self.generator.tokenizer.eos_token_id:
|
|
||||||
break
|
|
||||||
return decoded_text
|
|
||||||
def Health(self, request, context):
|
|
||||||
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
|
|
||||||
def LoadModel(self, request, context):
|
|
||||||
try:
|
|
||||||
# https://github.com/turboderp/exllama/blob/master/example_cfg.py
|
|
||||||
model_directory = request.ModelFile
|
|
||||||
|
|
||||||
# Locate files we need within that directory
|
|
||||||
tokenizer_path = os.path.join(model_directory, "tokenizer.model")
|
|
||||||
model_config_path = os.path.join(model_directory, "config.json")
|
|
||||||
st_pattern = os.path.join(model_directory, "*.safetensors")
|
|
||||||
model_path = glob.glob(st_pattern)[0]
|
|
||||||
|
|
||||||
# Create config, model, tokenizer and generator
|
|
||||||
|
|
||||||
config = ExLlamaConfig(model_config_path) # create config from config.json
|
|
||||||
config.model_path = model_path # supply path to model weights file
|
|
||||||
if (request.ContextSize):
|
|
||||||
config.max_seq_len = request.ContextSize # override max sequence length
|
|
||||||
config.max_attention_size = request.ContextSize**2 # Should be set to context_size^2.
|
|
||||||
# https://github.com/turboderp/exllama/issues/220#issuecomment-1720324163
|
|
||||||
|
|
||||||
# Set Rope scaling.
|
|
||||||
if (request.RopeFreqScale):
|
|
||||||
# Alpha value for Rope scaling.
|
|
||||||
# Higher value increases context but adds perplexity.
|
|
||||||
# alpha_value and compress_pos_emb are mutually exclusive.
|
|
||||||
# https://github.com/turboderp/exllama/issues/115
|
|
||||||
config.alpha_value = request.RopeFreqScale
|
|
||||||
config.calculate_rotary_embedding_base()
|
|
||||||
|
|
||||||
model = ExLlama(config) # create ExLlama instance and load the weights
|
|
||||||
tokenizer = ExLlamaTokenizer(tokenizer_path) # create tokenizer from tokenizer model file
|
|
||||||
|
|
||||||
cache = ExLlamaCache(model, batch_size = 2) # create cache for inference
|
|
||||||
generator = ExLlamaGenerator(model, tokenizer, cache) # create generator
|
|
||||||
|
|
||||||
self.generator= generator
|
|
||||||
self.model = model
|
|
||||||
self.tokenizer = tokenizer
|
|
||||||
self.cache = cache
|
|
||||||
except Exception as err:
|
|
||||||
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
|
|
||||||
return backend_pb2.Result(message="Model loaded successfully", success=True)
|
|
||||||
|
|
||||||
def Predict(self, request, context):
|
|
||||||
penalty = 1.15
|
|
||||||
if request.Penalty != 0.0:
|
|
||||||
penalty = request.Penalty
|
|
||||||
self.generator.settings.token_repetition_penalty_max = penalty
|
|
||||||
self.generator.settings.temperature = request.Temperature
|
|
||||||
self.generator.settings.top_k = request.TopK
|
|
||||||
self.generator.settings.top_p = request.TopP
|
|
||||||
|
|
||||||
tokens = 512
|
|
||||||
if request.Tokens != 0:
|
|
||||||
tokens = request.Tokens
|
|
||||||
|
|
||||||
if self.cache.batch_size == 1:
|
|
||||||
del self.cache
|
|
||||||
self.cache = ExLlamaCache(self.model, batch_size=2)
|
|
||||||
self.generator = ExLlamaGenerator(self.model, self.tokenizer, self.cache)
|
|
||||||
|
|
||||||
t = self.generate(request.Prompt, tokens)
|
|
||||||
|
|
||||||
# Remove prompt from response if present
|
|
||||||
if request.Prompt in t:
|
|
||||||
t = t.replace(request.Prompt, "")
|
|
||||||
|
|
||||||
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
|
|
||||||
|
|
||||||
def PredictStream(self, request, context):
|
|
||||||
# Implement PredictStream RPC
|
|
||||||
#for reply in some_data_generator():
|
|
||||||
# yield reply
|
|
||||||
# Not implemented yet
|
|
||||||
return self.Predict(request, context)
|
|
||||||
|
|
||||||
|
|
||||||
def serve(address):
|
|
||||||
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
|
|
||||||
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
|
|
||||||
server.add_insecure_port(address)
|
|
||||||
server.start()
|
|
||||||
print("Server started. Listening on: " + address, file=sys.stderr)
|
|
||||||
|
|
||||||
# Define the signal handler function
|
|
||||||
def signal_handler(sig, frame):
|
|
||||||
print("Received termination signal. Shutting down...")
|
|
||||||
server.stop(0)
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
# Set the signal handlers for SIGINT and SIGTERM
|
|
||||||
signal.signal(signal.SIGINT, signal_handler)
|
|
||||||
signal.signal(signal.SIGTERM, signal_handler)
|
|
||||||
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
time.sleep(_ONE_DAY_IN_SECONDS)
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
server.stop(0)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
parser = argparse.ArgumentParser(description="Run the gRPC server.")
|
|
||||||
parser.add_argument(
|
|
||||||
"--addr", default="localhost:50051", help="The address to bind the server to."
|
|
||||||
)
|
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
serve(args.addr)
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
LIMIT_TARGETS="cublas"
|
|
||||||
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
installRequirements
|
|
||||||
|
|
||||||
git clone https://github.com/turboderp/exllama $MY_DIR/source
|
|
||||||
uv pip install ${BUILD_ISOLATION_FLAG} --requirement ${MY_DIR}/source/requirements.txt
|
|
||||||
|
|
||||||
cp -v ./*py $MY_DIR/source/
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
transformers
|
|
||||||
accelerate
|
|
||||||
torch
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
|
||||||
torch
|
|
||||||
transformers
|
|
||||||
accelerate
|
|
||||||
@@ -1,3 +0,0 @@
|
|||||||
torch
|
|
||||||
transformers
|
|
||||||
accelerate
|
|
||||||
@@ -1,4 +0,0 @@
|
|||||||
grpcio==1.66.1
|
|
||||||
protobuf
|
|
||||||
certifi
|
|
||||||
setuptools
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
LIMIT_TARGETS="cublas"
|
|
||||||
BACKEND_FILE="${MY_DIR}/source/backend.py"
|
|
||||||
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
startBackend $@
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
source $(dirname $0)/../common/libbackend.sh
|
|
||||||
|
|
||||||
runUnittests
|
|
||||||
Reference in New Issue
Block a user