Files
LocalAI/backend/backend.proto
Ettore Di Giacinto 59108fbe32 feat: add distributed mode (#9124)
* feat: add distributed mode (experimental)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix data races, mutexes, transactions

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactorings

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix events and tool stream in agent chat

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* use ginkgo

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(cron): compute correctly time boundaries avoiding re-triggering

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* enhancements, refactorings

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* do not flood of healthy checks

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* do not list obvious backends as text backends

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* tests fixups

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* refactoring and consolidation

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Drop redundant healthcheck

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* enhancements, refactorings

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-03-30 00:47:27 +02:00

681 lines
19 KiB
Protocol Buffer

syntax = "proto3";
option go_package = "github.com/go-skynet/LocalAI/pkg/grpc/proto";
option java_multiple_files = true;
option java_package = "io.skynet.localai.backend";
option java_outer_classname = "LocalAIBackend";
package backend;
service Backend {
rpc Health(HealthMessage) returns (Reply) {}
rpc Free(HealthMessage) returns (Result) {}
rpc Predict(PredictOptions) returns (Reply) {}
rpc LoadModel(ModelOptions) returns (Result) {}
rpc PredictStream(PredictOptions) returns (stream Reply) {}
rpc Embedding(PredictOptions) returns (EmbeddingResult) {}
rpc GenerateImage(GenerateImageRequest) returns (Result) {}
rpc GenerateVideo(GenerateVideoRequest) returns (Result) {}
rpc AudioTranscription(TranscriptRequest) returns (TranscriptResult) {}
rpc TTS(TTSRequest) returns (Result) {}
rpc TTSStream(TTSRequest) returns (stream Reply) {}
rpc SoundGeneration(SoundGenerationRequest) returns (Result) {}
rpc TokenizeString(PredictOptions) returns (TokenizationResponse) {}
rpc Status(HealthMessage) returns (StatusResponse) {}
rpc Detect(DetectOptions) returns (DetectResponse) {}
rpc StoresSet(StoresSetOptions) returns (Result) {}
rpc StoresDelete(StoresDeleteOptions) returns (Result) {}
rpc StoresGet(StoresGetOptions) returns (StoresGetResult) {}
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}
rpc Rerank(RerankRequest) returns (RerankResult) {}
rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
rpc VAD(VADRequest) returns (VADResponse) {}
rpc AudioEncode(AudioEncodeRequest) returns (AudioEncodeResult) {}
rpc AudioDecode(AudioDecodeRequest) returns (AudioDecodeResult) {}
rpc ModelMetadata(ModelOptions) returns (ModelMetadataResponse) {}
// Fine-tuning RPCs
rpc StartFineTune(FineTuneRequest) returns (FineTuneJobResult) {}
rpc FineTuneProgress(FineTuneProgressRequest) returns (stream FineTuneProgressUpdate) {}
rpc StopFineTune(FineTuneStopRequest) returns (Result) {}
rpc ListCheckpoints(ListCheckpointsRequest) returns (ListCheckpointsResponse) {}
rpc ExportModel(ExportModelRequest) returns (Result) {}
// Quantization RPCs
rpc StartQuantization(QuantizationRequest) returns (QuantizationJobResult) {}
rpc QuantizationProgress(QuantizationProgressRequest) returns (stream QuantizationProgressUpdate) {}
rpc StopQuantization(QuantizationStopRequest) returns (Result) {}
}
// Define the empty request
message MetricsRequest {}
message MetricsResponse {
int32 slot_id = 1;
string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
float tokens_per_second = 3;
int32 tokens_generated = 4;
int32 prompt_tokens_processed = 5;
}
message RerankRequest {
string query = 1;
repeated string documents = 2;
int32 top_n = 3;
}
message RerankResult {
Usage usage = 1;
repeated DocumentResult results = 2;
}
message Usage {
int32 total_tokens = 1;
int32 prompt_tokens = 2;
}
message DocumentResult {
int32 index = 1;
string text = 2;
float relevance_score = 3;
}
message StoresKey {
repeated float Floats = 1;
}
message StoresValue {
bytes Bytes = 1;
}
message StoresSetOptions {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
}
message StoresDeleteOptions {
repeated StoresKey Keys = 1;
}
message StoresGetOptions {
repeated StoresKey Keys = 1;
}
message StoresGetResult {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
}
message StoresFindOptions {
StoresKey Key = 1;
int32 TopK = 2;
}
message StoresFindResult {
repeated StoresKey Keys = 1;
repeated StoresValue Values = 2;
repeated float Similarities = 3;
}
message HealthMessage {}
// The request message containing the user's name.
message PredictOptions {
string Prompt = 1;
int32 Seed = 2;
int32 Threads = 3;
int32 Tokens = 4;
int32 TopK = 5;
int32 Repeat = 6;
int32 Batch = 7;
int32 NKeep = 8;
float Temperature = 9;
float Penalty = 10;
bool F16KV = 11;
bool DebugMode = 12;
repeated string StopPrompts = 13;
bool IgnoreEOS = 14;
float TailFreeSamplingZ = 15;
float TypicalP = 16;
float FrequencyPenalty = 17;
float PresencePenalty = 18;
int32 Mirostat = 19;
float MirostatETA = 20;
float MirostatTAU = 21;
bool PenalizeNL = 22;
string LogitBias = 23;
bool MLock = 25;
bool MMap = 26;
bool PromptCacheAll = 27;
bool PromptCacheRO = 28;
string Grammar = 29;
string MainGPU = 30;
string TensorSplit = 31;
float TopP = 32;
string PromptCachePath = 33;
bool Debug = 34;
repeated int32 EmbeddingTokens = 35;
string Embeddings = 36;
float RopeFreqBase = 37;
float RopeFreqScale = 38;
float NegativePromptScale = 39;
string NegativePrompt = 40;
int32 NDraft = 41;
repeated string Images = 42;
bool UseTokenizerTemplate = 43;
repeated Message Messages = 44;
repeated string Videos = 45;
repeated string Audios = 46;
string CorrelationId = 47;
string Tools = 48; // JSON array of available tools/functions for tool calling
string ToolChoice = 49; // JSON string or object specifying tool choice behavior
int32 Logprobs = 50; // Number of top logprobs to return (maps to OpenAI logprobs parameter)
int32 TopLogprobs = 51; // Number of top logprobs to return per token (maps to OpenAI top_logprobs parameter)
map<string, string> Metadata = 52; // Generic per-request metadata (e.g., enable_thinking)
float MinP = 53; // Minimum probability sampling threshold (0.0 = disabled)
}
// ToolCallDelta represents an incremental tool call update from the C++ parser.
// Used for both streaming (partial diffs) and non-streaming (final tool calls).
message ToolCallDelta {
int32 index = 1; // tool call index (0-based)
string id = 2; // tool call ID (e.g., "call_abc123")
string name = 3; // function name (set on first appearance)
string arguments = 4; // arguments chunk (incremental in streaming, full in non-streaming)
}
// ChatDelta represents incremental content/reasoning/tool_call updates parsed by the C++ backend.
message ChatDelta {
string content = 1; // content text delta
string reasoning_content = 2; // reasoning/thinking text delta
repeated ToolCallDelta tool_calls = 3; // tool call deltas
}
// The response message containing the result
message Reply {
bytes message = 1;
int32 tokens = 2;
int32 prompt_tokens = 3;
double timing_prompt_processing = 4;
double timing_token_generation = 5;
bytes audio = 6;
bytes logprobs = 7; // JSON-encoded logprobs data matching OpenAI format
repeated ChatDelta chat_deltas = 8; // Parsed chat deltas from C++ autoparser (streaming + non-streaming)
}
message GrammarTrigger {
string word = 1;
}
message ModelOptions {
string Model = 1;
int32 ContextSize = 2;
int32 Seed = 3;
int32 NBatch = 4;
bool F16Memory = 5;
bool MLock = 6;
bool MMap = 7;
bool VocabOnly = 8;
bool LowVRAM = 9;
bool Embeddings = 10;
bool NUMA = 11;
int32 NGPULayers = 12;
string MainGPU = 13;
string TensorSplit = 14;
int32 Threads = 15;
float RopeFreqBase = 17;
float RopeFreqScale = 18;
float RMSNormEps = 19;
int32 NGQA = 20;
string ModelFile = 21;
// Diffusers
string PipelineType = 26;
string SchedulerType = 27;
bool CUDA = 28;
float CFGScale = 29;
bool IMG2IMG = 30;
string CLIPModel = 31;
string CLIPSubfolder = 32;
int32 CLIPSkip = 33;
string ControlNet = 48;
string Tokenizer = 34;
// LLM (llama.cpp)
string LoraBase = 35;
string LoraAdapter = 36;
float LoraScale = 42;
bool NoMulMatQ = 37;
string DraftModel = 39;
string AudioPath = 38;
// vllm
string Quantization = 40;
float GPUMemoryUtilization = 50;
bool TrustRemoteCode = 51;
bool EnforceEager = 52;
int32 SwapSpace = 53;
int32 MaxModelLen = 54;
int32 TensorParallelSize = 55;
string LoadFormat = 58;
bool DisableLogStatus = 66;
string DType = 67;
int32 LimitImagePerPrompt = 68;
int32 LimitVideoPerPrompt = 69;
int32 LimitAudioPerPrompt = 70;
string MMProj = 41;
string RopeScaling = 43;
float YarnExtFactor = 44;
float YarnAttnFactor = 45;
float YarnBetaFast = 46;
float YarnBetaSlow = 47;
string Type = 49;
string FlashAttention = 56;
bool NoKVOffload = 57;
string ModelPath = 59;
repeated string LoraAdapters = 60;
repeated float LoraScales = 61;
repeated string Options = 62;
string CacheTypeKey = 63;
string CacheTypeValue = 64;
repeated GrammarTrigger GrammarTriggers = 65;
bool Reranking = 71;
repeated string Overrides = 72;
}
message Result {
string message = 1;
bool success = 2;
}
message EmbeddingResult {
repeated float embeddings = 1;
}
message TranscriptRequest {
string dst = 2;
string language = 3;
uint32 threads = 4;
bool translate = 5;
bool diarize = 6;
string prompt = 7;
}
message TranscriptResult {
repeated TranscriptSegment segments = 1;
string text = 2;
}
message TranscriptSegment {
int32 id = 1;
int64 start = 2;
int64 end = 3;
string text = 4;
repeated int32 tokens = 5;
string speaker = 6;
}
message GenerateImageRequest {
int32 height = 1;
int32 width = 2;
int32 step = 4;
int32 seed = 5;
string positive_prompt = 6;
string negative_prompt = 7;
string dst = 8;
string src = 9;
// Diffusers
string EnableParameters = 10;
int32 CLIPSkip = 11;
// Reference images for models that support them (e.g., Flux Kontext)
repeated string ref_images = 12;
}
message GenerateVideoRequest {
string prompt = 1;
string negative_prompt = 2; // Negative prompt for video generation
string start_image = 3; // Path or base64 encoded image for the start frame
string end_image = 4; // Path or base64 encoded image for the end frame
int32 width = 5;
int32 height = 6;
int32 num_frames = 7; // Number of frames to generate
int32 fps = 8; // Frames per second
int32 seed = 9;
float cfg_scale = 10; // Classifier-free guidance scale
int32 step = 11; // Number of inference steps
string dst = 12; // Output path for the generated video
}
message TTSRequest {
string text = 1;
string model = 2;
string dst = 3;
string voice = 4;
optional string language = 5;
}
message VADRequest {
repeated float audio = 1;
}
message VADSegment {
float start = 1;
float end = 2;
}
message VADResponse {
repeated VADSegment segments = 1;
}
message SoundGenerationRequest {
string text = 1;
string model = 2;
string dst = 3;
optional float duration = 4;
optional float temperature = 5;
optional bool sample = 6;
optional string src = 7;
optional int32 src_divisor = 8;
optional bool think = 9;
optional string caption = 10;
optional string lyrics = 11;
optional int32 bpm = 12;
optional string keyscale = 13;
optional string language = 14;
optional string timesignature = 15;
optional bool instrumental = 17;
}
message TokenizationResponse {
int32 length = 1;
repeated int32 tokens = 2;
}
message MemoryUsageData {
uint64 total = 1;
map<string, uint64> breakdown = 2;
}
message StatusResponse {
enum State {
UNINITIALIZED = 0;
BUSY = 1;
READY = 2;
ERROR = -1;
}
State state = 1;
MemoryUsageData memory = 2;
}
message Message {
string role = 1;
string content = 2;
// Optional fields for OpenAI-compatible message format
string name = 3; // Tool name (for tool messages)
string tool_call_id = 4; // Tool call ID (for tool messages)
string reasoning_content = 5; // Reasoning content (for thinking models)
string tool_calls = 6; // Tool calls as JSON string (for assistant messages with tool calls)
}
message DetectOptions {
string src = 1;
}
message Detection {
float x = 1;
float y = 2;
float width = 3;
float height = 4;
float confidence = 5;
string class_name = 6;
}
message DetectResponse {
repeated Detection Detections = 1;
}
message ToolFormatMarkers {
string format_type = 1; // "json_native", "tag_with_json", "tag_with_tagged"
// Tool section markers
string section_start = 2; // e.g., "<tool_call>", "[TOOL_CALLS]"
string section_end = 3; // e.g., "</tool_call>"
string per_call_start = 4; // e.g., "<|tool_call_begin|>"
string per_call_end = 5; // e.g., "<|tool_call_end|>"
// Function name markers (TAG_WITH_JSON / TAG_WITH_TAGGED)
string func_name_prefix = 6; // e.g., "<function="
string func_name_suffix = 7; // e.g., ">"
string func_close = 8; // e.g., "</function>"
// Argument markers (TAG_WITH_TAGGED)
string arg_name_prefix = 9; // e.g., "<param="
string arg_name_suffix = 10; // e.g., ">"
string arg_value_prefix = 11;
string arg_value_suffix = 12; // e.g., "</param>"
string arg_separator = 13; // e.g., "\n"
// JSON format fields (JSON_NATIVE)
string name_field = 14; // e.g., "name"
string args_field = 15; // e.g., "arguments"
string id_field = 16; // e.g., "id"
bool fun_name_is_key = 17;
bool tools_array_wrapped = 18;
reserved 19;
// Reasoning markers
string reasoning_start = 20; // e.g., "<think>"
string reasoning_end = 21; // e.g., "</think>"
// Content markers
string content_start = 22;
string content_end = 23;
// Args wrapper markers
string args_start = 24; // e.g., "<args>"
string args_end = 25; // e.g., "</args>"
// JSON parameter ordering
string function_field = 26; // e.g., "function" (wrapper key in JSON)
repeated string parameter_order = 27;
// Generated ID field (alternative field name for generated IDs)
string gen_id_field = 28; // e.g., "call_id"
// Call ID markers (position and delimiters for tool call IDs)
string call_id_position = 29; // "none", "pre_func_name", "between_func_and_args", "post_args"
string call_id_prefix = 30; // e.g., "[CALL_ID]"
string call_id_suffix = 31; // e.g., ""
}
message AudioEncodeRequest {
bytes pcm_data = 1;
int32 sample_rate = 2;
int32 channels = 3;
map<string, string> options = 4;
}
message AudioEncodeResult {
repeated bytes frames = 1;
int32 sample_rate = 2;
int32 samples_per_frame = 3;
}
message AudioDecodeRequest {
repeated bytes frames = 1;
map<string, string> options = 2;
}
message AudioDecodeResult {
bytes pcm_data = 1;
int32 sample_rate = 2;
int32 samples_per_frame = 3;
}
message ModelMetadataResponse {
bool supports_thinking = 1;
string rendered_template = 2; // The rendered chat template with enable_thinking=true (empty if not applicable)
ToolFormatMarkers tool_format = 3; // Auto-detected tool format markers from differential template analysis
}
// Fine-tuning messages
message FineTuneRequest {
// Model identification
string model = 1; // HF model name or local path
string training_type = 2; // "lora", "loha", "lokr", "full" — what parameters to train
string training_method = 3; // "sft", "dpo", "grpo", "rloo", "reward", "kto", "orpo", "network_training"
// Adapter config (universal across LoRA/LoHa/LoKr for LLM + diffusion)
int32 adapter_rank = 10; // LoRA rank (r), default 16
int32 adapter_alpha = 11; // scaling factor, default 16
float adapter_dropout = 12; // default 0.0
repeated string target_modules = 13; // layer names to adapt
// Universal training hyperparameters
float learning_rate = 20; // default 2e-4
int32 num_epochs = 21; // default 3
int32 batch_size = 22; // default 2
int32 gradient_accumulation_steps = 23; // default 4
int32 warmup_steps = 24; // default 5
int32 max_steps = 25; // 0 = use epochs
int32 save_steps = 26; // 0 = only save final
float weight_decay = 27; // default 0.01
bool gradient_checkpointing = 28;
string optimizer = 29; // adamw_8bit, adamw, sgd, adafactor, prodigy
int32 seed = 30; // default 3407
string mixed_precision = 31; // fp16, bf16, fp8, no
// Dataset
string dataset_source = 40; // HF dataset ID, local file/dir path
string dataset_split = 41; // train, test, etc.
// Output
string output_dir = 50;
string job_id = 51; // client-assigned or auto-generated
// Resume training from a checkpoint
string resume_from_checkpoint = 55; // path to checkpoint dir to resume from
// Backend-specific AND method-specific extensibility
map<string, string> extra_options = 60;
}
message FineTuneJobResult {
string job_id = 1;
bool success = 2;
string message = 3;
}
message FineTuneProgressRequest {
string job_id = 1;
}
message FineTuneProgressUpdate {
string job_id = 1;
int32 current_step = 2;
int32 total_steps = 3;
float current_epoch = 4;
float total_epochs = 5;
float loss = 6;
float learning_rate = 7;
float grad_norm = 8;
float eval_loss = 9;
float eta_seconds = 10;
float progress_percent = 11;
string status = 12; // queued, caching, loading_model, loading_dataset, training, saving, completed, failed, stopped
string message = 13;
string checkpoint_path = 14; // set when a checkpoint is saved
string sample_path = 15; // set when a sample is generated (video/image backends)
map<string, float> extra_metrics = 16; // method-specific metrics
}
message FineTuneStopRequest {
string job_id = 1;
bool save_checkpoint = 2;
}
message ListCheckpointsRequest {
string output_dir = 1;
}
message ListCheckpointsResponse {
repeated CheckpointInfo checkpoints = 1;
}
message CheckpointInfo {
string path = 1;
int32 step = 2;
float epoch = 3;
float loss = 4;
string created_at = 5;
}
message ExportModelRequest {
string checkpoint_path = 1;
string output_path = 2;
string export_format = 3; // lora, loha, lokr, merged_16bit, merged_4bit, gguf, diffusers
string quantization_method = 4; // for GGUF: q4_k_m, q5_k_m, q8_0, f16, etc.
string model = 5; // base model name (for merge operations)
map<string, string> extra_options = 6;
}
// Quantization messages
message QuantizationRequest {
string model = 1; // HF model name or local path
string quantization_type = 2; // q4_k_m, q5_k_m, q8_0, f16, etc.
string output_dir = 3; // where to write output files
string job_id = 4; // client-assigned job ID
map<string, string> extra_options = 5; // hf_token, custom flags, etc.
}
message QuantizationJobResult {
string job_id = 1;
bool success = 2;
string message = 3;
}
message QuantizationProgressRequest {
string job_id = 1;
}
message QuantizationProgressUpdate {
string job_id = 1;
float progress_percent = 2;
string status = 3; // queued, downloading, converting, quantizing, completed, failed, stopped
string message = 4;
string output_file = 5; // set when completed — path to the output GGUF file
map<string, float> extra_metrics = 6; // e.g. file_size_mb, compression_ratio
}
message QuantizationStopRequest {
string job_id = 1;
}