Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
d0e6bf3aa7 fix(backend): don't let a client disconnect cancel the model load
Image generation and the tts/transcript/embeddings/vad/rerank/llm helpers
pass the request context to loader.Load so distributed routing decisions
reach the request's X-LocalAI-Node holder. That context also governs
cancellation of the load, so when a client disconnects mid-load the
LoadModel RPC is aborted, stopLoadProcess tears down the backend process,
and every retry restarts from scratch. Heavy diffusers/LLM models on a slow
host (e.g. a shared-memory iGPU) take long enough to load that the request
routinely ends first, so the model never finishes loading and the UI shows
"NetworkError when attempting to fetch resource".

Wrap the load context with context.WithoutCancel: the routing holder value
still propagates, but the request's cancellation no longer aborts the load,
so it runs to completion and caches for the next request. Inference keeps the
cancellable request context, so a disconnect still stops generation.

Adds a regression spec asserting a canceled request context does not cancel
the model load while the routing holder still reaches the router.

Fixes #10636

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-07-02 20:52:51 +00:00
24 changed files with 127 additions and 508 deletions

View File

@@ -1,5 +1,5 @@
IK_LLAMA_VERSION?=87fc8701ff4da81a7d2a91ec0695f95eb3066a47
IK_LLAMA_VERSION?=068b173649f2fd8dc96b35ada5a0b76d8985105d
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
CMAKE_ARGS?=

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=fdb1db877c526ec90f668eca1b858da5dba85560
LLAMA_VERSION?=4fc4ec5541b243957ae5099edb67372f8f3b550e
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View File

@@ -8,7 +8,7 @@
# Local development: point at a working checkout instead of cloning, e.g.
# make PRIVACY_FILTER_SRC=$HOME/c/privacy-filter.cpp grpc-server
PRIVACY_FILTER_VERSION?=735a6c28607ee82afc3a670383f41b55266a3b9a
PRIVACY_FILTER_VERSION?=595f59630c69d361b5196f2aba2c71c873d0c13c
PRIVACY_FILTER_REPO?=https://github.com/localai-org/privacy-filter.cpp
PRIVACY_FILTER_SRC?=

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# CrispASR version (git ref; currently pinned to a commit SHA)
CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
CRISPASR_VERSION?=9a26976a8c8cf5af0afcdd04463cf8ba91e96a54
CRISPASR_VERSION?=fcbc8718e654995e3bd2d0c98bcb8e55e297d23c
SO_TARGET?=libgocrispasr.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=2574f5936571645f784b77623e1f09bad97d948a
STABLEDIFFUSION_GGML_VERSION?=3590aa8d626e671a1b1dc84506ea2932a243a480
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -20,15 +20,7 @@ def split_reasoning(text, think_start, think_end):
Returns ``(reasoning_content, remaining_text)``. When ``think_start`` is
empty or not found, returns ``("", text)`` unchanged.
"""
if not think_start or not text:
return "", text
if think_start not in text:
# Models like Qwen3.5 open assistant turns already INSIDE thinking, so
# the generated text carries only the closing tag. Everything before it
# is reasoning that would otherwise leak into the content.
if think_end and think_end in text:
head, _, tail = text.partition(think_end)
return head.strip(), tail.strip()
if not think_start or not text or think_start not in text:
return "", text
pattern = re.compile(
re.escape(think_start) + r"(.*?)" + re.escape(think_end or ""),

View File

@@ -1,75 +0,0 @@
"""Unit tests for the mlx/mlx-vlm shared helpers (mlx_utils.py).
Run standalone (Python standard library only, no backend venv needed):
python3 -m unittest mlx_utils_test
These mirror the server-less helper tests in backend/python/mlx/test.py
(TestSharedHelpers), but live here so they run on any platform: the mlx
test module imports grpc/backend_pb2 at import time and needs the MLX venv,
whereas mlx_utils only needs the standard library.
"""
import types
import unittest
from mlx_utils import parse_tool_calls, split_reasoning
class TestSplitReasoning(unittest.TestCase):
def test_both_tags(self):
r, c = split_reasoning(
"<think>step 1\nstep 2</think>The answer is 42.", "<think>", "</think>"
)
self.assertEqual(r, "step 1\nstep 2")
self.assertEqual(c, "The answer is 42.")
def test_implicit_opener_only_closing_tag(self):
# Qwen3.5 opens the assistant turn already inside thinking, so the
# output carries only the closing tag; everything before it is reasoning.
r, c = split_reasoning(
"The user is asking about the weather.\n</think>\n\nThe weather in Rome is sunny.",
"<think>",
"</think>",
)
self.assertEqual(r, "The user is asking about the weather.")
self.assertEqual(c, "The weather in Rome is sunny.")
def test_no_tags_at_all(self):
r, c = split_reasoning("just text", "<think>", "</think>")
self.assertEqual(r, "")
self.assertEqual(c, "just text")
def test_empty_think_end_and_no_opener_match(self):
# No think_end to anchor on, and the opener is absent → return unchanged.
r, c = split_reasoning("no opener here", "<think>", "")
self.assertEqual(r, "")
self.assertEqual(c, "no opener here")
def test_empty_text(self):
r, c = split_reasoning("", "<think>", "</think>")
self.assertEqual(r, "")
self.assertEqual(c, "")
class TestParseToolCalls(unittest.TestCase):
def test_with_shim(self):
tm = types.SimpleNamespace(
tool_call_start="<tool_call>",
tool_call_end="</tool_call>",
parse_tool_call=lambda body, tools: {
"name": "get_weather",
"arguments": {"location": body.strip()},
},
)
calls, remaining = parse_tool_calls(
"Sure: <tool_call>Paris</tool_call>", tm, tools=None
)
self.assertEqual(len(calls), 1)
self.assertEqual(calls[0]["name"], "get_weather")
self.assertEqual(calls[0]["arguments"], '{"location": "Paris"}')
self.assertEqual(calls[0]["index"], 0)
self.assertNotIn("<tool_call>", remaining)
if __name__ == "__main__":
unittest.main()

View File

@@ -58,18 +58,7 @@ def messages_to_dicts(proto_messages):
d["reasoning_content"] = msg.reasoning_content
if msg.tool_calls:
try:
tool_calls = json.loads(msg.tool_calls)
# Chat templates (e.g. Qwen) iterate function.arguments as a
# mapping, but the OpenAI wire format carries it as a JSON
# string — decode it back so the template's .items() works.
for tc in tool_calls:
fn = tc.get("function") if isinstance(tc, dict) else None
if isinstance(fn, dict) and isinstance(fn.get("arguments"), str):
try:
fn["arguments"] = json.loads(fn["arguments"])
except json.JSONDecodeError:
pass
d["tool_calls"] = tool_calls
d["tool_calls"] = json.loads(msg.tool_calls)
except json.JSONDecodeError:
pass
result.append(d)

View File

@@ -1,122 +0,0 @@
"""Unit tests for the shared python backend helpers (python_utils.py).
Run standalone (Python standard library only, no backend venv needed):
python3 -m unittest python_utils_test
These mirror the server-less helper tests in backend/python/mlx/test.py
(TestSharedHelpers), but live here so they run on any platform: the mlx
test module imports grpc/backend_pb2 at import time and needs the MLX venv,
whereas python_utils has no third-party dependency. Proto Message objects
are faked with types.SimpleNamespace (real proto fields default to "").
"""
import json
import types
import unittest
from python_utils import messages_to_dicts, parse_options
def _msg(**fields):
"""Fake a proto Message: every unset field is the empty string, as protobuf."""
defaults = {
"role": "",
"content": "",
"name": "",
"tool_call_id": "",
"reasoning_content": "",
"tool_calls": "",
}
defaults.update(fields)
return types.SimpleNamespace(**defaults)
class TestParseOptions(unittest.TestCase):
def test_type_inference(self):
opts = parse_options(
["temperature:0.7", "max_tokens:128", "trust:true", "name:hello", "no_colon_skipped"]
)
self.assertEqual(opts["temperature"], 0.7)
self.assertEqual(opts["max_tokens"], 128)
self.assertIs(opts["trust"], True)
self.assertEqual(opts["name"], "hello")
self.assertNotIn("no_colon_skipped", opts)
class TestMessagesToDicts(unittest.TestCase):
def test_basic_fields(self):
out = messages_to_dicts(
[
_msg(role="user", content="hi"),
_msg(role="tool", content="42", tool_call_id="call_1", name="f"),
]
)
self.assertEqual(out[0], {"role": "user", "content": "hi"})
self.assertEqual(out[1]["tool_call_id"], "call_1")
self.assertEqual(out[1]["name"], "f")
def test_tool_call_arguments_string_decoded_to_mapping(self):
# OpenAI wire format ships function.arguments as a JSON *string*; chat
# templates iterate it as a mapping, so it must come back as a dict.
out = messages_to_dicts(
[
_msg(
role="assistant",
tool_calls=json.dumps(
[
{
"id": "call_1",
"type": "function",
"function": {
"name": "get_weather",
"arguments": '{"location": "Rome"}',
},
}
]
),
)
]
)
args = out[0]["tool_calls"][0]["function"]["arguments"]
self.assertEqual(args, {"location": "Rome"})
self.assertEqual(dict(args.items()), {"location": "Rome"})
def test_tool_call_arguments_already_mapping_is_idempotent(self):
out = messages_to_dicts(
[
_msg(
role="assistant",
tool_calls=json.dumps(
[{"function": {"name": "f", "arguments": {"a": 1}}}]
),
)
]
)
self.assertEqual(out[0]["tool_calls"][0]["function"]["arguments"], {"a": 1})
def test_tool_call_arguments_invalid_json_left_as_string(self):
out = messages_to_dicts(
[
_msg(
role="assistant",
tool_calls=json.dumps(
[{"function": {"name": "f", "arguments": "not-json"}}]
),
)
]
)
self.assertEqual(out[0]["tool_calls"][0]["function"]["arguments"], "not-json")
def test_tool_call_without_function_key(self):
out = messages_to_dicts(
[_msg(role="assistant", tool_calls=json.dumps([{"id": "call_1"}]))]
)
self.assertEqual(out[0]["tool_calls"], [{"id": "call_1"}])
def test_tool_calls_invalid_json_dropped(self):
out = messages_to_dicts([_msg(role="assistant", tool_calls="{not json")])
self.assertNotIn("tool_calls", out[0])
if __name__ == "__main__":
unittest.main()

View File

@@ -35,21 +35,6 @@ if [ "x${BUILD_PROFILE}" == "xcpu" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi
# AMD ROCm: vLLM ships prebuilt ROCm wheels, but on a DEDICATED index
# (https://wheels.vllm.ai/rocm/), NOT PyPI, and ONLY for CPython 3.12. On any
# other Python the installer silently falls back to the CUDA-only PyPI wheel,
# which is unusable on an AMD GPU (import fails, so the backend never finds the
# vllm module). Force Python 3.12 before the venv is created (matches the
# intel/l4t13 cp312 bump); the hipblas branch below pulls vllm from the ROCm
# wheel index. unsafe-best-match lets uv consult that index and PyPI together.
# https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html?device=rocm
if [ "x${BUILD_TYPE}" == "xhipblas" ]; then
PYTHON_VERSION="3.12"
PYTHON_PATCH="12"
PY_STANDALONE_TAG="20251120"
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi
# cublas13 pulls the vLLM wheel from a per-tag cu130 index (PyPI's vllm wheel
# is built against CUDA 12 and won't load on cu130). uv's default per-package
# first-match strategy would still pick the PyPI wheel, so allow it to consult
@@ -119,7 +104,7 @@ if [ "$(uname -s)" = "Darwin" ]; then
# can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
# vllm pin (requirements-cublas13-after.txt, bumped independently against
# vllm/vllm) until vllm-metal supports a newer vLLM.
VLLM_METAL_VERSION="v0.3.0.dev20260701212152"
VLLM_METAL_VERSION="v0.3.0.dev20260701132215"
# The coupled vLLM source version is whatever this vllm-metal release builds
# against -- it declares it in its own installer as `vllm_v=`. Derive it from
@@ -209,22 +194,6 @@ elif [ "x${BUILD_TYPE}" == "xintel" ]; then
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH:-}"
VLLM_TARGET_DEVICE=xpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
popd
# AMD ROCm: install vllm from its dedicated ROCm wheel index instead of the
# CUDA-only PyPI wheel. installRequirements brings the base ROCm
# torch/transformers (requirements-hipblas.txt), then we pull vllm (plus the
# matching ROCm torch, via --upgrade) from wheels.vllm.ai/rocm. This is the
# method upstream prescribes for AMD; the Python-3.12 pin is set above.
# There is intentionally no requirements-hipblas-after.txt: a bare `vllm`
# there would resolve to the CUDA wheel, and installRequirements never loads
# a ${BUILD_TYPE}-after file for hipblas anyway (BUILD_TYPE == BUILD_PROFILE).
# https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html?device=rocm
elif [ "x${BUILD_TYPE}" == "xhipblas" ]; then
installRequirements
# --upgrade reconciles the base ROCm torch to whatever the vllm ROCm wheel
# pins; --extra-index-url adds the ROCm wheel repository on top of PyPI.
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} \
--extra-index-url https://wheels.vllm.ai/rocm/ --upgrade vllm
# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
# requirements-cpu-after.txt and compiles vllm locally against the host's
# actual CPU. Not used by default because it takes ~30-40 minutes, but

View File

@@ -0,0 +1 @@
vllm

View File

@@ -157,6 +157,33 @@ var _ = Describe("X-LocalAI-Node ctx propagation contract", func() {
stampViaRouterCtx()
})
// Regression for #10636: a canceled request context must NOT cancel the
// model LOAD. The heavy image/audio backends bind the load to the request
// context so the routing holder reaches the SmartRouter; but a large
// diffusers/LLM model on a slow (e.g. shared-memory iGPU) host can take
// far longer to load than the client stays connected. If the request's
// cancellation propagates to the load, the LoadModel RPC is aborted, the
// backend process is torn down, and every retry restarts from scratch and
// never converges. The load must instead run to completion and cache while
// still carrying the request's routing holder value.
It("ImageGeneration does not propagate request cancellation to the model load", func() {
canceledCtx, cancel := context.WithCancel(reqCtx)
cancel() // client disconnected while the (slow) load was still running
_, err := backend.ImageGeneration(canceledCtx, 64, 64, 1, 0, "p", "", "", "/tmp/out.png", loader, modelCfg, appCfg, nil)
// The load reached the router (short-circuit sentinel), i.e. it was
// NOT aborted early by the already-canceled request context.
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("router short-circuit (test)"))
routerCtx := routerCtxOf()
Expect(routerCtx).ToNot(BeNil(), "router callback must have been invoked")
Expect(routerCtx.Err()).To(BeNil(),
"a canceled request must not cancel the model load")
// The routing holder value still propagates despite the decoupling.
stampViaRouterCtx()
})
It("does NOT leak the holder when the app context is used instead", func() {
// Sanity: the bug being fixed manifests as the router getting
// appCfg.Context (no holder) instead of reqCtx (holder). A direct

View File

@@ -40,10 +40,14 @@ func (e *modelEmbedder) Embed(ctx context.Context, text string) ([]float32, erro
func ModelEmbedding(ctx context.Context, s string, tokens []int, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
// model.WithContext(ctx) overrides the app-context default set in
// ModelOptions so distributed routing decisions reach the request's
// X-LocalAI-Node holder via distributedhdr.Stamp.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
// model.WithContext carries the request context into the load so distributed
// routing decisions reach the request's X-LocalAI-Node holder via
// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
// the request's cancellation, so a slow first load still completes and
// caches if the client disconnects instead of aborting the LoadModel RPC and
// tearing down the backend process (issue #10636). Inference below keeps the
// cancellable ctx, so a disconnect still stops generation.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
inferenceModel, err := loader.Load(opts...)
if err != nil {

View File

@@ -13,10 +13,14 @@ import (
func ImageGeneration(ctx context.Context, height, width, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, refImages []string) (func() error, error) {
// model.WithContext(ctx) overrides the app-context default set in
// ModelOptions so distributed routing decisions reach the request's
// X-LocalAI-Node holder via distributedhdr.Stamp.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
// model.WithContext carries the request context into the load so distributed
// routing decisions reach the request's X-LocalAI-Node holder via
// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
// the request's cancellation, so a slow first load still completes and
// caches if the client disconnects instead of aborting the LoadModel RPC and
// tearing down the backend process (issue #10636). Inference below keeps the
// cancellable ctx, so a disconnect still stops generation.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
inferenceModel, err := loader.Load(
opts...,
)

View File

@@ -111,7 +111,12 @@ func ModelInference(ctx context.Context, s string, messages schema.Messages, ima
}
ctx = distributedhdr.MaybeWithPrefixChain(ctx, c.ModelID(), chainSource)
opts := ModelOptions(*c, o, model.WithContext(ctx))
// context.WithoutCancel decouples the model load from the request's
// cancellation while preserving its routing values, so a slow load still
// completes and caches if the client disconnects instead of aborting the
// LoadModel RPC mid-load (issue #10636). Inference below keeps the
// cancellable ctx, so a disconnect still stops generation.
opts := ModelOptions(*c, o, model.WithContext(context.WithoutCancel(ctx)))
inferenceModel, err := loader.Load(opts...)
if err != nil {
recordModelLoadFailure(o, c.Name, c.Backend, err, map[string]any{"model_file": modelFile})

View File

@@ -57,10 +57,14 @@ func (r *modelReranker) Rerank(ctx context.Context, query string, documents []st
}
func Rerank(ctx context.Context, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, modelConfig config.ModelConfig) (*proto.RerankResult, error) {
// model.WithContext(ctx) overrides the app-context default set in
// ModelOptions so distributed routing decisions reach the request's
// X-LocalAI-Node holder via distributedhdr.Stamp.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
// model.WithContext carries the request context into the load so distributed
// routing decisions reach the request's X-LocalAI-Node holder via
// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
// the request's cancellation, so a slow first load still completes and
// caches if the client disconnects instead of aborting the LoadModel RPC and
// tearing down the backend process (issue #10636). Inference below keeps the
// cancellable ctx, so a disconnect still stops generation.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
rerankModel, err := loader.Load(opts...)
if err != nil {
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)

View File

@@ -45,10 +45,14 @@ func loadTranscriptionModel(ctx context.Context, ml *model.ModelLoader, modelCon
if modelConfig.Backend == "" {
modelConfig.Backend = model.WhisperBackend
}
// model.WithContext(ctx) overrides the app-context default set in
// ModelOptions so distributed routing decisions reach the request's
// X-LocalAI-Node holder via distributedhdr.Stamp.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
// model.WithContext carries the request context into the load so distributed
// routing decisions reach the request's X-LocalAI-Node holder via
// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
// the request's cancellation, so a slow first load still completes and
// caches if the client disconnects instead of aborting the LoadModel RPC and
// tearing down the backend process (issue #10636). Inference below keeps the
// cancellable ctx, so a disconnect still stops generation.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
transcriptionModel, err := ml.Load(opts...)
if err != nil {
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)

View File

@@ -50,10 +50,14 @@ func ModelTTS(
appConfig *config.ApplicationConfig,
modelConfig config.ModelConfig,
) (string, *proto.Result, error) {
// model.WithContext(ctx) overrides the app-context default set in
// ModelOptions so distributed routing decisions reach the request's
// X-LocalAI-Node holder via distributedhdr.Stamp.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
// model.WithContext carries the request context into the load so distributed
// routing decisions reach the request's X-LocalAI-Node holder via
// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
// the request's cancellation, so a slow first load still completes and
// caches if the client disconnects instead of aborting the LoadModel RPC and
// tearing down the backend process (issue #10636). Inference below keeps the
// cancellable ctx, so a disconnect still stops generation.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
ttsModel, err := loader.Load(opts...)
if err != nil {
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
@@ -153,7 +157,9 @@ func ModelTTSStream(
modelConfig config.ModelConfig,
audioCallback func([]byte) error,
) error {
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
// See ModelTTS above: WithoutCancel decouples the load from request
// cancellation while preserving routing values (issue #10636).
opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
ttsModel, err := loader.Load(opts...)
if err != nil {
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)

View File

@@ -14,10 +14,14 @@ func VAD(request *schema.VADRequest,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
modelConfig config.ModelConfig) (*schema.VADResponse, error) {
// model.WithContext(ctx) overrides the app-context default set in
// ModelOptions so distributed routing decisions reach the request's
// X-LocalAI-Node holder via distributedhdr.Stamp.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx))
// model.WithContext carries the request context into the load so distributed
// routing decisions reach the request's X-LocalAI-Node holder via
// distributedhdr.Stamp. context.WithoutCancel keeps those values but drops
// the request's cancellation, so a slow first load still completes and
// caches if the client disconnects instead of aborting the LoadModel RPC and
// tearing down the backend process (issue #10636). Inference below keeps the
// cancellable ctx, so a disconnect still stops generation.
opts := ModelOptions(modelConfig, appConfig, model.WithContext(context.WithoutCancel(ctx)))
vadModel, err := ml.Load(opts...)
if err != nil {
recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)

View File

@@ -1,133 +0,0 @@
import { test, expect } from './coverage-fixtures.js'
// Seeds a chat with the given message history into localStorage so we don't need a live model.
async function seedChat(page, history) {
await page.addInitScript((h) => {
const chat = {
id: 'seed1', name: 'Seeded Chat', model: 'test-model',
history: h, systemPrompt: '', mcpMode: false, mcpServers: [],
clientMCPServers: [], temperature: null, topP: null, topK: null,
tokenUsage: { prompt: 0, completion: 0, total: 0 },
contextSize: null, createdAt: Date.now(), updatedAt: Date.now(),
}
localStorage.setItem('localai_chats_data', JSON.stringify({
chats: [chat], activeChatId: 'seed1', lastSaved: Date.now(),
}))
}, history)
}
async function mockModels(page) {
await page.route('**/api/models/capabilities', (route) => route.fulfill({
contentType: 'application/json',
body: JSON.stringify({ data: [{ id: 'test-model', capabilities: ['FLAG_CHAT'] }] }),
}))
await page.route('**/api/operations', (route) => route.fulfill({
contentType: 'application/json', body: JSON.stringify({ operations: [] }),
}))
}
const TWO_TURNS = [
{ role: 'user', content: 'first question' },
{ role: 'assistant', content: 'first answer' },
{ role: 'user', content: 'second question' },
{ role: 'assistant', content: 'second answer' },
]
test('duplicate creates an independent copy and switches to it', async ({ page }) => {
await mockModels(page)
await seedChat(page, TWO_TURNS)
await page.goto('/app/chat')
// Open the chats menu (Ctrl/Cmd+K) and duplicate the seeded chat.
// Wait for the menu trigger to mount so its global keydown listener is armed
// before we dispatch the shortcut.
await page.getByTitle('Conversations (Ctrl/Cmd+K)').waitFor()
await page.keyboard.press('Control+k')
await page.getByTitle('Duplicate chat').first().click()
// A new active chat named "Seeded Chat (fork)" with the same 4 messages.
await expect(page.locator('.chat-header-title')).toHaveText('Seeded Chat (fork)')
await expect(page.locator('.chat-message-user')).toHaveCount(2)
await expect(page.locator('.chat-message-assistant')).toHaveCount(2)
})
async function mockCompletion(page, replyText) {
await page.route('**/v1/chat/completions', (route) => {
const sse =
`data: ${JSON.stringify({ choices: [{ delta: { content: replyText } }] })}\n\n` +
`data: ${JSON.stringify({ choices: [{ delta: {}, finish_reason: 'stop' }], usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 } })}\n\n` +
`data: [DONE]\n\n`
route.fulfill({ status: 200, contentType: 'text/event-stream', body: sse })
})
}
test('retry regenerates the first answer and drops the later turn', async ({ page }) => {
await mockModels(page)
// Capture the outbound request body so we can assert the model receives the
// truncated history (not the stale downstream turns).
let sentMessages = null
await page.route('**/v1/chat/completions', (route) => {
sentMessages = route.request().postDataJSON()?.messages || []
const sse =
`data: ${JSON.stringify({ choices: [{ delta: { content: 'REGENERATED first answer' } }] })}\n\n` +
`data: ${JSON.stringify({ choices: [{ delta: {}, finish_reason: 'stop' }], usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 } })}\n\n` +
`data: [DONE]\n\n`
route.fulfill({ status: 200, contentType: 'text/event-stream', body: sse })
})
await seedChat(page, TWO_TURNS)
await page.goto('/app/chat')
// Hover the FIRST assistant message and click its retry button.
const firstAssistant = page.locator('.chat-message-assistant').first()
await firstAssistant.hover()
await firstAssistant.getByTitle('Regenerate').click()
// History is truncated to the first user turn, then the new answer streams in;
// the second Q/A turn is gone.
await expect(page.locator('.chat-message-assistant')).toContainText(['REGENERATED first answer'])
await expect(page.locator('.chat-message-user')).toHaveCount(1)
await expect(page.locator('.chat-message-assistant')).toHaveCount(1)
// The OUTBOUND payload must also be truncated: the resent user turn is present,
// but the downstream turn and the stale first answer must be gone.
const contents = (sentMessages || []).map(m =>
typeof m.content === 'string' ? m.content : JSON.stringify(m.content)
)
expect(contents.join('\n')).toContain('first question')
expect(contents.join('\n')).not.toContain('second question')
expect(contents.join('\n')).not.toContain('first answer')
})
test('copy chat puts the whole conversation on the clipboard', async ({ page, context }) => {
await context.grantPermissions(['clipboard-read', 'clipboard-write'])
await mockModels(page)
await seedChat(page, TWO_TURNS)
await page.goto('/app/chat')
// Wait for the menu trigger to mount so its global keydown listener is armed
// before we dispatch the shortcut (same mount-race guard as the duplicate test).
await page.getByTitle('Conversations (Ctrl/Cmd+K)').waitFor()
await page.keyboard.press('Control+k')
await page.getByTitle('Copy chat').first().click()
const clip = await page.evaluate(() => navigator.clipboard.readText())
expect(clip).toContain('# Seeded Chat')
expect(clip).toContain('first answer')
expect(clip).toContain('second answer')
})
test('branch from the first answer forks history up to that point', async ({ page }) => {
await mockModels(page)
await seedChat(page, TWO_TURNS)
await page.goto('/app/chat')
const firstAssistant = page.locator('.chat-message-assistant').first()
await firstAssistant.hover()
await firstAssistant.getByTitle('Branch from here').click()
// New active chat "Seeded Chat (fork)" contains only the first Q/A turn.
await expect(page.locator('.chat-header-title')).toHaveText('Seeded Chat (fork)')
await expect(page.locator('.chat-message-user')).toHaveCount(1)
await expect(page.locator('.chat-message-assistant')).toHaveCount(1)
await expect(page.locator('.chat-message-assistant')).toContainText(['first answer'])
})

View File

@@ -72,7 +72,6 @@
"actions": {
"copy": "Copy",
"regenerate": "Regenerate",
"branch": "Branch from here",
"jumpToLatest": "Jump to latest"
},
"streaming": {
@@ -101,9 +100,7 @@
"toasts": {
"selectModel": "Please select a model",
"copied": "Copied to clipboard",
"copyFailed": "Could not copy to clipboard",
"chatCopied": "Chat copied to clipboard",
"forked": "Created a new chat"
"copyFailed": "Could not copy to clipboard"
},
"menu": {
"trigger": "Chats",
@@ -113,8 +110,6 @@
"noMatch": "No conversations match your search",
"noConversations": "No conversations yet",
"rename": "Rename",
"duplicate": "Duplicate chat",
"copyChat": "Copy chat",
"exportMarkdown": "Export as Markdown",
"deleteChat": "Delete chat",
"newChat": "New chat",

View File

@@ -24,8 +24,6 @@ const ChatsMenu = forwardRef(function ChatsMenu({
onDeleteAll,
onRename,
onExport,
onCopyChat,
onDuplicate,
}, ref) {
const { t } = useTranslation('chat')
const [open, setOpen] = useState(false)
@@ -232,24 +230,6 @@ const ChatsMenu = forwardRef(function ChatsMenu({
>
<i className="fas fa-pen" />
</button>
{onDuplicate && (
<button
type="button"
onClick={(e) => { e.stopPropagation(); onDuplicate(chat); setOpen(false) }}
title={t('menu.duplicate')}
>
<i className="fas fa-clone" />
</button>
)}
{(chat.history?.length || 0) > 0 && onCopyChat && (
<button
type="button"
onClick={(e) => { e.stopPropagation(); onCopyChat(chat) }}
title={t('menu.copyChat')}
>
<i className="fas fa-clipboard" />
</button>
)}
{(chat.history?.length || 0) > 0 && onExport && (
<button
type="button"

View File

@@ -141,24 +141,6 @@ export function useChat(initialModel = '') {
return chat
}, [])
const forkChat = useCallback((chatId, uptoIndex) => {
const src = chats.find(c => c.id === chatId)
if (!src) return null
const end = typeof uptoIndex === 'number' ? uptoIndex : src.history.length
const forked = {
...src,
id: generateId(),
name: `${src.name} (fork)`,
history: structuredClone(src.history.slice(0, end)),
tokenUsage: { prompt: 0, completion: 0, total: 0 },
createdAt: Date.now(),
updatedAt: Date.now(),
}
setChats(prev => [forked, ...prev])
setActiveChatId(forked.id)
return forked
}, [chats])
const switchChat = useCallback((chatId) => {
setActiveChatId(chatId)
setStreamingContent('')
@@ -278,12 +260,8 @@ export function useChat(initialModel = '') {
if (chat?.systemPrompt) {
messages.push({ role: 'system', content: chat.systemPrompt })
}
// Filter out thinking/reasoning/tool_call/tool_result messages.
// options.baseHistory lets callers (e.g. mid-conversation retry) pass the
// intended truncated history synchronously; the closure `chat` still holds
// the stale pre-truncation state because setChats only schedules an update.
const baseHistory = options.baseHistory || chat?.history || []
const historyForApi = baseHistory.filter(m =>
// Filter out thinking/reasoning/tool_call/tool_result messages
const historyForApi = (chat?.history || []).filter(m =>
m.role !== 'thinking' && m.role !== 'reasoning' && m.role !== 'tool_call' && m.role !== 'tool_result'
)
messages.push(...historyForApi, { role: 'user', content: messageContent })
@@ -815,7 +793,6 @@ export function useChat(initialModel = '') {
tokensPerSecond,
maxTokensPerSecond,
addChat,
forkChat,
switchChat,
deleteChat,
deleteAllChats,

View File

@@ -33,7 +33,7 @@ function getLastMessagePreview(chat) {
return ''
}
function serializeChatAsMarkdown(chat) {
function exportChatAsMarkdown(chat) {
let md = `# ${chat.name}\n\n`
md += `Model: ${chat.model || 'Unknown'}\n`
md += `Date: ${new Date(chat.createdAt).toLocaleString()}\n\n---\n\n`
@@ -47,11 +47,7 @@ function serializeChatAsMarkdown(chat) {
md += `<details><summary>Thinking</summary>\n\n${msg.content}\n\n</details>\n\n`
}
}
return md
}
function downloadChatAsMarkdown(chat) {
const blob = new Blob([serializeChatAsMarkdown(chat)], { type: 'text/markdown' })
const blob = new Blob([md], { type: 'text/markdown' })
const url = URL.createObjectURL(blob)
const a = document.createElement('a')
a.href = url
@@ -298,7 +294,7 @@ export default function Chat() {
const {
chats, activeChat, activeChatId, isStreaming, streamingChatId, streamingContent,
streamingReasoning, streamingToolCalls, tokensPerSecond, maxTokensPerSecond,
addChat, forkChat, switchChat, deleteChat, deleteAllChats, renameChat, updateChatSettings,
addChat, switchChat, deleteChat, deleteAllChats, renameChat, updateChatSettings,
sendMessage, stopGeneration, clearHistory, getContextUsagePercent, addMessage,
} = useChat(urlModel || '')
@@ -799,27 +795,34 @@ export default function Chat() {
await sendMessage(msg, files, mcpOptions)
}, [input, files, activeChat, sendMessage, addToast, getToolsForLLM, isClientTool, executeTool, hasAppUI, getAppResource, getToolDefinition])
const handleRegenerate = useCallback(async (targetIndex) => {
const handleRegenerate = useCallback(async () => {
if (!activeChat || isStreaming) return
const history = activeChat.history
const end = typeof targetIndex === 'number' ? targetIndex : history.length
// Nearest user message at or before the target answer.
let userIdx = -1
for (let i = Math.min(end, history.length) - 1; i >= 0; i--) {
if (history[i].role === 'user') { userIdx = i; break }
let lastUserMsg = null
let lastUserFiles = null
for (let i = history.length - 1; i >= 0; i--) {
if (history[i].role === 'user') {
lastUserMsg = typeof history[i].content === 'string' ? history[i].content : history[i].content?.[0]?.text || ''
lastUserFiles = history[i].files || []
break
}
}
if (userIdx === -1) return
const userMsg = typeof history[userIdx].content === 'string'
? history[userIdx].content
: history[userIdx].content?.[0]?.text || ''
const userFiles = history[userIdx].files || []
// Drop the user turn and everything after it; sendMessage re-appends it.
// Thread the truncated history through explicitly: updateChatSettings only
// schedules a state update, so sendMessage's closure would otherwise read
// the stale pre-truncation history for the outbound API payload.
const baseHistory = history.slice(0, userIdx)
updateChatSettings(activeChat.id, { history: baseHistory })
await sendMessage(userMsg, userFiles, { baseHistory })
if (!lastUserMsg) return
// Remove everything after and including the last user message
const newHistory = []
let foundLastUser = false
for (let i = history.length - 1; i >= 0; i--) {
if (!foundLastUser && history[i].role === 'user') {
foundLastUser = true
continue
}
if (foundLastUser) {
newHistory.unshift(history[i])
}
}
updateChatSettings(activeChat.id, { history: newHistory })
await sendMessage(lastUserMsg, lastUserFiles)
}, [activeChat, isStreaming, sendMessage, updateChatSettings])
const handleKeyDown = (e) => {
@@ -849,11 +852,6 @@ export default function Chat() {
}
}
// Copy the chat's Markdown serialization to the clipboard and report the
// outcome with a toast.
//   chat - chat object passed straight to serializeChatAsMarkdown.
// The awaited result of copyToClipboard is treated as a success flag: truthy
// shows the 'toasts.chatCopied' message as 'success', anything else shows
// 'toasts.copyFailed' as 'error'.
// NOTE(review): the third addToast argument (2000 / 3000) is presumably a
// display duration in milliseconds; confirm against addToast.
const copyChatAsMarkdown = async (chat) => {
const ok = await copyToClipboard(serializeChatAsMarkdown(chat))
addToast(ok ? t('toasts.chatCopied') : t('toasts.copyFailed'), ok ? 'success' : 'error', ok ? 2000 : 3000)
}
const contextPercent = getContextUsagePercent()
// Recent chats for the empty state — exclude the current chat and any
@@ -894,9 +892,7 @@ export default function Chat() {
onDelete={deleteChat}
onDeleteAll={promptDeleteAll}
onRename={renameChat}
onExport={(chat) => downloadChatAsMarkdown(chat)}
onCopyChat={(chat) => copyChatAsMarkdown(chat)}
onDuplicate={(chat) => { if (forkChat(chat.id)) addToast(t('toasts.forked'), 'success', 2000) }}
onExport={(chat) => exportChatAsMarkdown(chat)}
/>
{activeChat.localaiAssistant && (
<span
@@ -1188,19 +1184,11 @@ export default function Chat() {
<button onClick={() => copyMessage(msg.content)} title={t('actions.copy')}>
<i className="fas fa-copy" />
</button>
{msg.role === 'assistant' && !isStreaming && (
<button onClick={() => handleRegenerate(i)} title={t('actions.regenerate')}>
{msg.role === 'assistant' && i === activeChat.history.length - 1 && !isStreaming && (
<button onClick={handleRegenerate} title={t('actions.regenerate')}>
<i className="fas fa-rotate" />
</button>
)}
{msg.role === 'assistant' && !isStreaming && (
<button
onClick={() => { forkChat(activeChat.id, i + 1); addToast(t('toasts.forked'), 'success', 2000) }}
title={t('actions.branch')}
>
<i className="fas fa-code-branch" />
</button>
)}
</div>
</div>
</div>