diff --git a/backend/python/common/mlx_utils.py b/backend/python/common/mlx_utils.py
index 6b34eb962..de624fd5b 100644
--- a/backend/python/common/mlx_utils.py
+++ b/backend/python/common/mlx_utils.py
@@ -20,7 +20,15 @@ def split_reasoning(text, think_start, think_end):
Returns ``(reasoning_content, remaining_text)``. When ``think_start`` is
empty or not found, returns ``("", text)`` unchanged.
"""
- if not think_start or not text or think_start not in text:
+ if not think_start or not text:
+ return "", text
+ if think_start not in text:
+ # Models like Qwen3.5 open assistant turns already INSIDE thinking, so
+ # the generated text carries only the closing tag. Everything before it
+ # is reasoning that would otherwise leak into the content.
+ if think_end and think_end in text:
+ head, _, tail = text.partition(think_end)
+ return head.strip(), tail.strip()
return "", text
pattern = re.compile(
re.escape(think_start) + r"(.*?)" + re.escape(think_end or ""),
diff --git a/backend/python/common/mlx_utils_test.py b/backend/python/common/mlx_utils_test.py
new file mode 100644
index 000000000..809354ede
--- /dev/null
+++ b/backend/python/common/mlx_utils_test.py
@@ -0,0 +1,75 @@
+"""Unit tests for the mlx/mlx-vlm shared helpers (mlx_utils.py).
+
+Run standalone (Python standard library only, no backend venv needed):
+ python3 -m unittest mlx_utils_test
+
+These mirror the server-less helper tests in backend/python/mlx/test.py
+(TestSharedHelpers), but live here so they run on any platform: the mlx
+test module imports grpc/backend_pb2 at import time and needs the MLX venv,
+whereas mlx_utils only needs the standard library.
+"""
+
+import types
+import unittest
+
+from mlx_utils import parse_tool_calls, split_reasoning
+
+
+class TestSplitReasoning(unittest.TestCase):
+    """Pin how split_reasoning separates reasoning from the visible answer."""
+
+    def test_both_tags(self):
+        # Opener and closer both present: the span between them is the reasoning.
+        text = "<think>step 1\nstep 2</think>The answer is 42."
+        r, c = split_reasoning(text, "<think>", "</think>")
+        self.assertEqual(r, "step 1\nstep 2")
+        self.assertEqual(c, "The answer is 42.")
+
+    def test_implicit_opener_only_closing_tag(self):
+        # Qwen3.5 opens the assistant turn already inside thinking, so the
+        # output carries only the closing tag; everything before it is reasoning.
+        text = "The user is asking about the weather.\n</think>\n\nThe weather in Rome is sunny."
+        r, c = split_reasoning(text, "<think>", "</think>")
+        self.assertEqual(r, "The user is asking about the weather.")
+        self.assertEqual(c, "The weather in Rome is sunny.")
+
+    def test_no_tags_at_all(self):
+        r, c = split_reasoning("just text", "<think>", "</think>")
+        self.assertEqual(r, "")
+        self.assertEqual(c, "just text")
+
+    def test_empty_think_end_and_no_opener_match(self):
+        # No think_end to anchor on, and the opener is absent → return unchanged.
+        r, c = split_reasoning("no opener here", "<think>", "")
+        self.assertEqual(r, "")
+        self.assertEqual(c, "no opener here")
+
+    def test_empty_text(self):
+        # Empty input is returned as-is, with no reasoning part.
+        r, c = split_reasoning("", "<think>", "</think>")
+        self.assertEqual(r, "")
+        self.assertEqual(c, "")
+
+
+class TestParseToolCalls(unittest.TestCase):
+    """Run parse_tool_calls with a SimpleNamespace shim supplying markers and parser."""
+
+    def test_with_shim(self):
+        tm = types.SimpleNamespace(
+            tool_call_start="<tool_call>",
+            tool_call_end="</tool_call>",
+            parse_tool_call=lambda body, tools: {
+                "name": "get_weather",
+                "arguments": {"location": body.strip()},
+            },
+        )
+        calls, remaining = parse_tool_calls("Sure: <tool_call>Paris</tool_call>", tm, tools=None)
+        self.assertEqual(len(calls), 1)
+        self.assertEqual(calls[0]["name"], "get_weather")
+        self.assertEqual(calls[0]["arguments"], '{"location": "Paris"}')
+        self.assertEqual(calls[0]["index"], 0)
+        self.assertNotIn("<tool_call>", remaining)  # markup must not leak into the text
+
+
+if __name__ == "__main__":  # lets `python3 mlx_utils_test.py` run the suite directly
+ unittest.main()
diff --git a/backend/python/common/python_utils.py b/backend/python/common/python_utils.py
index aa61ab578..c89813e2c 100644
--- a/backend/python/common/python_utils.py
+++ b/backend/python/common/python_utils.py
@@ -58,7 +58,18 @@ def messages_to_dicts(proto_messages):
d["reasoning_content"] = msg.reasoning_content
if msg.tool_calls:
try:
- d["tool_calls"] = json.loads(msg.tool_calls)
+ tool_calls = json.loads(msg.tool_calls)
+ # Chat templates (e.g. Qwen) iterate function.arguments as a
+ # mapping, but the OpenAI wire format carries it as a JSON
+ # string — decode it back so the template's .items() works.
+ for tc in tool_calls:
+ fn = tc.get("function") if isinstance(tc, dict) else None
+ if isinstance(fn, dict) and isinstance(fn.get("arguments"), str):
+ try:
+ fn["arguments"] = json.loads(fn["arguments"])
+ except json.JSONDecodeError:
+ pass
+ d["tool_calls"] = tool_calls
except json.JSONDecodeError:
pass
result.append(d)
diff --git a/backend/python/common/python_utils_test.py b/backend/python/common/python_utils_test.py
new file mode 100644
index 000000000..c395ce92d
--- /dev/null
+++ b/backend/python/common/python_utils_test.py
@@ -0,0 +1,122 @@
+"""Unit tests for the shared python backend helpers (python_utils.py).
+
+Run standalone (Python standard library only, no backend venv needed):
+ python3 -m unittest python_utils_test
+
+These mirror the server-less helper tests in backend/python/mlx/test.py
+(TestSharedHelpers), but live here so they run on any platform: the mlx
+test module imports grpc/backend_pb2 at import time and needs the MLX venv,
+whereas python_utils has no third-party dependency. Proto Message objects
+are faked with types.SimpleNamespace (real proto fields default to "").
+"""
+
+import json
+import types
+import unittest
+
+from python_utils import messages_to_dicts, parse_options
+
+
+def _msg(**fields):
+    """Build a stand-in proto Message as a ``types.SimpleNamespace``.
+
+    Fields not passed as keyword arguments are the empty string, as protobuf.
+    """
+    proto_fields = (
+        "role", "content", "name",
+        "tool_call_id", "reasoning_content", "tool_calls",
+    )
+    attrs = {field: "" for field in proto_fields}
+    attrs.update(fields)
+    return types.SimpleNamespace(**attrs)
+
+
+class TestParseOptions(unittest.TestCase):
+    def test_type_inference(self):
+        # Values are typed as float/int/bool/str; a colon-less entry is skipped.
+        raw = ["temperature:0.7", "max_tokens:128", "trust:true", "name:hello", "no_colon_skipped"]
+        parsed = parse_options(raw)
+        self.assertEqual(parsed["temperature"], 0.7)
+        self.assertEqual(parsed["max_tokens"], 128)
+        self.assertIs(parsed["trust"], True)
+        self.assertEqual(parsed["name"], "hello")
+        self.assertNotIn("no_colon_skipped", parsed)
+
+
+class TestMessagesToDicts(unittest.TestCase):
+    """Cover messages_to_dicts: plain fields and ``tool_calls`` decoding.
+
+    Input messages are faked with _msg rather than built from protobuf,
+    so each case states only the fields it cares about.
+    """
+
+    @staticmethod
+    def _assistant_dict(tool_calls):
+        """Run messages_to_dicts on one fake assistant message.
+
+        ``tool_calls`` is stored on the message as given; the dict produced
+        for that single message is returned.
+        """
+        fake = _msg(role="assistant", tool_calls=tool_calls)
+        return messages_to_dicts([fake])[0]
+
+    def test_basic_fields(self):
+        # Set fields are copied across; the user dict holds nothing besides
+        # role and content.
+        user = _msg(role="user", content="hi")
+        tool = _msg(role="tool", content="42", tool_call_id="call_1", name="f")
+        converted = messages_to_dicts([user, tool])
+        self.assertEqual(converted[0], {"role": "user", "content": "hi"})
+        self.assertEqual(converted[1]["tool_call_id"], "call_1")
+        self.assertEqual(converted[1]["name"], "f")
+
+    def test_tool_call_arguments_string_decoded_to_mapping(self):
+        # The OpenAI wire format ships function.arguments as a JSON *string*,
+        # while chat templates iterate it as a mapping, so a dict must come
+        # back from the conversion.
+        wire_call = {
+            "id": "call_1",
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "arguments": '{"location": "Rome"}',
+            },
+        }
+        message = self._assistant_dict(json.dumps([wire_call]))
+        decoded = message["tool_calls"][0]["function"]["arguments"]
+        self.assertEqual(decoded, {"location": "Rome"})
+        # .items() is the call a template would make on the mapping.
+        self.assertEqual(dict(decoded.items()), {"location": "Rome"})
+
+    def test_tool_call_arguments_already_mapping_is_idempotent(self):
+        # An arguments value that is already a mapping is left as it is.
+        payload = json.dumps(
+            [{"function": {"name": "f", "arguments": {"a": 1}}}]
+        )
+        message = self._assistant_dict(payload)
+        function = message["tool_calls"][0]["function"]
+        self.assertEqual(function["arguments"], {"a": 1})
+
+    def test_tool_call_arguments_invalid_json_left_as_string(self):
+        # An arguments string that does not parse as JSON stays a string.
+        payload = json.dumps(
+            [{"function": {"name": "f", "arguments": "not-json"}}]
+        )
+        message = self._assistant_dict(payload)
+        function = message["tool_calls"][0]["function"]
+        self.assertEqual(function["arguments"], "not-json")
+
+    def test_tool_call_without_function_key(self):
+        # Entries lacking a "function" key pass through unchanged.
+        payload = json.dumps([{"id": "call_1"}])
+        message = self._assistant_dict(payload)
+        self.assertEqual(message["tool_calls"], [{"id": "call_1"}])
+
+    def test_tool_calls_invalid_json_dropped(self):
+        # A tool_calls payload that is not JSON yields no key at all.
+        message = self._assistant_dict("{not json")
+        self.assertNotIn("tool_calls", message)
+
+
+if __name__ == "__main__":  # lets `python3 python_utils_test.py` run the suite directly
+ unittest.main()