Mirror of https://github.com/ollama/ollama.git (synced 2026-01-06 06:31:14 -05:00)

Compare commits: 1 commit (implement-... / parth/prom), SHA 6d9a37ad62

cmd/prompt-rendering/README.md (new file, 156 lines added)
@@ -0,0 +1,156 @@
# HuggingFace Prompt Renderer MCP Server

Model Context Protocol (MCP) server for rendering conversation messages into
model-specific prompt strings using HuggingFace tokenizer chat templates.

## Requirements

- [uv](https://docs.astral.sh/uv/) - Fast Python package installer

## Usage

### MCP Server Mode

Run the MCP server over stdio for use with MCP clients:

```bash
uv run cmd/prompt-rendering/server.py --mcp
```

Add to your MCP client configuration (e.g., for Claude Desktop):

```json
{
  "mcpServers": {
    "huggingface-prompt-renderer": {
      "command": "uv",
      "args": [
        "run",
        "--directory",
        "<path-to-ollama-repo>",
        "cmd/prompt-rendering/server.py",
        "--mcp"
      ]
    }
  }
}
```

### FastAPI Server Mode

Start a FastAPI server for manual HTTP testing:

```bash
# Start on default port 8000
uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000

# Start on custom port
uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 9000
```

#### Endpoints

| Method | Path | Description |
|--------|------|-------------|
| POST | `/generate-prompt` | Generate prompt from messages |
| GET | `/health` | Health check |
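
For a quick smoke test of a running server, the health endpoint can also be polled
from Python; a minimal sketch using only the standard library, assuming the server
is running locally on the default port:

```python
# Check that the FastAPI server is up (assumes localhost:8000).
import json
import urllib.request

with urllib.request.urlopen("http://localhost:8000/health") as resp:
    print(json.loads(resp.read()))  # expected: {"status": "healthy"}
```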

### Test with curl

```bash
# Basic user message
curl -X POST http://localhost:8000/generate-prompt \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Hello!"}]
  }'

# With tools
curl -X POST http://localhost:8000/generate-prompt \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "What is the weather?"}
    ],
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Get the current weather",
        "parameters": {
          "type": "object",
          "required": ["location"],
          "properties": {
            "location": {"type": "string", "description": "The city"}
          }
        }
      }
    }]
  }'

# With tool calls
curl -X POST http://localhost:8000/generate-prompt \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is the weather in SF?"},
      {
        "role": "assistant",
        "tool_calls": [{
          "id": "call_1",
          "type": "function",
          "function": {
            "name": "get_weather",
            "arguments": {"location": "San Francisco"}
          }
        }]
      },
      {"role": "tool", "content": "{\"temperature\": 68}", "tool_call_id": "call_1"}
    ],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "parameters": {
          "type": "object",
          "properties": {"location": {"type": "string"}}
        }
      }
    }]
  }'
```
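
The same request can be sent from Python. A minimal sketch using only the
standard library, assuming the FastAPI server from this directory is already
running on `localhost:8000`:

```python
# Render a basic conversation via the /generate-prompt endpoint.
import json
import urllib.request

payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
}
req = urllib.request.Request(
    "http://localhost:8000/generate-prompt",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["prompt"])  # the rendered prompt string
```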

## Supported Message Formats

The server supports multiple message formats:

| Format | Description |
|--------|-------------|
| OpenAI | Standard `role`, `content`, `tool_calls`, `tool_call_id` |
| OLMo | Adds `functions` and `function_calls` fields |
| DeepSeek | Tool call arguments must be JSON strings |
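
To make the differences concrete, here are sketches of the same assistant
tool-call turn in each format. These are illustrative payloads based on the
fields the server accepts, not canonical examples from any model card:

```python
# OpenAI-style: structured tool_calls, arguments as a dict.
openai_msg = {
    "role": "assistant",
    "tool_calls": [{
        "id": "call_1",
        "type": "function",
        "function": {"name": "get_weather", "arguments": {"location": "SF"}},
    }],
}

# OLMo-style: calls travel as a JSON string in `function_calls`
# (tool definitions likewise go in a `functions` string on the system message).
olmo_msg = {
    "role": "assistant",
    "function_calls": '[{"name": "get_weather", "arguments": {"location": "SF"}}]',
}

# DeepSeek-style: same structure as OpenAI, but `arguments` must already be
# a JSON string rather than a dict.
deepseek_msg = {
    "role": "assistant",
    "tool_calls": [{
        "type": "function",
        "function": {"name": "get_weather", "arguments": '{"location": "SF"}'},
    }],
}
```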

## Tool Support

| Setting | Description |
|---------|-------------|
| `inject_tools_as_functions=true` | Injects tools into the system message as a `functions` key (OLMo-style) |
| `inject_tools_as_functions=false` | Passes tools separately to `apply_chat_template` (standard transformers) |
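
A sketch of how the flag changes a `/generate-prompt` request; the messages and
tool definition are placeholders, and only the flag differs between the two
payloads:

```python
# Shared placeholder inputs.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "parameters": {"type": "object",
                       "properties": {"location": {"type": "string"}}},
    },
}]
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the weather?"},
]

# OLMo-style: the server serializes `tools` to JSON and attaches it to the
# system message under a "functions" key before rendering.
olmo_style_request = {
    "messages": messages,
    "tools": tools,
    "inject_tools_as_functions": True,
}

# Standard transformers: the server instead forwards `tools` directly to
# tokenizer.apply_chat_template(..., tools=tools).
standard_request = {
    "messages": messages,
    "tools": tools,
    "inject_tools_as_functions": False,
}
```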

## Models

The server uses HuggingFace's `transformers` library and supports any model
with a chat template. Default: `Qwen/Qwen3-Coder-480B-A35B-Instruct`

## Dependencies

The script uses PEP 723 inline dependency metadata. When run with `uv`,
dependencies are automatically installed into an isolated environment:

- `fastapi` - Web framework
- `uvicorn` - ASGI server
- `transformers` - HuggingFace tokenizer
- `jinja2` - Template engine
- `mcp` - Model Context Protocol

cmd/prompt-rendering/server.py (new file, 311 lines added)
@@ -0,0 +1,311 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "fastapi",
#     "uvicorn",
#     "transformers",
#     "jinja2",
#     "mcp",
# ]
# ///
"""
HuggingFace Prompt Renderer MCP Server

Model Context Protocol (MCP) server for rendering conversation messages into
model-specific prompt strings using HuggingFace tokenizer chat templates.

Usage:
    # Run MCP server over stdio
    uv run cmd/prompt-rendering/server.py --mcp

    # Start FastAPI server for manual testing
    uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000

    # Test with curl
    curl -X POST http://localhost:8000/generate-prompt \\
        -H "Content-Type: application/json" \\
        -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
"""

from typing import Any, Dict, List, Optional

import argparse
import json

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
from transformers import AutoTokenizer

try:
    from mcp.server.fastmcp import FastMCP
except Exception:
    FastMCP = None

# Cache for tokenizers to avoid reloading
_tokenizer_cache: Dict[str, Any] = {}


class Message(BaseModel):
    role: str
    content: Optional[str] = None
    tool_calls: Optional[List[Dict[str, Any]]] = None
    tool_call_id: Optional[str] = None
    functions: Optional[str] = None  # For OLMo-style function passing
    function_calls: Optional[str] = None  # For OLMo-style function call results


class GeneratePromptRequest(BaseModel):
    messages: List[Message]
    # Default matches the README and the MCP tool; the basic curl examples
    # omit "model" and rely on this default.
    model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct"
    tools: Optional[List[Dict[str, Any]]] = None
    # Whether to inject tools into system message as 'functions' key (for OLMo-style templates)
    inject_tools_as_functions: Optional[bool] = True


class GeneratePromptResponse(BaseModel):
    prompt: str
    model: str


# FastAPI app
app = FastAPI(title="HuggingFace Prompt Generator", version="1.0.0")


def get_tokenizer(model_name: str) -> Any:
    """Get or create tokenizer for the given model."""
    if model_name not in _tokenizer_cache:
        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
    return _tokenizer_cache[model_name]


def is_deepseek_model(model_name: str) -> bool:
    """Check if this is a DeepSeek model."""
    return "deepseek" in model_name.lower()


def normalize_messages(
    raw_messages: List[Any],
    tools: Optional[List[Dict[str, Any]]],
    inject_tools_as_functions: bool,
    model: str,
) -> List[Dict[str, Any]]:
    """Normalize messages for different chat template formats."""
    messages: List[Dict[str, Any]] = []
    tools_json = json.dumps(tools) if tools else None
    is_deepseek = is_deepseek_model(model)

    for msg in raw_messages:
        message = msg if isinstance(msg, Message) else Message(**msg)
        message_dict: Dict[str, Any] = {"role": message.role, "content": None}

        if message.content is not None:
            message_dict["content"] = message.content

        # Handle explicit functions field (OLMo-style)
        if message.functions is not None:
            message_dict["functions"] = message.functions
        # Inject tools into system message as 'functions' (for OLMo templates)
        elif inject_tools_as_functions and message.role == "system" and tools_json:
            message_dict["functions"] = tools_json

        # Handle explicit function_calls field (OLMo-style)
        if message.function_calls is not None:
            message_dict["function_calls"] = message.function_calls
        # Convert tool_calls for templates
        elif message.tool_calls is not None:
            if is_deepseek:
                # DeepSeek format: arguments must be a JSON string
                tool_calls = []
                for tool_call in message.tool_calls:
                    tc = {
                        "type": "function",
                        "function": {
                            "name": tool_call["function"]["name"],
                            "arguments": json.dumps(tool_call["function"]["arguments"])
                            if isinstance(tool_call["function"]["arguments"], dict)
                            else tool_call["function"]["arguments"],
                        },
                    }
                    tool_calls.append(tc)
                message_dict["tool_calls"] = tool_calls
            elif inject_tools_as_functions:
                # Convert to OLMo function_calls format
                message_dict["function_calls"] = json.dumps(message.tool_calls)
            else:
                # Standard transformers format
                tool_calls = []
                for tool_call in message.tool_calls:
                    tool_call_copy = tool_call.copy()
                    if (
                        "function" in tool_call_copy
                        and "arguments" in tool_call_copy["function"]
                    ):
                        try:
                            tool_call_copy["function"]["arguments"] = json.loads(
                                tool_call_copy["function"]["arguments"]
                            )
                        except (json.JSONDecodeError, TypeError):
                            pass
                    tool_calls.append(tool_call_copy)
                message_dict["tool_calls"] = tool_calls

        if message.tool_call_id is not None:
            message_dict["tool_call_id"] = message.tool_call_id

        messages.append(message_dict)

    return messages
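
# Illustrative note (not executed): with inject_tools_as_functions=True, an
# OpenAI-style assistant turn like
#     {"role": "assistant", "tool_calls": [{"type": "function",
#      "function": {"name": "get_weather", "arguments": {"location": "SF"}}}]}
# is normalized to
#     {"role": "assistant", "content": None,
#      "function_calls": "<the tool_calls list as a JSON string>"}
# while a system message receives the serialized tool definitions under a
# "functions" key. With the flag off, tool_calls keep their structure and any
# string arguments are parsed back into dicts for standard templates.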


def build_prompt(
    raw_messages: List[Any],
    model: str,
    tools: Optional[List[Dict[str, Any]]],
    inject_tools_as_functions: bool,
) -> str:
    """Build prompt from messages using the model's chat template."""
    messages = normalize_messages(
        raw_messages=raw_messages,
        tools=tools,
        inject_tools_as_functions=inject_tools_as_functions,
        model=model,
    )

    tokenizer = get_tokenizer(model)

    # Standard transformers templates receive tools via apply_chat_template;
    # for OLMo-style templates the tools are already injected into the
    # messages, so they are not passed separately.
    if tools and not inject_tools_as_functions:
        prompt = tokenizer.apply_chat_template(
            messages,
            tools=tools,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

    return prompt


@app.post("/generate-prompt", response_model=GeneratePromptResponse)
async def generate_prompt(request: GeneratePromptRequest):
    """
    Generate a prompt from messages using the specified model's chat template.
    Optionally includes tool definitions if provided.
    """
    try:
        prompt = build_prompt(
            raw_messages=request.messages,
            model=request.model,
            tools=request.tools,
            inject_tools_as_functions=request.inject_tools_as_functions,
        )
        return GeneratePromptResponse(prompt=prompt, model=request.model)

    except Exception as e:
        import traceback

        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"Failed to generate prompt: {str(e)}",
        )


@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}


if FastMCP is not None:
    mcp = FastMCP("huggingface-prompt-renderer")

    @mcp.tool()
    def generate_prompt_tool(
        messages: List[Dict[str, Any]],
        model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct",
        tools: Optional[List[Dict[str, Any]]] = None,
        inject_tools_as_functions: bool = True,
    ) -> Dict[str, str]:
        """
        Render conversation messages into a model-specific prompt string using HuggingFace tokenizer chat templates.

        This tool takes a list of message objects and applies the target model's chat template to produce
        the exact prompt string that would be fed to the model. It handles various message formats including
        standard OpenAI-style, OLMo-style (functions/function_calls), and DeepSeek-specific formatting.

        Use this tool to:
        - Verify that a model's chat template correctly formats your conversation
        - Test edge cases: tool calling, tool responses, interleaved thinking and tool calls, multiple tools in a single response
        - Compare prompt output across different models to understand template differences
        - Debug issues with message formatting that cause unexpected model behavior

        Message format supports:
        - role: "user", "assistant", "system", "tool"
        - content: string content of the message
        - tool_calls: list of tool call objects (OpenAI format: {type, function: {name, arguments}})
        - tool_call_id: for tool role messages, references the call being responded to
        - functions: optional field for OLMo-style tool definitions
        - function_calls: optional field for OLMo-style tool call results

        Parameters:
        - messages: List of message dictionaries forming the conversation
        - model: HuggingFace model identifier (default: Qwen/Qwen3-Coder-480B-A35B-Instruct)
        - tools: Optional list of tool/function definitions for function calling models
        - inject_tools_as_functions: If True, injects tools into the system message as a 'functions' key (OLMo-style). If False, passes tools separately to apply_chat_template.

        Returns: Dictionary with 'prompt' (rendered string) and 'model' keys.

        Recommended test cases:
        1. Simple conversation: user -> assistant
        2. Tool calling: user -> assistant with tool_call -> tool response -> assistant
        3. Multiple tool calls in one assistant message
        4. Multiple tool responses interleaved with assistant reasoning
        5. Nested tool calls (assistant calls a tool, uses the result to call another)
        6. System message with tool definitions
        7. Empty or None content in messages
        8. Very long messages to test truncation handling
        """
        prompt = build_prompt(
            raw_messages=messages,
            model=model,
            tools=tools,
            inject_tools_as_functions=inject_tools_as_functions,
        )
        return {"prompt": prompt, "model": model}
else:
    mcp = None


def main():
    parser = argparse.ArgumentParser(
        description="HuggingFace Prompt Renderer MCP Server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--mcp", action="store_true", help="Run MCP server over stdio"
    )
    parser.add_argument("--host", default="0.0.0.0", help="FastAPI host")
    parser.add_argument("--port", type=int, default=8000, help="FastAPI port")
    args = parser.parse_args()

    if args.mcp:
        if mcp is None:
            raise RuntimeError("MCP server requested but mcp is not installed.")
        mcp.run()
    else:
        uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()