Compare commits: parth/decr...parth/prom (1 commit: 6d9a37ad62)

cmd/prompt-rendering/README.md (new file, 156 lines)

@@ -0,0 +1,156 @@
# HuggingFace Prompt Renderer MCP Server

Model Context Protocol (MCP) server for rendering conversation messages into
model-specific prompt strings using HuggingFace tokenizer chat templates.

## Requirements

- [uv](https://docs.astral.sh/uv/) - Fast Python package installer

## Usage

### MCP Server Mode

Run the MCP server over stdio for use with MCP clients:

```bash
uv run cmd/prompt-rendering/server.py --mcp
```

Add to your MCP client configuration (e.g., for Claude Desktop):

```json
{
  "mcpServers": {
    "huggingface-prompt-renderer": {
      "command": "uv",
      "args": [
        "run",
        "--directory",
        "<path-to-ollama-repo>",
        "cmd/prompt-rendering/server.py",
        "--mcp"
      ]
    }
  }
}
```

### FastAPI Server Mode

Start a FastAPI server for manual HTTP testing:

```bash
# Start on default port 8000
uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000

# Start on custom port
uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 9000
```

#### Endpoints

| Method | Path | Description |
|--------|------|-------------|
| POST | `/generate-prompt` | Generate prompt from messages |
| GET | `/health` | Health check |

### Test with curl

```bash
# Basic user message
curl -X POST http://localhost:8000/generate-prompt \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [{"role": "user", "content": "Hello!"}]
  }'

# With tools
curl -X POST http://localhost:8000/generate-prompt \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "system", "content": "You are a helpful assistant."},
      {"role": "user", "content": "What is the weather?"}
    ],
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "description": "Get the current weather",
        "parameters": {
          "type": "object",
          "required": ["location"],
          "properties": {
            "location": {"type": "string", "description": "The city"}
          }
        }
      }
    }]
  }'

# With tool calls
curl -X POST http://localhost:8000/generate-prompt \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is the weather in SF?"},
      {
        "role": "assistant",
        "tool_calls": [{
          "id": "call_1",
          "type": "function",
          "function": {
            "name": "get_weather",
            "arguments": {"location": "San Francisco"}
          }
        }]
      },
      {"role": "tool", "content": "{\"temperature\": 68}", "tool_call_id": "call_1"}
    ],
    "tools": [{
      "type": "function",
      "function": {
        "name": "get_weather",
        "parameters": {
          "type": "object",
          "properties": {"location": {"type": "string"}}
        }
      }
    }]
  }'
```
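
The same requests can be sent programmatically; here is a minimal Python sketch using only the standard library, assuming the FastAPI server is already running on `localhost:8000`:

```python
# Minimal sketch: POST a conversation to the local prompt-rendering server.
import json
import urllib.request

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
}

req = urllib.request.Request(
    "http://localhost:8000/generate-prompt",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib.request.urlopen(req) as resp:
    body = json.loads(resp.read())

# The response mirrors GeneratePromptResponse: {"prompt": "...", "model": "..."}
print(body["prompt"])
```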

## Supported Message Formats

The server supports multiple message formats:

| Format | Description |
|--------|-------------|
| OpenAI | Standard `role`, `content`, `tool_calls`, `tool_call_id` |
| OLMo | Adds `functions` and `function_calls` fields |
| DeepSeek | Tool call arguments must be JSON strings |
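
To make the difference concrete, here is an illustrative sketch (hypothetical values, not server code) of the same assistant turn in OpenAI style and in the OLMo-style fields that `normalize_messages` produces:

```python
import json

# OpenAI-style assistant turn: a structured tool_calls list.
openai_style = {
    "role": "assistant",
    "tool_calls": [{
        "id": "call_1",
        "type": "function",
        "function": {"name": "get_weather", "arguments": {"location": "Paris"}},
    }],
}

# OLMo-style equivalents: the same data carried as JSON strings in the
# dedicated `functions` (tool definitions) and `function_calls` fields,
# which is what the server emits when inject_tools_as_functions is true.
olmo_style_system = {
    "role": "system",
    "content": "You are a helpful assistant.",
    "functions": json.dumps([{"type": "function", "function": {"name": "get_weather"}}]),
}
olmo_style_assistant = {
    "role": "assistant",
    "function_calls": json.dumps(openai_style["tool_calls"]),
}

# For DeepSeek models, the server instead keeps tool_calls structured but
# serializes each call's "arguments" value to a JSON string.
```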

## Tool Support

| Setting | Description |
|---------|-------------|
| `inject_tools_as_functions=true` | Injects tools into system message as `functions` key (OLMo-style) |
| `inject_tools_as_functions=false` | Passes tools separately to `apply_chat_template` (standard transformers) |
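
For example, a request body that opts into the standard transformers path might look like this sketch (same `/generate-prompt` endpoint as above; the flag only changes how the tools reach the chat template):

```python
# Sketch of a /generate-prompt payload with OLMo-style injection disabled,
# so the tools list is passed straight to apply_chat_template().
payload = {
    "messages": [{"role": "user", "content": "What is the weather?"}],
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    "tools": [{
        "type": "function",
        "function": {
            "name": "get_weather",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
            },
        },
    }],
    "inject_tools_as_functions": False,
}
```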

## Models

The server uses HuggingFace's `transformers` library and supports any model
with a chat template. Default: `Qwen/Qwen3-Coder-480B-A35B-Instruct`
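
One way to check a candidate model before pointing the server at it is to load its tokenizer and confirm a chat template is present; this is a standalone sketch, not part of the server:

```python
# Sketch: verify that a model ships a chat template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen3-Coder-480B-A35B-Instruct", trust_remote_code=True
)
print(getattr(tokenizer, "chat_template", None) is not None)
```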

## Dependencies

The script uses PEP 723 inline dependency metadata. When run with `uv`,
dependencies are automatically installed into an isolated environment:

- `fastapi` - Web framework
- `uvicorn` - ASGI server
- `transformers` - HuggingFace tokenizer
- `jinja2` - Template engine
- `mcp` - Model Context Protocol
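
For reference, this is the inline metadata block that `uv run` reads from the top of `server.py`:

```python
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "fastapi",
#     "uvicorn",
#     "transformers",
#     "jinja2",
#     "mcp",
# ]
# ///
```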

cmd/prompt-rendering/server.py (new file, 311 lines)

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "fastapi",
#     "uvicorn",
#     "transformers",
#     "jinja2",
#     "mcp",
# ]
# ///
"""
HuggingFace Prompt Renderer MCP Server

Model Context Protocol (MCP) server for rendering conversation messages into
model-specific prompt strings using HuggingFace tokenizer chat templates.

Usage:
    # Run MCP server over stdio
    uv run cmd/prompt-rendering/server.py --mcp

    # Start FastAPI server for manual testing
    uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000

    # Test with curl
    curl -X POST http://localhost:8000/generate-prompt \\
        -H "Content-Type: application/json" \\
        -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
"""

from typing import Any, Dict, List, Optional

import argparse
import json

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
from transformers import AutoTokenizer

try:
    from mcp.server.fastmcp import FastMCP
except Exception:
    FastMCP = None

# Cache for tokenizers to avoid reloading
_tokenizer_cache: Dict[str, Any] = {}


class Message(BaseModel):
    role: str
    content: Optional[str] = None
    tool_calls: Optional[List[Dict[str, Any]]] = None
    tool_call_id: Optional[str] = None
    functions: Optional[str] = None  # For OLMo-style function passing
    function_calls: Optional[str] = None  # For OLMo-style function call results


class GeneratePromptRequest(BaseModel):
    messages: List[Message]
    # Default matches the MCP tool and the README ("Default: Qwen/Qwen3-Coder-480B-A35B-Instruct");
    # the basic curl example omits "model", so the field needs a default to be accepted.
    model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct"
    tools: Optional[List[Dict[str, Any]]] = None
    # Whether to inject tools into system message as 'functions' key (for OLMo-style templates)
    inject_tools_as_functions: Optional[bool] = True


class GeneratePromptResponse(BaseModel):
    prompt: str
    model: str


# FastAPI app
app = FastAPI(title="HuggingFace Prompt Generator", version="1.0.0")


def get_tokenizer(model_name: str) -> Any:
    """Get or create tokenizer for the given model."""
    if model_name not in _tokenizer_cache:
        _tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
    return _tokenizer_cache[model_name]


def is_deepseek_model(model_name: str) -> bool:
    """Check if this is a DeepSeek model."""
    return "deepseek" in model_name.lower()


def normalize_messages(
    raw_messages: List[Any],
    tools: Optional[List[Dict[str, Any]]],
    inject_tools_as_functions: bool,
    model: str,
) -> List[Dict[str, Any]]:
    """Normalize messages for different chat template formats."""
    messages: List[Dict[str, Any]] = []
    tools_json = json.dumps(tools) if tools else None
    is_deepseek = is_deepseek_model(model)

    for msg in raw_messages:
        message = msg if isinstance(msg, Message) else Message(**msg)
        message_dict: Dict[str, Any] = {"role": message.role, "content": None}

        if message.content is not None:
            message_dict["content"] = message.content

        # Handle explicit functions field (OLMo-style)
        if message.functions is not None:
            message_dict["functions"] = message.functions
        # Inject tools into system message as 'functions' (for OLMo templates)
        elif inject_tools_as_functions and message.role == "system" and tools_json:
            message_dict["functions"] = tools_json

        # Handle explicit function_calls field (OLMo-style)
        if message.function_calls is not None:
            message_dict["function_calls"] = message.function_calls
        # Convert tool_calls for templates
        elif message.tool_calls is not None:
            if is_deepseek:
                # DeepSeek format: arguments must be a JSON string
                tool_calls = []
                for tool_call in message.tool_calls:
                    tc = {
                        "type": "function",
                        "function": {
                            "name": tool_call["function"]["name"],
                            "arguments": json.dumps(tool_call["function"]["arguments"])
                            if isinstance(tool_call["function"]["arguments"], dict)
                            else tool_call["function"]["arguments"],
                        },
                    }
                    tool_calls.append(tc)
                message_dict["tool_calls"] = tool_calls
            elif inject_tools_as_functions:
                # Convert to OLMo function_calls format
                message_dict["function_calls"] = json.dumps(message.tool_calls)
            else:
                # Standard transformers format
                tool_calls = []
                for tool_call in message.tool_calls:
                    tool_call_copy = tool_call.copy()
                    if (
                        "function" in tool_call_copy
                        and "arguments" in tool_call_copy["function"]
                    ):
                        try:
                            tool_call_copy["function"]["arguments"] = json.loads(
                                tool_call_copy["function"]["arguments"]
                            )
                        except (json.JSONDecodeError, TypeError):
                            pass
                    tool_calls.append(tool_call_copy)
                message_dict["tool_calls"] = tool_calls

        if message.tool_call_id is not None:
            message_dict["tool_call_id"] = message.tool_call_id

        messages.append(message_dict)

    return messages


def build_prompt(
    raw_messages: List[Any],
    model: str,
    tools: Optional[List[Dict[str, Any]]],
    inject_tools_as_functions: bool,
) -> str:
    """Build prompt from messages using the model's chat template."""
    messages = normalize_messages(
        raw_messages=raw_messages,
        tools=tools,
        inject_tools_as_functions=inject_tools_as_functions,
        model=model,
    )

    tokenizer = get_tokenizer(model)

    # For OLMo-style templates, don't pass tools separately (they're in messages)
    if tools and not inject_tools_as_functions:
        prompt = tokenizer.apply_chat_template(
            messages,
            tools=tools,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

    return prompt


@app.post("/generate-prompt", response_model=GeneratePromptResponse)
async def generate_prompt(request: GeneratePromptRequest):
    """
    Generate a prompt from messages using the specified model's chat template.
    Optionally includes tool definitions if provided.
    """
    try:
        prompt = build_prompt(
            raw_messages=request.messages,
            model=request.model,
            tools=request.tools,
            inject_tools_as_functions=request.inject_tools_as_functions,
        )
        return GeneratePromptResponse(prompt=prompt, model=request.model)

    except Exception as e:
        import traceback

        traceback.print_exc()
        raise HTTPException(
            status_code=500,
            detail=f"Failed to generate prompt: {str(e)}",
        )


@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy"}


if FastMCP is not None:
    mcp = FastMCP("huggingface-prompt-renderer")

    @mcp.tool()
    def generate_prompt_tool(
        messages: List[Dict[str, Any]],
        model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct",
        tools: Optional[List[Dict[str, Any]]] = None,
        inject_tools_as_functions: bool = True,
    ) -> Dict[str, str]:
        """
        Render conversation messages into a model-specific prompt string using HuggingFace tokenizer chat templates.

        This tool takes a list of message objects and applies the target model's chat template to produce
        the exact prompt string that would be fed to the model. It handles various message formats including
        standard OpenAI-style, OLMo-style (functions/function_calls), and DeepSeek-specific formatting.

        Use this tool to:
        - Verify that a model's chat template correctly formats your conversation
        - Test edge cases: tool calling, tool responses, interleaved thinking and tool calls, multiple tools in single response
        - Compare prompt output across different models to understand template differences
        - Debug issues with message formatting that cause unexpected model behavior

        Message format supports:
        - role: "user", "assistant", "system", "tool"
        - content: string content of the message
        - tool_calls: list of tool call objects (OpenAI format: {type, function: {name, arguments}})
        - tool_call_id: for tool role messages, references the call being responded to
        - functions: optional field for OLMo-style tool definitions
        - function_calls: optional field for OLMo-style tool call results

        Parameters:
        - messages: List of message dictionaries forming the conversation
        - model: HuggingFace model identifier (default: Qwen/Qwen3-Coder-480B-A35B-Instruct)
        - tools: Optional list of tool/function definitions for function calling models
        - inject_tools_as_functions: If True, injects tools into system message as 'functions' key (OLMo-style). If False, passes tools separately to apply_chat_template.

        Returns: Dictionary with 'prompt' (rendered string) and 'model' keys.

        Recommended test cases:
        1. Simple conversation: user -> assistant
        2. Tool calling: user -> assistant with tool_call -> tool response -> assistant
        3. Multiple tool calls in one assistant message
        4. Multiple tool responses interleaved with assistant reasoning
        5. Nested tool calls (assistant calls tool, uses result to call another)
        6. System message with tool definitions
        7. Empty or None content in messages
        8. Very long messages to test truncation handling
        """
        prompt = build_prompt(
            raw_messages=messages,
            model=model,
            tools=tools,
            inject_tools_as_functions=inject_tools_as_functions,
        )
        return {"prompt": prompt, "model": model}
else:
    mcp = None


def main():
    parser = argparse.ArgumentParser(
        description="HuggingFace Prompt Renderer MCP Server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--mcp", action="store_true", help="Run MCP server over stdio"
    )
    parser.add_argument("--host", default="0.0.0.0", help="FastAPI host")
    parser.add_argument("--port", type=int, default=8000, help="FastAPI port")
    args = parser.parse_args()

    if args.mcp:
        if mcp is None:
            raise RuntimeError("MCP server requested but mcp is not installed.")
        mcp.run()
    else:
        uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()