Compare commits


1 Commit

| Author | SHA1 | Message | Date |
|--------|------|---------|------|
| ParthSareen | 6d9a37ad62 | cmd: add prompt rendering mcp | 2025-12-23 16:41:25 -05:00 |
2 changed files with 467 additions and 0 deletions


@@ -0,0 +1,156 @@
# HuggingFace Prompt Renderer MCP Server
Model Context Protocol (MCP) server for rendering conversation messages into
model-specific prompt strings using HuggingFace tokenizer chat templates.
## Requirements
- [uv](https://docs.astral.sh/uv/) - Fast Python package and project manager (used to run the script with its inline dependencies)
## Usage
### MCP Server Mode
Run the MCP server over stdio for use with MCP clients:
```bash
uv run cmd/prompt-rendering/server.py --mcp
```
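You can also drive the server programmatically. Below is a minimal sketch using the `mcp` Python SDK's stdio client; it assumes you run it from the repository root (so the relative path to `server.py` resolves) and that `uv` is on your PATH. The tool name `generate_prompt_tool` comes from the function registered in `server.py`.

```python
# Minimal sketch: call generate_prompt_tool over stdio with the mcp SDK.
# Assumes this runs from the repository root and that `uv` is installed.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

server_params = StdioServerParameters(
    command="uv",
    args=["run", "cmd/prompt-rendering/server.py", "--mcp"],
)


async def main():
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "generate_prompt_tool",
                arguments={"messages": [{"role": "user", "content": "Hello!"}]},
            )
            # The tool returns a dict with 'prompt' and 'model' keys,
            # which the SDK surfaces as tool result content.
            print(result.content)


asyncio.run(main())
```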
Add to your MCP client configuration (e.g., for Claude Desktop):
```json
{
"mcpServers": {
"huggingface-prompt-renderer": {
"command": "uv",
"args": [
"run",
"--directory",
"<path-to-ollama-repo>",
"cmd/prompt-rendering/server.py",
"--mcp"
]
}
}
}
```
### FastAPI Server Mode
Start a FastAPI server for manual HTTP testing:
```bash
# Start on the default port (8000)
uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000

# Start on a custom port
uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 9000
```
#### Endpoints
| Method | Path | Description |
|--------|------|-------------|
| POST | `/generate-prompt` | Generate prompt from messages |
| GET | `/health` | Health check |
### Test with curl
```bash
# Basic user message
curl -X POST http://localhost:8000/generate-prompt \
-H "Content-Type: application/json" \
-d '{
"messages": [{"role": "user", "content": "Hello!"}]
}'
# With tools
curl -X POST http://localhost:8000/generate-prompt \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is the weather?"}
],
"model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
"tools": [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"required": ["location"],
"properties": {
"location": {"type": "string", "description": "The city"}
}
}
}
}]
}'
# With tool calls
curl -X POST http://localhost:8000/generate-prompt \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "What is the weather in SF?"},
{
"role": "assistant",
"tool_calls": [{
"id": "call_1",
"type": "function",
"function": {
"name": "get_weather",
"arguments": {"location": "San Francisco"}
}
}]
},
{"role": "tool", "content": "{\"temperature\": 68}", "tool_call_id": "call_1"}
],
"tools": [{
"type": "function",
"function": {
"name": "get_weather",
"parameters": {
"type": "object",
"properties": {"location": {"type": "string"}}
}
}
}]
}'
```
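The same requests can be made from Python. The sketch below uses only the standard library and assumes the FastAPI server is already running on `localhost:8000`; the response body contains the rendered `prompt` and the `model` it was rendered for.

```python
# Minimal sketch: POST to /generate-prompt using only the standard library.
# Assumes the FastAPI server is already running on localhost:8000.
import json
import urllib.request

payload = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
}

req = urllib.request.Request(
    "http://localhost:8000/generate-prompt",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)

with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

print(body["model"])   # echoes the model whose template was applied
print(body["prompt"])  # the rendered prompt string
```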
## Supported Message Formats
The server supports multiple message formats:
| Format | Description |
|--------|-------------|
| OpenAI | Standard `role`, `content`, `tool_calls`, `tool_call_id` |
| OLMo | Adds `functions` and `function_calls` fields |
| DeepSeek | Tool call arguments must be JSON strings |
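For illustration, the same tool-calling exchange can be written in either shape. Field names follow the `Message` model in `server.py`; the values themselves are made up:

```python
import json

# OpenAI-style: tool calls live in `tool_calls`, results reference `tool_call_id`.
openai_style = [
    {"role": "user", "content": "What is the weather in SF?"},
    {
        "role": "assistant",
        "tool_calls": [{
            "id": "call_1",
            "type": "function",
            # DeepSeek templates expect `arguments` to be a JSON string;
            # other templates accept a plain object (the server converts as needed).
            "function": {"name": "get_weather", "arguments": {"location": "San Francisco"}},
        }],
    },
    {"role": "tool", "content": "{\"temperature\": 68}", "tool_call_id": "call_1"},
]

# OLMo-style: tool definitions and calls travel as JSON strings in the
# `functions` / `function_calls` fields on the messages themselves.
olmo_style = [
    {
        "role": "system",
        "content": "You are a helpful assistant.",
        "functions": json.dumps([{
            "type": "function",
            "function": {"name": "get_weather", "parameters": {"type": "object"}},
        }]),
    },
    {"role": "user", "content": "What is the weather in SF?"},
    {
        "role": "assistant",
        "function_calls": json.dumps([{
            "id": "call_1",
            "type": "function",
            "function": {"name": "get_weather", "arguments": {"location": "San Francisco"}},
        }]),
    },
]
```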
## Tool Support
| Setting | Description |
|---------|-------------|
| `inject_tools_as_functions=true` | Injects tools into system message as `functions` key (OLMo-style) |
| `inject_tools_as_functions=false` | Passes tools separately to `apply_chat_template` (standard transformers) |
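The flag only controls where tool definitions enter the chat template. A sketch of two otherwise identical request payloads, mirroring what `build_prompt` in `server.py` does with each:

```python
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "parameters": {"type": "object", "properties": {"location": {"type": "string"}}},
    },
}]

base = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the weather?"},
    ],
    "model": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    "tools": tools,
}

# OLMo-style: the server serializes `tools` to JSON and attaches it to the
# system message under a `functions` key before rendering.
olmo_request = {**base, "inject_tools_as_functions": True}

# Standard transformers: the server forwards `tools` unchanged to
# tokenizer.apply_chat_template(..., tools=tools).
standard_request = {**base, "inject_tools_as_functions": False}
```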
## Models
The server uses HuggingFace's `transformers` library and supports any model
that ships a chat template. The default model is `Qwen/Qwen3-Coder-480B-A35B-Instruct`.
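Under the hood this is a plain `transformers` call; a minimal sketch mirroring `get_tokenizer` and `build_prompt` in `server.py`:

```python
# Minimal sketch of what the server does for any model with a chat template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen3-Coder-480B-A35B-Instruct", trust_remote_code=True
)
prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```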
## Dependencies
The script uses PEP 723 inline dependency metadata. When run with `uv`,
dependencies are automatically installed into an isolated environment:
- `fastapi` - Web framework
- `uvicorn` - ASGI server
- `transformers` - HuggingFace tokenizer
- `jinja2` - Template engine
- `mcp` - Model Context Protocol


@@ -0,0 +1,311 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "fastapi",
# "uvicorn",
# "transformers",
# "jinja2",
# "mcp",
# ]
# ///
"""
HuggingFace Prompt Renderer MCP Server
Model Context Protocol (MCP) server for rendering conversation messages into
model-specific prompt strings using HuggingFace tokenizer chat templates.
Usage:
# Run MCP server over stdio
uv run cmd/prompt-rendering/server.py --mcp
# Start FastAPI server for manual testing
uv run cmd/prompt-rendering/server.py --host 0.0.0.0 --port 8000
# Test with curl
curl -X POST http://localhost:8000/generate-prompt \\
-H "Content-Type: application/json" \\
-d '{"messages": [{"role": "user", "content": "Hello!"}]}'
"""
from typing import Any, Dict, List, Optional
import argparse
import json
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
from transformers import AutoTokenizer
try:
from mcp.server.fastmcp import FastMCP
except Exception:
FastMCP = None
# Cache for tokenizers to avoid reloading
_tokenizer_cache: Dict[str, Any] = {}
class Message(BaseModel):
role: str
content: Optional[str] = None
tool_calls: Optional[List[Dict[str, Any]]] = None
tool_call_id: Optional[str] = None
functions: Optional[str] = None # For OLMo-style function passing
function_calls: Optional[str] = None # For OLMo-style function call results
class GeneratePromptRequest(BaseModel):
messages: List[Message]
    model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct"  # default matches the README and the MCP tool
tools: Optional[List[Dict[str, Any]]] = None
# Whether to inject tools into system message as 'functions' key (for OLMo-style templates)
inject_tools_as_functions: Optional[bool] = True
class GeneratePromptResponse(BaseModel):
prompt: str
model: str
# FastAPI app
app = FastAPI(title="HuggingFace Prompt Generator", version="1.0.0")
def get_tokenizer(model_name: str) -> Any:
"""Get or create tokenizer for the given model."""
if model_name not in _tokenizer_cache:
_tokenizer_cache[model_name] = AutoTokenizer.from_pretrained(
model_name, trust_remote_code=True
)
return _tokenizer_cache[model_name]
def is_deepseek_model(model_name: str) -> bool:
"""Check if this is a DeepSeek model."""
return "deepseek" in model_name.lower()
def normalize_messages(
raw_messages: List[Any],
tools: Optional[List[Dict[str, Any]]],
inject_tools_as_functions: bool,
model: str,
) -> List[Dict[str, Any]]:
"""Normalize messages for different chat template formats."""
messages: List[Dict[str, Any]] = []
tools_json = json.dumps(tools) if tools else None
is_deepseek = is_deepseek_model(model)
for msg in raw_messages:
message = msg if isinstance(msg, Message) else Message(**msg)
message_dict: Dict[str, Any] = {"role": message.role, "content": None}
if message.content is not None:
message_dict["content"] = message.content
# Handle explicit functions field (OLMo-style)
if message.functions is not None:
message_dict["functions"] = message.functions
# Inject tools into system message as 'functions' (for OLMo templates)
elif inject_tools_as_functions and message.role == "system" and tools_json:
message_dict["functions"] = tools_json
# Handle explicit function_calls field (OLMo-style)
if message.function_calls is not None:
message_dict["function_calls"] = message.function_calls
# Convert tool_calls for templates
elif message.tool_calls is not None:
if is_deepseek:
# DeepSeek format: arguments must be a JSON string
tool_calls = []
for tool_call in message.tool_calls:
tc = {
"type": "function",
"function": {
"name": tool_call["function"]["name"],
"arguments": json.dumps(tool_call["function"]["arguments"])
if isinstance(tool_call["function"]["arguments"], dict)
else tool_call["function"]["arguments"],
},
}
tool_calls.append(tc)
message_dict["tool_calls"] = tool_calls
elif inject_tools_as_functions:
# Convert to OLMo function_calls format
message_dict["function_calls"] = json.dumps(message.tool_calls)
else:
# Standard transformers format
tool_calls = []
for tool_call in message.tool_calls:
tool_call_copy = tool_call.copy()
if (
"function" in tool_call_copy
and "arguments" in tool_call_copy["function"]
):
try:
tool_call_copy["function"]["arguments"] = json.loads(
tool_call_copy["function"]["arguments"]
)
except (json.JSONDecodeError, TypeError):
pass
tool_calls.append(tool_call_copy)
message_dict["tool_calls"] = tool_calls
if message.tool_call_id is not None:
message_dict["tool_call_id"] = message.tool_call_id
messages.append(message_dict)
return messages
def build_prompt(
raw_messages: List[Any],
model: str,
tools: Optional[List[Dict[str, Any]]],
inject_tools_as_functions: bool,
) -> str:
"""Build prompt from messages using the model's chat template."""
messages = normalize_messages(
raw_messages=raw_messages,
tools=tools,
inject_tools_as_functions=inject_tools_as_functions,
model=model,
)
tokenizer = get_tokenizer(model)
# For OLMo-style templates, don't pass tools separately (they're in messages)
if tools and not inject_tools_as_functions:
prompt = tokenizer.apply_chat_template(
messages,
tools=tools,
tokenize=False,
add_generation_prompt=True,
)
else:
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
)
return prompt
@app.post("/generate-prompt", response_model=GeneratePromptResponse)
async def generate_prompt(request: GeneratePromptRequest):
"""
Generate a prompt from messages using the specified model's chat template.
Optionally includes tool definitions if provided.
"""
try:
prompt = build_prompt(
raw_messages=request.messages,
model=request.model,
tools=request.tools,
inject_tools_as_functions=request.inject_tools_as_functions,
)
return GeneratePromptResponse(prompt=prompt, model=request.model)
except Exception as e:
import traceback
traceback.print_exc()
raise HTTPException(
status_code=500,
detail=f"Failed to generate prompt: {str(e)}",
)
@app.get("/health")
async def health_check():
"""Health check endpoint."""
return {"status": "healthy"}
if FastMCP is not None:
mcp = FastMCP("huggingface-prompt-renderer")
@mcp.tool()
def generate_prompt_tool(
messages: List[Dict[str, Any]],
model: str = "Qwen/Qwen3-Coder-480B-A35B-Instruct",
tools: Optional[List[Dict[str, Any]]] = None,
inject_tools_as_functions: bool = True,
) -> Dict[str, str]:
"""
Render conversation messages into a model-specific prompt string using HuggingFace tokenizer chat templates.
This tool takes a list of message objects and applies the target model's chat template to produce
the exact prompt string that would be fed to the model. It handles various message formats including
standard OpenAI-style, OLMo-style (functions/function_calls), and DeepSeek-specific formatting.
Use this tool to:
- Verify that a model's chat template correctly formats your conversation
- Test edge cases: tool calling, tool responses, interleaved thinking and tool calls, multiple tools in single response
- Compare prompt output across different models to understand template differences
- Debug issues with message formatting that cause unexpected model behavior
Message format supports:
- role: "user", "assistant", "system", "tool"
- content: string content of the message
- tool_calls: list of tool call objects (OpenAI format: {type, function: {name, arguments}})
- tool_call_id: for tool role messages, references the call being responded to
- functions: optional field for OLMo-style tool definitions
- function_calls: optional field for OLMo-style tool call results
Parameters:
- messages: List of message dictionaries forming the conversation
- model: HuggingFace model identifier (default: Qwen/Qwen3-Coder-480B-A35B-Instruct)
- tools: Optional list of tool/function definitions for function calling models
- inject_tools_as_functions: If True, injects tools into system message as 'functions' key (OLMo-style). If False, passes tools separately to apply_chat_template.
Returns: Dictionary with 'prompt' (rendered string) and 'model' keys.
Recommended test cases:
1. Simple conversation: user -> assistant
2. Tool calling: user -> assistant with tool_call -> tool response -> assistant
3. Multiple tool calls in one assistant message
4. Multiple tool responses interleaved with assistant reasoning
5. Nested tool calls (assistant calls tool, uses result to call another)
6. System message with tool definitions
7. Empty or None content in messages
8. Very long messages to test truncation handling
"""
prompt = build_prompt(
raw_messages=messages,
model=model,
tools=tools,
inject_tools_as_functions=inject_tools_as_functions,
)
return {"prompt": prompt, "model": model}
else:
mcp = None
def main():
parser = argparse.ArgumentParser(
description="HuggingFace Prompt Renderer MCP Server",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__,
)
parser.add_argument(
"--mcp", action="store_true", help="Run MCP server over stdio"
)
parser.add_argument("--host", default="0.0.0.0", help="FastAPI host")
parser.add_argument("--port", type=int, default=8000, help="FastAPI port")
args = parser.parse_args()
if args.mcp:
if mcp is None:
raise RuntimeError("MCP server requested but mcp is not installed.")
mcp.run()
else:
uvicorn.run(app, host=args.host, port=args.port)
if __name__ == "__main__":
main()