#!/usr/bin/env node

/**
 * Duplicate Issue Detector
 *
 * Triggered on new issue creation. Compares the new issue against an
 * existing embedding index, then uses an LLM to confirm duplicates before
 * posting a comment for maintainer review.
 */

import { pipeline } from '@huggingface/transformers';
import { existsSync, readFileSync } from 'node:fs';
import {
  addLabel,
  dotProduct,
  fetchIssues,
  getIssue,
  issueText,
  postComment,
} from './utils.mjs';

// Minimum similarity score a candidate needs to be considered at all.
const SIMILARITY_THRESHOLD = 0.55;
// Maximum number of candidates handed to the LLM.
const TOP_K = 5;
// Maximum number of candidates listed in the posted comment.
const MAX_COMMENT_CANDIDATES = 3;
const MODEL_NAME = process.env.EMBEDDING_MODEL || 'Xenova/all-MiniLM-L6-v2';
const GROQ_MODEL = process.env.GROQ_MODEL || 'llama-3.3-70b-versatile';
const GROQ_CHAT_URL = 'https://api.groq.com/openai/v1/chat/completions';
const GROQ_TIMEOUT_MS = 30_000;
const INDEX_PATH = 'issue_index.json';
const LABEL_NAME = 'possible-duplicate';
const GROQ_API_KEY = process.env.GROQ_API_KEY || '';
// NaN when the variable is missing or not numeric; validated in main().
const ISSUE_NUMBER = Number.parseInt(process.env.ISSUE_NUMBER, 10);
// Authors with more than this many recent issues are skipped as likely spam.
const MAX_RECENT_ISSUES_PER_AUTHOR = 10;
const RECENT_WINDOW_MS = 60 * 60 * 1000;

/**
 * Reads and parses the embedding index from disk.
 *
 * Exits the process with status 1 when the file does not exist.
 *
 * @param {string} path - Location of the JSON index file.
 * @returns {{ issues: object[], embeddings: Array }} Parsed index.
 * @throws {SyntaxError} When the file is not valid JSON.
 * @throws {Error} When `issues`/`embeddings` are missing or there are fewer
 *   embeddings than issues.
 */
function loadIndex(path) {
  if (!existsSync(path)) {
    console.error(
      `Index file not found at ${path}. Run build-index.mjs first.`
    );
    process.exit(1);
  }

  const data = JSON.parse(readFileSync(path, 'utf-8'));

  // findSimilar() pairs issues[i] with embeddings[i]; fail here with a clear
  // message rather than deep inside the similarity computation.
  if (!Array.isArray(data?.issues) || !Array.isArray(data?.embeddings)) {
    throw new Error(
      `Index at ${path} is malformed: expected "issues" and "embeddings" arrays`
    );
  }
  if (data.embeddings.length < data.issues.length) {
    throw new Error(
      `Index at ${path} is malformed: ${data.issues.length} issues but only ${data.embeddings.length} embeddings`
    );
  }

  console.log(`Loaded index with ${data.issues.length} issues`);
  return data;
}

/**
 * Ranks indexed issues by their `dotProduct` score against the query
 * embedding.
 *
 * NOTE(review): the query is extracted with `normalize: true`; the score is
 * presumably a cosine similarity if the indexed embeddings are normalized
 * as well - confirm against build-index.mjs.
 *
 * @param {number[]} queryEmbedding - Embedding of the new issue.
 * @param {{ issues: object[], embeddings: Array }} index - Loaded index.
 * @param {object} [options]
 * @param {number} [options.topK=TOP_K] - Maximum number of results.
 * @param {number} [options.threshold=SIMILARITY_THRESHOLD] - Minimum score.
 * @param {number} [options.excludeNumber] - Issue number to leave out
 *   (the new issue itself).
 * @returns {object[]} Index entries with an added `score`, best match first.
 */
function findSimilar(
  queryEmbedding,
  index,
  { topK = TOP_K, threshold = SIMILARITY_THRESHOLD, excludeNumber } = {}
) {
  const { issues, embeddings } = index;
  if (!issues.length) return [];

  // Filter before sorting: fewer elements to sort, and NaN scores (which
  // fail the >= test) never reach the comparator.
  const matches = [];
  issues.forEach((issue, i) => {
    if (excludeNumber != null && issue.number === excludeNumber) return;
    const score = dotProduct(queryEmbedding, embeddings[i]);
    if (score >= threshold) {
      matches.push({ ...issue, score });
    }
  });

  return matches.sort((a, b) => b.score - a.score).slice(0, topK);
}

const CONFIRM_SYSTEM_PROMPT = `You are a GitHub issue triage assistant. You will be given a NEW issue and one \
or more CANDIDATE issues that may be duplicates.

For each candidate, determine if the new issue is truly a duplicate (same root \
problem/request) or merely related (similar area but different issue).

Respond ONLY with a JSON array of objects, each with:
- "number": the candidate issue number
- "duplicate": true or false
- "reason": one-sentence explanation

Example: [{"number": 123, "duplicate": true, "reason": "Both report the same crash when ..."}]`;

/**
 * Removes a surrounding Markdown code fence from an LLM reply, if present.
 *
 * @param {string} text - Trimmed reply text.
 * @returns {string} Reply without the opening fence line and closing fence.
 */
function stripCodeFence(text) {
  if (!text.startsWith('```')) return text;
  return text
    .split('\n')
    .slice(1)
    .join('\n')
    .replace(/```\s*$/, '')
    .trim();
}

/**
 * Normalizes an issue number that may arrive as 123, "123" or "#123".
 *
 * @param {unknown} value - Raw value from the index or the LLM reply.
 * @returns {number} The numeric issue number, or NaN when not parseable.
 */
function toIssueNumber(value) {
  return Number(String(value).trim().replace(/^#/, ''));
}

/**
 * Interprets the `duplicate` field of an LLM verdict strictly, so that a
 * string such as "false" is not treated as a confirmation.
 *
 * @param {unknown} flag - Raw `duplicate` value.
 * @returns {boolean} True only for `true` or the string "true".
 */
function isAffirmative(flag) {
  if (flag === true) return true;
  return typeof flag === 'string' && flag.trim().toLowerCase() === 'true';
}

/**
 * Asks the Groq chat-completions API which candidates are true duplicates.
 *
 * Best-effort by design: when no API key is configured, or when the request
 * or the parsing of its reply fails, the unfiltered candidates are returned.
 *
 * @param {object} newIssue - The new issue (`number`, `title`, `body`).
 * @param {object[]} candidates - Results of findSimilar().
 * @returns {Promise<object[]>} Confirmed candidates as copies carrying an
 *   `llm_reason`, or the original candidates on fallback.
 */
async function confirmWithLlm(newIssue, candidates) {
  if (!GROQ_API_KEY) {
    console.warn('GROQ_API_KEY not set — skipping LLM confirmation');
    return candidates;
  }

  const candidateText = candidates
    .map((c) =>
      [
        `### Candidate #${c.number} (similarity: ${c.score.toFixed(2)})`,
        `**Title:** ${c.title}`,
        `**State:** ${c.state}`,
        `**Body preview:** ${(c.body_preview || 'N/A').slice(0, 500)}`,
      ].join('\n')
    )
    .join('\n\n');

  const userPrompt = [
    `## NEW ISSUE #${newIssue.number}`,
    `**Title:** ${newIssue.title}`,
    '**Body:**',
    (newIssue.body || 'No body').slice(0, 1500),
    '',
    '---',
    '',
    '## CANDIDATES',
    candidateText,
  ].join('\n');

  try {
    const resp = await fetch(GROQ_CHAT_URL, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${GROQ_API_KEY}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: GROQ_MODEL,
        messages: [
          { role: 'system', content: CONFIRM_SYSTEM_PROMPT },
          { role: 'user', content: userPrompt },
        ],
        temperature: 0.1,
        max_tokens: 1024,
      }),
      signal: AbortSignal.timeout(GROQ_TIMEOUT_MS),
    });

    if (!resp.ok) {
      const text = await resp.text();
      throw new Error(`Groq API error ${resp.status}: ${text}`);
    }

    const payload = await resp.json();
    const content = stripCodeFence(payload.choices[0].message.content.trim());

    const verdicts = JSON.parse(content);
    if (!Array.isArray(verdicts)) {
      throw new Error('Invalid LLM response format - expected array');
    }

    // Key by normalized number so "123" / "#123" still match; ignore
    // entries that are not objects instead of discarding every verdict.
    const verdictMap = new Map();
    for (const v of verdicts) {
      if (v === null || typeof v !== 'object') continue;
      const key = toIssueNumber(v.number);
      if (Number.isInteger(key)) {
        verdictMap.set(key, v);
      }
    }

    const confirmed = [];
    for (const c of candidates) {
      const verdict = verdictMap.get(toIssueNumber(c.number));
      if (isAffirmative(verdict?.duplicate)) {
        // Copy rather than mutate the caller's candidate objects.
        confirmed.push({ ...c, llm_reason: verdict.reason || '' });
      } else {
        const reason = verdict?.reason || 'not evaluated';
        console.log(` #${c.number} ruled out by LLM: ${reason}`);
      }
    }
    return confirmed;
  } catch (err) {
    console.warn(
      `LLM confirmation failed: ${err.message} - falling back to all candidates`
    );
    return candidates;
  }
}

/**
 * Builds the Markdown comment that lists the likely duplicates.
 *
 * @param {object[]} candidates - Candidates with `number`, `title`, `score`
 *   and optionally `llm_reason`.
 * @returns {string} Comment body; at most MAX_COMMENT_CANDIDATES entries.
 */
function formatComment(candidates) {
  const lines = [
    '**Possible duplicate detected**',
    '',
    'This issue may be a duplicate of the following (detected via semantic similarity + LLM review):',
    '',
  ];

  for (const c of candidates.slice(0, MAX_COMMENT_CANDIDATES)) {
    const confidence = `${(c.score * 100).toFixed(0)}%`;
    let line = `- #${c.number} (${confidence} match) — ${c.title}`;
    if (c.llm_reason) {
      // The reason is free-form LLM output; keep it on one line so embedded
      // newlines cannot break out of the quote/list structure.
      const reason = String(c.llm_reason).replace(/\s+/g, ' ').trim();
      line += `\n > *${reason}*`;
    }
    lines.push(line);
  }

  lines.push(
    '',
    'A maintainer will review this. If this is **not** a duplicate, no action is needed.',
    '',
    ''
  );

  return lines.join('\n');
}

/**
 * Entry point: fetches the issue, applies the skip rules, searches the index
 * for similar issues, confirms them with the LLM, and finally posts a
 * comment and adds the LABEL_NAME label.
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (!Number.isInteger(ISSUE_NUMBER) || ISSUE_NUMBER <= 0) {
    console.error('ISSUE_NUMBER not set or not a positive integer');
    process.exit(1);
  }

  console.log(`Processing issue #${ISSUE_NUMBER}`);
  const issue = await getIssue(ISSUE_NUMBER);

  // Cheap local checks first, before spending another API request.
  if (issue.pull_request) {
    console.log('Skipping - this is a pull request');
    return;
  }

  if (issue.user.type === 'Bot') {
    console.log('Skipping - issue created by bot');
    return;
  }

  // NOTE(review): if fetchIssues forwards `since` to the GitHub issues API,
  // it filters by last-update time rather than creation time, so this may
  // over-count - confirm against utils.mjs.
  const oneHourAgo = new Date(Date.now() - RECENT_WINDOW_MS).toISOString();
  const recentIssues = await fetchIssues({
    creator: issue.user.login,
    since: oneHourAgo,
    state: 'all',
  });

  if (recentIssues.length > MAX_RECENT_ISSUES_PER_AUTHOR) {
    console.log(
      `User ${issue.user.login} created ${recentIssues.length} issues in the last hour - skipping to prevent spam`
    );
    return;
  }

  // Load the index before the model so a missing or broken index fails fast.
  const index = loadIndex(INDEX_PATH);

  console.log(`Loading model: ${MODEL_NAME}`);
  const extractor = await pipeline('feature-extraction', MODEL_NAME, {
    dtype: 'fp32',
  });

  const text = issueText(issue.title, issue.body);
  const output = await extractor(text, { pooling: 'mean', normalize: true });
  const queryEmbedding = output.tolist()[0];

  let candidates = findSimilar(queryEmbedding, index, {
    topK: TOP_K,
    threshold: SIMILARITY_THRESHOLD,
    excludeNumber: issue.number,
  });

  if (!candidates.length) {
    console.log('No similar issues found above threshold - done');
    return;
  }

  console.log(`Found ${candidates.length} candidates above threshold:`);
  for (const c of candidates) {
    console.log(` #${c.number} (${c.score.toFixed(3)}) - ${c.title}`);
  }

  console.log('Running LLM confirmation via Groq...');
  candidates = await confirmWithLlm(issue, candidates);

  if (!candidates.length) {
    console.log('LLM ruled out all candidates - done');
    return;
  }

  const comment = formatComment(candidates);
  await postComment(ISSUE_NUMBER, comment);
  await addLabel(ISSUE_NUMBER, LABEL_NAME);
  console.log('Done!');
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});