From e96057fb2384ae725bb212c937d2ee7a41a1dbef Mon Sep 17 00:00:00 2001
From: MartinBraquet
Date: Tue, 7 Apr 2026 17:38:08 +0200
Subject: [PATCH] Implement async profile extraction with caching and
 processing state handling

---
 backend/api/src/llm-extract-profile.ts   | 124 ++++++++++++++++++-----
 common/src/api/schema.ts                 |   5 +-
 web/components/optional-profile-form.tsx |  12 ++-
 3 files changed, 112 insertions(+), 29 deletions(-)

diff --git a/backend/api/src/llm-extract-profile.ts b/backend/api/src/llm-extract-profile.ts
index bac6a71c..380f13c8 100644
--- a/backend/api/src/llm-extract-profile.ts
+++ b/backend/api/src/llm-extract-profile.ts
@@ -35,11 +35,20 @@
 const MAX_CONTEXT_LENGTH = 7 * 10 * 30 * 50
 const USE_CACHE = true
 const CACHE_DIR = join(tmpdir(), 'compass-llm-cache')
 const CACHE_TTL_MS = 24 * 60 * 60 * 1000 // 24 hours
+const PROCESSING_TTL_MS = 10 * 60 * 1000 // 10 minutes
 
-function getCacheKey(content: string): string {
+interface ParsedBody {
+  content?: string
+  url?: string
+  locale?: string
+}
+
+function getCacheKey(parsedBody: ParsedBody): string {
   if (!USE_CACHE) return ''
   const hash = createHash('sha256')
-  hash.update(content)
+  // Normalize: sort keys for consistent hashing
+  const normalized = JSON.stringify(parsedBody, Object.keys(parsedBody).sort())
+  hash.update(normalized)
   return hash.digest('hex')
 }
@@ -215,6 +224,69 @@ async function setCachedResult(
   }
 }
 
+async function isProcessing(cacheKey: string): Promise<boolean> {
+  if (!USE_CACHE) return false
+  try {
+    const processingFile = join(CACHE_DIR, `${cacheKey}.processing`)
+    const stats = await fs.stat(processingFile)
+    // Check if processing lock is still valid (not expired)
+    if (Date.now() - stats.mtime.getTime() > PROCESSING_TTL_MS) {
+      // Stale processing lock, remove it
+      await fs.unlink(processingFile).catch(() => {})
+      return false
+    }
+    return true
+  } catch {
+    return false
+  }
+}
+
+async function setProcessing(cacheKey: string): Promise<void> {
+  if (!USE_CACHE) return
+  try {
+    await fs.mkdir(CACHE_DIR, {recursive: true})
+    const processingFile = join(CACHE_DIR, `${cacheKey}.processing`)
+    await fs.writeFile(processingFile, Date.now().toString(), 'utf-8')
+  } catch {
+    // Don't throw - processing flag failure shouldn't break the main flow
+  }
+}
+
+async function clearProcessing(cacheKey: string): Promise<void> {
+  if (!USE_CACHE) return
+  try {
+    const processingFile = join(CACHE_DIR, `${cacheKey}.processing`)
+    await fs.unlink(processingFile)
+  } catch {
+    // Ignore errors
+  }
+}
+
+async function processAndCache(
+  cacheKey: string,
+  content?: string | undefined,
+  url?: string | undefined,
+  locale?: string,
+): Promise<void> {
+  try {
+    let bio: JSONContent | undefined
+    if (!content) {
+      bio = await fetchOnlineProfile(url)
+      debug(JSON.stringify(bio, null, 2))
+      content = parseJsonContentToText(bio)
+    }
+    const profile = await callLLM(content, locale)
+    if (bio) {
+      profile.bio = bio
+    }
+    await setCachedResult(cacheKey, profile)
+  } catch (error) {
+    log('Async LLM processing failed', {cacheKey, error})
+  } finally {
+    await clearProcessing(cacheKey)
+  }
+}
+
 async function callGemini(text) {
   const apiKey = process.env.GEMINI_API_KEY
 
@@ -311,7 +383,10 @@ async function _callClaude(text) {
   return outputText
 }
 
-async function callLLM(content: string, locale?: string): Promise<Partial<ProfileWithoutUser>> {
+export async function callLLM(
+  content: string,
+  locale?: string,
+): Promise<Partial<ProfileWithoutUser>> {
   const [INTERESTS, CAUSE_AREAS, WORK_AREAS] = await Promise.all([
     getOptions('interests', locale),
     getOptions('causes', locale),
@@ -437,19 +512,10 @@ TEXT TO ANALYZE:
   const text = EXTRACTION_PROMPT + content
   if (text.length > MAX_CONTEXT_LENGTH) {
     log('Content exceeds maximum length, will be cropped', {length: text.length})
-    // throw APIErrors.badRequest('Content exceeds maximum length')
   }
 
   debug({text})
 
-  const cacheKey = getCacheKey(text)
-  const cached = await getCachedResult(cacheKey)
-  if (cached) {
-    debug('Using cached LLM result', {cacheKey: cacheKey.substring(0, 8)})
-    return cached
-  }
-
   const outputText = await callGemini(text)
-  // const outputText = JSON.stringify({})
 
   if (!outputText) {
     throw APIErrors.internalServerError('Failed to parse LLM response')
@@ -465,8 +531,6 @@ TEXT TO ANALYZE:
     throw APIErrors.internalServerError('Failed to parse extracted data')
   }
 
-  await setCachedResult(cacheKey, parsed)
-
   return parsed
 }
 
@@ -511,7 +575,7 @@ export const llmExtractProfileEndpoint: APIHandler<'llm-extract-profile'> = asyn
   auth,
 ) => {
   const {url, locale} = parsedBody
-  let content = parsedBody.content
+  const content = parsedBody.content
 
   log('Extracting profile from content', {
     contentLength: content?.length,
@@ -524,22 +588,28 @@ export const llmExtractProfileEndpoint: APIHandler<'llm-extract-profile'> = asyn
     throw APIErrors.badRequest('Content and URL cannot be provided together')
   }
 
-  let bio
-  if (!content) {
-    bio = await fetchOnlineProfile(url)
-    debug(JSON.stringify(bio, null, 2))
-    content = parseJsonContentToText(bio)
+  // Check cache based on parsedBody hash
+  const cacheKey = getCacheKey(parsedBody)
+  const cached = await getCachedResult(cacheKey)
+  if (cached) {
+    log('Returning cached profile', {cacheKey: cacheKey.substring(0, 8)})
+    return {profile: cached, status: 'success'}
   }
 
-  const extracted = await callLLM(content, locale)
-
-  if (bio) {
-    extracted.bio = bio
+  // Check if already processing
+  if (await isProcessing(cacheKey)) {
+    log('Profile extraction already in progress', {cacheKey: cacheKey.substring(0, 8)})
+    return {profile: {}, status: 'pending'}
   }
 
-  debug(JSON.stringify(bio))
+  // Start processing asynchronously
+  await setProcessing(cacheKey)
 
-  log('Profile extracted successfully', {extracted})
+  // Kick off async processing (don't await)
+  processAndCache(cacheKey, content, url, locale).catch((err) => {
+    log('Unexpected error in async processing', {cacheKey, error: err})
+  })
 
-  return extracted
+  log('Started async profile extraction', {cacheKey: cacheKey.substring(0, 8)})
+  return {profile: {}, status: 'pending'}
 }
diff --git a/common/src/api/schema.ts b/common/src/api/schema.ts
index ce1f84e4..5253c2ec 100644
--- a/common/src/api/schema.ts
+++ b/common/src/api/schema.ts
@@ -1283,7 +1283,10 @@ export const API = (_apiTypeCheck = {
         locale: z.string().optional(),
       })
       .strict(),
-    returns: {} as Partial<ProfileWithoutUser>,
+    returns: {} as {
+      profile: Partial<ProfileWithoutUser>
+      status: string
+    },
     summary: 'Extract profile information from text using LLM',
     tag: 'Profiles',
   },
diff --git a/web/components/optional-profile-form.tsx b/web/components/optional-profile-form.tsx
index 50607504..74c3edaa 100644
--- a/web/components/optional-profile-form.tsx
+++ b/web/components/optional-profile-form.tsx
@@ -102,7 +102,17 @@ export const OptionalProfileUserForm = (props: {
         ...(isInputUrl ? {url: urlize(llmContent).trim()} : {content: llmContent.trim()}),
       }
       try {
-        let extractedProfile = await api('llm-extract-profile', payload)
+        let extractedProfile: Partial<ProfileWithoutUser> = {}
+        let status: string | undefined = 'pending'
+        while (status === 'pending') {
+          const response = await api('llm-extract-profile', payload)
+          status = response.status
+          console.log(status)
+          if (status === 'pending') {
+            await new Promise((resolve) => setTimeout(resolve, 1000))
+          }
+          extractedProfile = response.profile
+        }
         extractedProfile = removeNullOrUndefinedProps(extractedProfile)
         for (const data of Object.entries(extractedProfile)) {
           const key = data[0] as keyof ProfileWithoutUser