From 44dae8efb3d079f6e73884aaee63e02bfd4e8d86 Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Tue, 17 Jun 2025 17:43:31 -0700 Subject: [PATCH] feat: add serpCluster integration and schema --- config.json | 2 ++ jina-ai/config.json | 10 ++++++++-- src/agent.ts | 30 ++++++++++++++++++++++------- src/tools/jina-search.ts | 2 +- src/tools/serp-cluster.ts | 40 +++++++++++++++++++++++++++++++++++++++ src/utils/schemas.ts | 15 +++++++++++++++ 6 files changed, 89 insertions(+), 10 deletions(-) create mode 100644 src/tools/serp-cluster.ts diff --git a/config.json b/config.json index bb229dd..1a67c9c 100644 --- a/config.json +++ b/config.json @@ -45,6 +45,7 @@ "temperature": 0.1 }, "researchPlanner": {}, + "serpCluster": {}, "agent": { "temperature": 0.7 }, @@ -79,6 +80,7 @@ "queryRewriter": { "temperature": 0.1 }, + "serpCluster": {}, "agent": { "temperature": 0.7 }, diff --git a/jina-ai/config.json b/jina-ai/config.json index c8dd248..b2c168c 100644 --- a/jina-ai/config.json +++ b/jina-ai/config.json @@ -47,6 +47,7 @@ "evaluator": { "maxTokens": 2000 }, + "serpCluster": {}, "errorAnalyzer": { "maxTokens": 1000 }, @@ -60,7 +61,9 @@ "model": "gemini-2.0-flash-lite" }, "finalizer": {}, - "reducer": {"maxTokens": 16000} + "reducer": { + "maxTokens": 16000 + } } }, "openai": { @@ -79,6 +82,7 @@ "queryRewriter": { "temperature": 0.1 }, + "serpCluster": {}, "agent": { "temperature": 0.7 }, @@ -89,7 +93,9 @@ "temperature": 0 }, "finalizer": {}, - "reducer": {"maxTokens": 16000} + "reducer": { + "maxTokens": 16000 + } } } } diff --git a/src/agent.ts b/src/agent.ts index 45bd191..dfe1c0c 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -48,6 +48,7 @@ import { researchPlan } from './tools/research-planner'; import { reduceAnswers } from './tools/reducer'; import { AxiosError } from 'axios'; import { dedupImagesWithEmbeddings } from './utils/image-tools'; +import { serpCluster } from './tools/serp-cluster'; async function wait(seconds: number) { logDebug(`Waiting ${seconds}s...`); @@ -361,12 +362,27 @@ async function executeSearchQueries( searchedQueries.push(query.q) - newKnowledge.push({ - question: `What do Internet say about "${oldQuery}"?`, - answer: removeHTMLtags(minResults.map(r => r.description).join('; ')), - type: 'side-info', - updated: query.tbs ? formatDateRange(query) : undefined - }); + try { + const clusters = await serpCluster(minResults, context, SchemaGen); + clusters.forEach(c => { + newKnowledge.push({ + question: c.question, + answer: c.insight, + references: c.urls, + type: 'url', + }); + }); + } catch (error) { + logWarning('serpCluster failed:', { error }); + newKnowledge.push({ + question: `What do Internet say about "${oldQuery}"?`, + answer: removeHTMLtags(minResults.map(r => r.description).join('; ')), + type: 'side-info', + updated: query.tbs ? formatDateRange(query) : undefined + }); + } + + } if (searchedQueries.length === 0) { if (onlyHostnames && onlyHostnames.length > 0) { @@ -405,7 +421,7 @@ export async function getResponse(question?: string, searchLanguageCode?: string, searchProvider?: string, withImages: boolean = false, - teamSize: number = 2 + teamSize: number = 1 ): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[], imageReferences?: ImageReference[] }> { let step = 0; diff --git a/src/tools/jina-search.ts b/src/tools/jina-search.ts index 9d5c6f5..0bcbcf1 100644 --- a/src/tools/jina-search.ts +++ b/src/tools/jina-search.ts @@ -2,7 +2,7 @@ import { TokenTracker } from "../utils/token-tracker"; import { JinaSearchResponse, SERPQuery } from '../types'; import { JINA_API_KEY } from "../config"; import axiosClient from '../utils/axios-client'; -import { logInfo, logError, logDebug, logWarning } from '../logging'; +import { logError, logDebug } from '../logging'; export async function search( query: SERPQuery, diff --git a/src/tools/serp-cluster.ts b/src/tools/serp-cluster.ts new file mode 100644 index 0000000..4cde45f --- /dev/null +++ b/src/tools/serp-cluster.ts @@ -0,0 +1,40 @@ +import { PromptPair, TrackerContext } from '../types'; +import { ObjectGeneratorSafe } from "../utils/safe-generator"; +import { Schemas } from "../utils/schemas"; +import { logInfo, logError } from '../logging'; +import { SearchSnippet } from '../types'; + +function getPrompt(results: SearchSnippet[]): PromptPair { + return { + system: ` +You are a search engine result analyzer. You look at the SERP API response and group them into meaningful cluster. + +Each cluster should contain a summary of the content, key data and insights, the corresponding URLs and search advice. Respond in JSON format. +`, + user: + ` +${JSON.stringify(results)} +` + }; +} +const TOOL_NAME = 'serpCluster'; + +export async function serpCluster(results: SearchSnippet[], trackers: TrackerContext, schemaGen: Schemas): Promise { + try { + const generator = new ObjectGeneratorSafe(trackers.tokenTracker); + const prompt = getPrompt(results); + const result = await generator.generateObject({ + model: TOOL_NAME, + schema: schemaGen.getSerpClusterSchema(), + system: prompt.system, + prompt: prompt.user, + }); + trackers?.actionTracker.trackThink(result.object.think); + const clusters = result.object.clusters; + logInfo(TOOL_NAME, { clusters }); + return clusters; + } catch (error) { + logError(TOOL_NAME, { error }); + throw error; + } +} \ No newline at end of file diff --git a/src/utils/schemas.ts b/src/utils/schemas.ts index 79bfab3..2d255b5 100644 --- a/src/utils/schemas.ts +++ b/src/utils/schemas.ts @@ -6,6 +6,7 @@ import { logDebug } from '../logging'; export const MAX_URLS_PER_STEP = 5 export const MAX_QUERIES_PER_STEP = 5 export const MAX_REFLECT_PER_STEP = 2 +export const MAX_CLUSTERS = 5 function getLanguagePrompt(question: string): PromptPair { return { @@ -173,6 +174,20 @@ export class Schemas { }); } + getSerpClusterSchema(): z.ZodObject { + return z.object({ + think: z.string().describe(`Explain why you cluster the search results like this. ${this.getLanguagePrompt()}`).max(500), + clusters: z.array( + z.object({ + question: z.string().describe('What question this cluster answers.').max(100), + insight: z.string().describe('Summary and list key numbers, data and insights that worth to be highlighted. End with an actionable advice such as "Visit these URLs if you want to understand [what...]". Do not use "This cluster..."').max(200), + urls: z.array(z.string().describe('URLs in this cluster.').max(100)) + })) + .max(MAX_CLUSTERS) + .describe(`'The optimal clustering of search engine results, orthogonal to each other. Maximum ${MAX_CLUSTERS} clusters allowed.'`) + }); + } + getQueryRewriterSchema(): z.ZodObject { return z.object({ think: z.string().describe(`Explain why you choose those search queries. ${this.getLanguagePrompt()}`).max(500),