feat: add serpCluster integration and schema

This commit is contained in:
Han Xiao
2025-06-17 17:43:31 -07:00
parent 96aca78d6a
commit 44dae8efb3
6 changed files with 89 additions and 10 deletions

View File

@@ -45,6 +45,7 @@
"temperature": 0.1 "temperature": 0.1
}, },
"researchPlanner": {}, "researchPlanner": {},
"serpCluster": {},
"agent": { "agent": {
"temperature": 0.7 "temperature": 0.7
}, },
@@ -79,6 +80,7 @@
"queryRewriter": { "queryRewriter": {
"temperature": 0.1 "temperature": 0.1
}, },
"serpCluster": {},
"agent": { "agent": {
"temperature": 0.7 "temperature": 0.7
}, },

View File

@@ -47,6 +47,7 @@
"evaluator": { "evaluator": {
"maxTokens": 2000 "maxTokens": 2000
}, },
"serpCluster": {},
"errorAnalyzer": { "errorAnalyzer": {
"maxTokens": 1000 "maxTokens": 1000
}, },
@@ -60,7 +61,9 @@
"model": "gemini-2.0-flash-lite" "model": "gemini-2.0-flash-lite"
}, },
"finalizer": {}, "finalizer": {},
"reducer": {"maxTokens": 16000} "reducer": {
"maxTokens": 16000
}
} }
}, },
"openai": { "openai": {
@@ -79,6 +82,7 @@
"queryRewriter": { "queryRewriter": {
"temperature": 0.1 "temperature": 0.1
}, },
"serpCluster": {},
"agent": { "agent": {
"temperature": 0.7 "temperature": 0.7
}, },
@@ -89,7 +93,9 @@
"temperature": 0 "temperature": 0
}, },
"finalizer": {}, "finalizer": {},
"reducer": {"maxTokens": 16000} "reducer": {
"maxTokens": 16000
}
} }
} }
} }

View File

@@ -48,6 +48,7 @@ import { researchPlan } from './tools/research-planner';
import { reduceAnswers } from './tools/reducer'; import { reduceAnswers } from './tools/reducer';
import { AxiosError } from 'axios'; import { AxiosError } from 'axios';
import { dedupImagesWithEmbeddings } from './utils/image-tools'; import { dedupImagesWithEmbeddings } from './utils/image-tools';
import { serpCluster } from './tools/serp-cluster';
async function wait(seconds: number) { async function wait(seconds: number) {
logDebug(`Waiting ${seconds}s...`); logDebug(`Waiting ${seconds}s...`);
@@ -361,12 +362,27 @@ async function executeSearchQueries(
searchedQueries.push(query.q) searchedQueries.push(query.q)
newKnowledge.push({ try {
question: `What do Internet say about "${oldQuery}"?`, const clusters = await serpCluster(minResults, context, SchemaGen);
answer: removeHTMLtags(minResults.map(r => r.description).join('; ')), clusters.forEach(c => {
type: 'side-info', newKnowledge.push({
updated: query.tbs ? formatDateRange(query) : undefined question: c.question,
}); answer: c.insight,
references: c.urls,
type: 'url',
});
});
} catch (error) {
logWarning('serpCluster failed:', { error });
newKnowledge.push({
question: `What do Internet say about "${oldQuery}"?`,
answer: removeHTMLtags(minResults.map(r => r.description).join('; ')),
type: 'side-info',
updated: query.tbs ? formatDateRange(query) : undefined
});
}
} }
if (searchedQueries.length === 0) { if (searchedQueries.length === 0) {
if (onlyHostnames && onlyHostnames.length > 0) { if (onlyHostnames && onlyHostnames.length > 0) {
@@ -405,7 +421,7 @@ export async function getResponse(question?: string,
searchLanguageCode?: string, searchLanguageCode?: string,
searchProvider?: string, searchProvider?: string,
withImages: boolean = false, withImages: boolean = false,
teamSize: number = 2 teamSize: number = 1
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[], imageReferences?: ImageReference[] }> { ): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[], imageReferences?: ImageReference[] }> {
let step = 0; let step = 0;

View File

@@ -2,7 +2,7 @@ import { TokenTracker } from "../utils/token-tracker";
import { JinaSearchResponse, SERPQuery } from '../types'; import { JinaSearchResponse, SERPQuery } from '../types';
import { JINA_API_KEY } from "../config"; import { JINA_API_KEY } from "../config";
import axiosClient from '../utils/axios-client'; import axiosClient from '../utils/axios-client';
import { logInfo, logError, logDebug, logWarning } from '../logging'; import { logError, logDebug } from '../logging';
export async function search( export async function search(
query: SERPQuery, query: SERPQuery,

40
src/tools/serp-cluster.ts Normal file
View File

@@ -0,0 +1,40 @@
import { PromptPair, TrackerContext } from '../types';
import { ObjectGeneratorSafe } from "../utils/safe-generator";
import { Schemas } from "../utils/schemas";
import { logInfo, logError } from '../logging';
import { SearchSnippet } from '../types';
/**
 * Assembles the prompt pair sent to the model for SERP clustering.
 *
 * The system message tells the model to group search results into clusters;
 * the user message is the JSON serialization of `results` wrapped in a
 * leading and a trailing newline.
 *
 * @param results - Search snippets to serialize into the user message.
 * @returns An object with the `system` and `user` prompt strings.
 */
function getPrompt(results: SearchSnippet[]): PromptPair {
  // Leading/trailing empty entries reproduce the newline right after the
  // opening quote and right before the closing quote of the prompt text.
  const systemLines = [
    '',
    'You are a search engine result analyzer. You look at the SERP API response and group them into meaningful cluster.',
    'Each cluster should contain a summary of the content, key data and insights, the corresponding URLs and search advice. Respond in JSON format.',
    '',
  ];
  const serialized = JSON.stringify(results);

  return {
    system: systemLines.join('\n'),
    user: `\n${serialized}\n`,
  };
}
/** Passed as the `model` key to the generator and used as the tag for log entries. */
const TOOL_NAME = 'serpCluster';

/**
 * Asks the model to group search results into clusters.
 *
 * The results are serialized into a prompt, and the model's structured reply
 * is generated against `schemaGen.getSerpClusterSchema()`. The reply's `think`
 * text is forwarded to the action tracker and its `clusters` array is returned.
 *
 * @param results - Search snippets to cluster. When empty (or missing) no model
 *   call is made and an empty array is returned.
 * @param trackers - Provides the token tracker handed to the generator and the
 *   action tracker that receives the model's `think` text.
 * @param schemaGen - Source of the clustering output schema.
 * @returns The `clusters` array from the generated object.
 * @throws Re-throws any error raised during generation after logging it, so
 *   callers can fall back to their own handling.
 */
export async function serpCluster(results: SearchSnippet[], trackers: TrackerContext, schemaGen: Schemas): Promise<any[]> {
  // Nothing to cluster: avoid spending a model call on an empty payload.
  if (results == null || results.length === 0) {
    return [];
  }

  try {
    const generator = new ObjectGeneratorSafe(trackers.tokenTracker);
    const prompt = getPrompt(results);
    const result = await generator.generateObject({
      model: TOOL_NAME,
      schema: schemaGen.getSerpClusterSchema(),
      system: prompt.system,
      prompt: prompt.user,
    });

    // `trackers` was already dereferenced when building the generator, so
    // optional chaining here would only suggest a null-safety that does not exist.
    trackers.actionTracker.trackThink(result.object.think);

    const clusters = result.object.clusters;
    logInfo(TOOL_NAME, { clusters });
    return clusters;
  } catch (error) {
    logError(TOOL_NAME, { error });
    throw error;
  }
}

View File

@@ -6,6 +6,7 @@ import { logDebug } from '../logging';
export const MAX_URLS_PER_STEP = 5 export const MAX_URLS_PER_STEP = 5
export const MAX_QUERIES_PER_STEP = 5 export const MAX_QUERIES_PER_STEP = 5
export const MAX_REFLECT_PER_STEP = 2 export const MAX_REFLECT_PER_STEP = 2
export const MAX_CLUSTERS = 5
function getLanguagePrompt(question: string): PromptPair { function getLanguagePrompt(question: string): PromptPair {
return { return {
@@ -173,6 +174,20 @@ export class Schemas {
}); });
} }
/**
 * Builds the schema for the structured output of SERP clustering.
 *
 * The resulting object has two fields:
 * - `think`: free-text rationale, at most 500 characters; its description
 *   embeds the text returned by `this.getLanguagePrompt()`.
 * - `clusters`: at most `MAX_CLUSTERS` entries, each with a `question`
 *   (max 100 chars), an `insight` (max 200 chars) and a list of `urls`.
 *
 * @returns The Zod object schema describing the expected model output.
 */
getSerpClusterSchema(): z.ZodObject<any> {
  const think = z.string()
    .describe(`Explain why you cluster the search results like this. ${this.getLanguagePrompt()}`)
    .max(500);

  // NOTE(review): `.max(100)` is chained on the string, so it limits the length
  // of each URL, not the number of URLs in the array. Confirm this is intended,
  // since URLs longer than 100 characters would not satisfy the schema.
  const urls = z.array(z.string().describe('URLs in this cluster.').max(100));

  const cluster = z.object({
    question: z.string().describe('What question this cluster answers.').max(100),
    insight: z.string().describe('Summary and list key numbers, data and insights that worth to be highlighted. End with an actionable advice such as "Visit these URLs if you want to understand [what...]". Do not use "This cluster..."').max(200),
    urls,
  });

  const clusters = z.array(cluster)
    .max(MAX_CLUSTERS)
    .describe(`'The optimal clustering of search engine results, orthogonal to each other. Maximum ${MAX_CLUSTERS} clusters allowed.'`);

  return z.object({ think, clusters });
}
getQueryRewriterSchema(): z.ZodObject<any> { getQueryRewriterSchema(): z.ZodObject<any> {
return z.object({ return z.object({
think: z.string().describe(`Explain why you choose those search queries. ${this.getLanguagePrompt()}`).max(500), think: z.string().describe(`Explain why you choose those search queries. ${this.getLanguagePrompt()}`).max(500),