feat: add serpCluster integration and schema

This commit is contained in:
Han Xiao 2025-06-17 17:43:31 -07:00
parent 96aca78d6a
commit 44dae8efb3
6 changed files with 89 additions and 10 deletions

View File

@ -45,6 +45,7 @@
"temperature": 0.1
},
"researchPlanner": {},
"serpCluster": {},
"agent": {
"temperature": 0.7
},
@ -79,6 +80,7 @@
"queryRewriter": {
"temperature": 0.1
},
"serpCluster": {},
"agent": {
"temperature": 0.7
},

View File

@ -47,6 +47,7 @@
"evaluator": {
"maxTokens": 2000
},
"serpCluster": {},
"errorAnalyzer": {
"maxTokens": 1000
},
@ -60,7 +61,9 @@
"model": "gemini-2.0-flash-lite"
},
"finalizer": {},
"reducer": {"maxTokens": 16000}
"reducer": {
"maxTokens": 16000
}
}
},
"openai": {
@ -79,6 +82,7 @@
"queryRewriter": {
"temperature": 0.1
},
"serpCluster": {},
"agent": {
"temperature": 0.7
},
@ -89,7 +93,9 @@
"temperature": 0
},
"finalizer": {},
"reducer": {"maxTokens": 16000}
"reducer": {
"maxTokens": 16000
}
}
}
}

View File

@ -48,6 +48,7 @@ import { researchPlan } from './tools/research-planner';
import { reduceAnswers } from './tools/reducer';
import { AxiosError } from 'axios';
import { dedupImagesWithEmbeddings } from './utils/image-tools';
import { serpCluster } from './tools/serp-cluster';
async function wait(seconds: number) {
logDebug(`Waiting ${seconds}s...`);
@ -361,6 +362,18 @@ async function executeSearchQueries(
searchedQueries.push(query.q)
try {
const clusters = await serpCluster(minResults, context, SchemaGen);
clusters.forEach(c => {
newKnowledge.push({
question: c.question,
answer: c.insight,
references: c.urls,
type: 'url',
});
});
} catch (error) {
logWarning('serpCluster failed:', { error });
newKnowledge.push({
question: `What do Internet say about "${oldQuery}"?`,
answer: removeHTMLtags(minResults.map(r => r.description).join('; ')),
@ -368,6 +381,9 @@ async function executeSearchQueries(
updated: query.tbs ? formatDateRange(query) : undefined
});
}
}
if (searchedQueries.length === 0) {
if (onlyHostnames && onlyHostnames.length > 0) {
logWarning(`No results found for queries: ${uniqQOnly.join(', ')} on hostnames: ${onlyHostnames.join(', ')}`);
@ -405,7 +421,7 @@ export async function getResponse(question?: string,
searchLanguageCode?: string,
searchProvider?: string,
withImages: boolean = false,
teamSize: number = 2
teamSize: number = 1
): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[], imageReferences?: ImageReference[] }> {
let step = 0;

View File

@ -2,7 +2,7 @@ import { TokenTracker } from "../utils/token-tracker";
import { JinaSearchResponse, SERPQuery } from '../types';
import { JINA_API_KEY } from "../config";
import axiosClient from '../utils/axios-client';
import { logInfo, logError, logDebug, logWarning } from '../logging';
import { logError, logDebug } from '../logging';
export async function search(
query: SERPQuery,

40
src/tools/serp-cluster.ts Normal file
View File

@ -0,0 +1,40 @@
import { PromptPair, TrackerContext } from '../types';
import { ObjectGeneratorSafe } from "../utils/safe-generator";
import { Schemas } from "../utils/schemas";
import { logInfo, logError } from '../logging';
import { SearchSnippet } from '../types';
/**
 * Builds the system/user prompt pair sent to the model for SERP clustering.
 *
 * The system message is a fixed instruction asking for the results to be
 * grouped into clusters and returned as JSON; the user message is the
 * JSON-serialized `results` array wrapped in a leading and trailing newline.
 *
 * @param results - Search snippets to serialize into the user message.
 * @returns The `{ system, user }` prompt pair.
 */
function getPrompt(results: SearchSnippet[]): PromptPair {
  // Leading/trailing empty entries reproduce the newline padding of the prompt text.
  const systemLines = [
    '',
    'You are a search engine result analyzer. You look at the SERP API response and group them into meaningful cluster.',
    'Each cluster should contain a summary of the content, key data and insights, the corresponding URLs and search advice. Respond in JSON format.',
    '',
  ];
  const serializedResults = JSON.stringify(results);

  return {
    system: systemLines.join('\n'),
    user: `\n${serializedResults}\n`,
  };
}
// Used both as the `model` key passed to the generator and as the log label.
const TOOL_NAME = 'serpCluster';

/**
 * Asks the model to group search results into clusters.
 *
 * Builds the prompt from `results`, requests a structured object validated
 * against `schemaGen.getSerpClusterSchema()`, forwards the model's `think`
 * text to the action tracker, logs the clusters and returns them.
 *
 * @param results - Search snippets to cluster.
 * @param trackers - Supplies the token tracker for the generator and the action tracker for `think`.
 * @param schemaGen - Provides the schema for the structured response.
 * @returns The `clusters` array from the generated object.
 * @throws Re-throws any error raised while generating, after logging it.
 */
export async function serpCluster(results: SearchSnippet[], trackers: TrackerContext, schemaGen: Schemas): Promise<any[]> {
  try {
    const safeGenerator = new ObjectGeneratorSafe(trackers.tokenTracker);
    const { system, user } = getPrompt(results);

    const generated = await safeGenerator.generateObject({
      model: TOOL_NAME,
      schema: schemaGen.getSerpClusterSchema(),
      system,
      prompt: user,
    });
    const payload = generated.object;

    trackers?.actionTracker.trackThink(payload.think);

    const grouped = payload.clusters;
    logInfo(TOOL_NAME, { clusters: grouped });
    return grouped;
  } catch (error) {
    // Log here for context, then let the caller decide how to recover.
    logError(TOOL_NAME, { error });
    throw error;
  }
}

View File

@ -6,6 +6,7 @@ import { logDebug } from '../logging';
export const MAX_URLS_PER_STEP = 5
export const MAX_QUERIES_PER_STEP = 5
export const MAX_REFLECT_PER_STEP = 2
export const MAX_CLUSTERS = 5
function getLanguagePrompt(question: string): PromptPair {
return {
@ -173,6 +174,20 @@ export class Schemas {
});
}
/**
 * Builds the zod schema for the SERP clustering response.
 *
 * Shape: `{ think, clusters }`, where `think` is a string (max 500) and
 * `clusters` is an array of at most MAX_CLUSTERS objects, each holding
 * `question` (max 100), `insight` (max 200) and `urls` (array of strings).
 */
getSerpClusterSchema(): z.ZodObject<any> {
  const think = z.string().describe(`Explain why you cluster the search results like this. ${this.getLanguagePrompt()}`).max(500);

  const question = z.string().describe('What question this cluster answers.').max(100);
  const insight = z.string().describe('Summary and list key numbers, data and insights that worth to be highlighted. End with an actionable advice such as "Visit these URLs if you want to understand [what...]". Do not use "This cluster..."').max(200);
  // NOTE(review): `.max(100)` is chained on the string element, not on the array,
  // so it appears to cap each URL's length at 100 rather than the number of URLs.
  // Confirm this is intended; longer URLs would presumably fail validation.
  const url = z.string().describe('URLs in this cluster.').max(100);

  const cluster = z.object({
    question,
    insight,
    urls: z.array(url),
  });

  const clusters = z.array(cluster)
    .max(MAX_CLUSTERS)
    .describe(`'The optimal clustering of search engine results, orthogonal to each other. Maximum ${MAX_CLUSTERS} clusters allowed.'`);

  return z.object({ think, clusters });
}
getQueryRewriterSchema(): z.ZodObject<any> {
return z.object({
think: z.string().describe(`Explain why you choose those search queries. ${this.getLanguagePrompt()}`).max(500),