mirror of
https://github.com/jina-ai/node-DeepResearch.git
synced 2025-12-26 06:28:56 +08:00
fix: multi-aspect
This commit is contained in:
parent
c02588a92c
commit
ad7e524554
55
src/agent.ts
55
src/agent.ts
@ -19,7 +19,8 @@ import {zodToJsonSchema} from "zod-to-json-schema";
|
||||
import {ObjectGeneratorSafe} from "./utils/safe-generator";
|
||||
import {CodeSandbox} from "./tools/code-sandbox";
|
||||
import {serperSearch} from './tools/serper-search';
|
||||
import {normalizeUrl} from "./utils/url-tools";
|
||||
import {getUnvisitedURLs, normalizeUrl} from "./utils/url-tools";
|
||||
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools";
|
||||
|
||||
async function sleep(ms: number) {
|
||||
const seconds = Math.ceil(ms / 1000);
|
||||
@ -40,8 +41,9 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
|
||||
|
||||
if (allowSearch) {
|
||||
actions.push("search");
|
||||
properties.searchQuery = z.string().max(30)
|
||||
.describe(`Required when action='search'. Must be a short, keyword-based query that BM25, tf-idf based search engines can understand. Write the query in the language that potential answers might be written in, then in ${languageStyle}.`).optional();
|
||||
properties.searchRequests = z.array(
|
||||
z.string().max(30)
|
||||
.describe(`A natual language search request in ${languageStyle}. Based on the deep intention behind the original question and the expected answer format.`)).describe(`Required when action='search'. Always prefer a single request, only add another request if the original question covers multiple aspects or elements and one search request is definitely not enough, each request focus on one specific aspect of the original question. Minimize mutual information between each request. Maximum ${MAX_QUERIES_PER_STEP} search requests.`).max(MAX_QUERIES_PER_STEP);
|
||||
}
|
||||
|
||||
if (allowCoding) {
|
||||
@ -85,11 +87,7 @@ function getSchema(allowReflect: boolean, allowRead: boolean, allowAnswer: boole
|
||||
|
||||
}
|
||||
|
||||
/**
 * Returns the search results whose URL key has not been visited yet.
 *
 * @param allURLs map from URL to the search result recorded for it
 * @param visitedURLs URLs to exclude
 * @returns the values of `allURLs` whose key is absent from `visitedURLs`,
 *          in the object's own key order
 */
function getUnvisitedURLs(allURLs: Record<string, SearchResult>, visitedURLs: string[]): SearchResult[] {
  // A Set makes each membership test O(1) instead of rescanning the array per entry.
  const visited = new Set(visitedURLs);
  return Object.entries(allURLs)
    .filter(([url]) => !visited.has(url))
    .map(([, result]) => result);
}
|
||||
|
||||
|
||||
function getPrompt(
|
||||
context?: string[],
|
||||
@ -226,14 +224,13 @@ ${urlList}
|
||||
actionSections.push(`
|
||||
<action-search>
|
||||
- Use web search to find relevant information
|
||||
- Choose optimal search queries and language based on the expected answer format
|
||||
- Focus on one specific aspect of the original question
|
||||
- Suggest unique keywords and alternative search angles
|
||||
- Build a search request based on the deep intention behind the original question and the expected answer format
|
||||
- Always prefer a single search request, only add another request if the original question covers multiple aspects or elements and one query is not enough, each request focus on one specific aspect of the original question
|
||||
${allKeywords?.length ? `
|
||||
- Previous unsuccessful queries to avoid:
|
||||
<bad-queries>
|
||||
- Avoid those unsuccessful search requests and queries:
|
||||
<bad-requests>
|
||||
${allKeywords.join('\n')}
|
||||
</bad-queries>
|
||||
</bad-requests>
|
||||
`.trim() : ''}
|
||||
</action-search>
|
||||
`);
|
||||
@ -243,7 +240,7 @@ ${allKeywords.join('\n')}
|
||||
actionSections.push(`
|
||||
<action-answer>
|
||||
- For greetings, casual conversation, or general knowledge questions, answer directly without references.
|
||||
- If the question is clearly within your knowledge cutoff (i.e. Aug. 2024), provide a confident answer directly.
|
||||
- If the question is clearly within your knowledge cutoff (i.e. Aug. 2024) and requires no up-to-date knowledge to get better answer, then provide a confident answer directly.
|
||||
- For all other questions, provide a verified answer with references. Each reference must include exactQuote and url.
|
||||
- If uncertain, use <action-reflect>
|
||||
</action-answer>
|
||||
@ -292,9 +289,7 @@ ${actionSections.join('\n\n')}
|
||||
return removeExtraLineBreaks(sections.join('\n\n'));
|
||||
}
|
||||
|
||||
/**
 * Collapses every run of two or more consecutive newline characters into
 * exactly two, so paragraphs end up separated by a single blank line.
 */
const removeExtraLineBreaks = (text: string) => {
  const newlineRun = /\n{2,}/gm;
  const singleBlankLine = '\n\n';
  return text.replace(newlineRun, singleBlankLine);
};
|
||||
|
||||
|
||||
const allContext: StepAction[] = []; // all steps in the current session, including those leads to wrong results
|
||||
|
||||
@ -302,14 +297,8 @@ function updateContext(step: any) {
|
||||
allContext.push(step)
|
||||
}
|
||||
|
||||
/**
 * Randomly samples up to `k` entries of `a` without picking any position twice.
 *
 * The input is copied and the copy is shuffled with Fisher–Yates, so the
 * caller's array is never reordered and every ordering is equally likely.
 * (Sorting with a random comparator, as done previously, mutated the input
 * in place and gave a biased shuffle.)
 *
 * @param a candidate strings; left unmodified
 * @param k how many entries to keep; applied as `slice(0, k)`
 * @returns a new array holding the sampled entries
 */
function chooseK(a: string[], k: number): string[] {
  const pool = [...a];
  for (let i = pool.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, k);
}
|
||||
|
||||
/**
 * Deletes every span that starts at `<` and runs through the next `>`.
 * The closing `>` is optional in the pattern, so an unclosed `<` removes
 * everything from that point to the end of the text.
 */
function removeHTMLtags(text: string) {
  const tagPattern = /<[^>]*>?/gm;
  const withoutTags = text.replace(tagPattern, '');
  return withoutTags;
}
|
||||
|
||||
|
||||
|
||||
export async function getResponse(question?: string,
|
||||
@ -560,13 +549,15 @@ But then you realized you have asked them before. You decided to to think out of
|
||||
|
||||
allowReflect = false;
|
||||
}
|
||||
} else if (thisStep.action === 'search' && thisStep.searchQuery) {
|
||||
} else if (thisStep.action === 'search' && thisStep.searchRequests) {
|
||||
// dedup search requests
|
||||
thisStep.searchRequests = chooseK((await dedupQueries(thisStep.searchRequests, [], context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
|
||||
|
||||
// rewrite queries
|
||||
let {queries: keywordsQueries} = await rewriteQuery(thisStep, context);
|
||||
const oldKeywords = keywordsQueries;
|
||||
// avoid existing searched queries
|
||||
const {unique_queries: dedupedQueries} = await dedupQueries(keywordsQueries, allKeywords, context.tokenTracker);
|
||||
keywordsQueries = chooseK(dedupedQueries, MAX_QUERIES_PER_STEP);
|
||||
keywordsQueries = chooseK((await dedupQueries(keywordsQueries, allKeywords, context.tokenTracker)).unique_queries, MAX_QUERIES_PER_STEP);
|
||||
|
||||
let anyResult = false;
|
||||
|
||||
if (keywordsQueries.length > 0) {
|
||||
@ -639,7 +630,7 @@ You found quite some information and add them to your URL list and **visit** the
|
||||
if (!anyResult || !keywordsQueries?.length) {
|
||||
diaryContext.push(`
|
||||
At step ${step}, you took the **search** action and look for external information for the question: "${currentQuestion}".
|
||||
In particular, you tried to search for the following keywords: ${oldKeywords.join(', ')}.
|
||||
In particular, you tried to search for the following keywords: ${keywordsQueries.join(', ')}.
|
||||
But then you realized you have already searched for these keywords before, no new information is returned.
|
||||
You decided to think out of the box or cut from a completely different angle.
|
||||
`);
|
||||
@ -800,6 +791,8 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
||||
(thisStep as AnswerAction).isFinal = true;
|
||||
context.actionTracker.trackAction({totalStep, thisStep, gaps, badAttempts});
|
||||
}
|
||||
|
||||
(thisStep as AnswerAction).mdAnswer = buildMdFromAnswer((thisStep as AnswerAction))
|
||||
console.log(thisStep)
|
||||
|
||||
await storeContext(system, schema, [allContext, allKeywords, allQuestions, allKnowledge], totalStep);
|
||||
@ -807,7 +800,7 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
|
||||
result: thisStep,
|
||||
context,
|
||||
visitedURLs: [...new Set([...visitedURLs, ...Object.keys(allURLs)])],
|
||||
readURLs: visitedURLs
|
||||
readURLs: visitedURLs,
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
107
src/app.ts
107
src/app.ts
@ -28,102 +28,6 @@ app.get('/health', (req, res) => {
|
||||
res.json({status: 'ok'});
|
||||
});
|
||||
|
||||
/**
 * Renders an answer and its references as markdown with footnotes.
 *
 * - No references: every `[^n]` marker is removed from the answer.
 * - References but no markers: a `⁜[^1][^2]…` line citing all references is
 *   appended after the answer.
 * - More references than markers: a `⁜` line citing the unused references is
 *   appended.
 * - Markers that all repeat one number, or all point past the reference
 *   list, are renumbered sequentially (1, 2, 3, …).
 *
 * In every case with references, one `[^n]: quote [domain](url)` definition
 * per reference follows the answer.
 *
 * @param answer the answer action whose `answer` text and `references` are rendered
 * @returns the trimmed markdown string
 */
function buildMdFromAnswer(answer: AnswerAction): string {
  const footnoteRegex = /\[\^(\d+)]/g;

  // Builds the footnote definitions, one per reference, numbered from 1.
  const formatReferences = (refs: typeof answer.references) => {
    return refs.map((ref, i) => {
      // Keep only letters, digits and whitespace, then collapse whitespace runs.
      const cleanQuote = ref.exactQuote
        .replace(/[^\p{L}\p{N}\s]/gu, ' ')
        .replace(/\s+/g, ' ');

      const citation = `[^${i + 1}]: ${cleanQuote}`;

      if (!ref.url?.startsWith('http')) return citation;

      let hostname: string;
      try {
        hostname = new URL(ref.url).hostname;
      } catch {
        // A malformed URL must not abort the whole render; cite without a link.
        return citation;
      }

      // Strip only a leading "www." — not the first "www." anywhere in the host.
      const domainName = hostname.replace(/^www\./, '');
      return `${citation} [${domainName}](${ref.url})`;
    }).join('\n\n');
  };

  // First case: no references - remove any footnote citations
  if (!answer.references?.length) {
    return answer.answer.replace(footnoteRegex, '');
  }

  const referenceCount = answer.references.length;

  // Footnote numbers (as strings) in order of appearance
  const footnotes = Array.from(answer.answer.matchAll(footnoteRegex), m => m[1]);

  // No footnotes in answer but we have references - append them at the end
  if (footnotes.length === 0) {
    const appendedCitations = Array.from(
      {length: referenceCount},
      (_, i) => `[^${i + 1}]`
    ).join('');

    return [
      answer.answer,
      `⁜${appendedCitations}`,
      formatReferences(answer.references),
    ].join('\n\n').trim();
  }

  // Check if correction is needed
  const allSame = footnotes.every(n => n === footnotes[0]);
  const needsCorrection =
    (footnotes.length === referenceCount && allSame) ||
    (allSame && Number.parseInt(footnotes[0], 10) > referenceCount) ||
    footnotes.every(n => Number.parseInt(n, 10) > referenceCount);

  // More references than footnotes: cite the unused references on a ⁜ line
  if (referenceCount > footnotes.length && !needsCorrection) {
    const usedIndices = new Set(footnotes.map(n => Number.parseInt(n, 10)));

    const unusedReferences = Array.from(
      {length: referenceCount},
      (_, i) => !usedIndices.has(i + 1) ? `[^${i + 1}]` : ''
    ).join('');

    return [
      answer.answer,
      `⁜${unusedReferences}`,
      formatReferences(answer.references),
    ].join('\n\n').trim();
  }

  if (!needsCorrection) {
    return [answer.answer, formatReferences(answer.references)].join('\n\n').trim();
  }

  // Apply correction: sequentially number the footnotes
  let currentIndex = 0;
  const correctedAnswer = answer.answer.replace(footnoteRegex, () =>
    `[^${++currentIndex}]`
  );

  return [correctedAnswer, formatReferences(answer.references)].join('\n\n').trim();
}
|
||||
|
||||
async function* streamTextNaturally(text: string, streamingState: StreamingState) {
|
||||
// Split text into chunks that preserve CJK characters, URLs, and regular words
|
||||
const chunks = splitTextIntoChunks(text);
|
||||
@ -452,6 +356,7 @@ async function processQueue(streamingState: StreamingState, res: Response, reque
|
||||
|
||||
streamingState.processingQueue = false;
|
||||
}
|
||||
|
||||
app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
||||
// Check authentication only if secret is set
|
||||
if (secret) {
|
||||
@ -559,13 +464,17 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
||||
}
|
||||
|
||||
try {
|
||||
const {result: finalStep, visitedURLs: visitedURLs, readURLs: readURLs} = await getResponse(undefined, tokenBudget, maxBadAttempts, context, body.messages)
|
||||
const {
|
||||
result: finalStep,
|
||||
visitedURLs: visitedURLs,
|
||||
readURLs: readURLs
|
||||
} = await getResponse(undefined, tokenBudget, maxBadAttempts, context, body.messages)
|
||||
|
||||
const usage = context.tokenTracker.getTotalUsageSnakeCase();
|
||||
if (body.stream) {
|
||||
// Complete any ongoing streaming before sending final answer
|
||||
await completeCurrentStreaming(streamingState, res, requestId, created, body.model);
|
||||
const finalAnswer = buildMdFromAnswer(finalStep as AnswerAction);
|
||||
const finalAnswer = (finalStep as AnswerAction).mdAnswer;
|
||||
// Send closing think tag
|
||||
const closeThinkChunk: ChatCompletionChunk = {
|
||||
id: requestId,
|
||||
@ -613,7 +522,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
|
||||
index: 0,
|
||||
message: {
|
||||
role: 'assistant',
|
||||
content: finalStep.action === 'answer' ? buildMdFromAnswer(finalStep) : finalStep.think
|
||||
content: finalStep.action === 'answer' ? (finalStep.mdAnswer || '') : finalStep.think
|
||||
},
|
||||
logprobs: null,
|
||||
finish_reason: 'stop'
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
import { z } from 'zod';
|
||||
import {z} from 'zod';
|
||||
import {SearchAction, TrackerContext} from '../types';
|
||||
import {ObjectGeneratorSafe} from "../utils/safe-generator";
|
||||
|
||||
@ -13,8 +13,7 @@ const responseSchema = z.object({
|
||||
});
|
||||
|
||||
|
||||
|
||||
function getPrompt(action: SearchAction): string {
|
||||
function getPrompt(query: string, think: string): string {
|
||||
return `You are an expert search query generator with deep psychological understanding. You optimize user queries by extensively analyzing potential user intents and generating comprehensive search variations.
|
||||
|
||||
<rules>
|
||||
@ -185,8 +184,8 @@ queries: [
|
||||
]
|
||||
|
||||
Now, process this query:
|
||||
Input Query: ${action.searchQuery}
|
||||
Intention: ${action.think}
|
||||
Input Query: ${query}
|
||||
Intention: ${think}
|
||||
`;
|
||||
}
|
||||
|
||||
@ -195,17 +194,23 @@ const TOOL_NAME = 'queryRewriter';
|
||||
export async function rewriteQuery(action: SearchAction, trackers?: TrackerContext): Promise<{ queries: string[] }> {
|
||||
try {
|
||||
const generator = new ObjectGeneratorSafe(trackers?.tokenTracker);
|
||||
const prompt = getPrompt(action);
|
||||
const allQueries = [...action.searchRequests];
|
||||
|
||||
const result = await generator.generateObject({
|
||||
model: TOOL_NAME,
|
||||
schema: responseSchema,
|
||||
prompt,
|
||||
const queryPromises = action.searchRequests.map(async (req) => {
|
||||
const prompt = getPrompt(req, action.think);
|
||||
const result = await generator.generateObject({
|
||||
model: TOOL_NAME,
|
||||
schema: responseSchema,
|
||||
prompt,
|
||||
});
|
||||
trackers?.actionTracker.trackThink(result.object.think);
|
||||
return result.object.queries;
|
||||
});
|
||||
|
||||
console.log(TOOL_NAME, result.object.queries);
|
||||
trackers?.actionTracker.trackThink(result.object.think);
|
||||
return { queries: result.object.queries };
|
||||
const queryResults = await Promise.all(queryPromises);
|
||||
queryResults.forEach(queries => allQueries.push(...queries));
|
||||
console.log(TOOL_NAME, allQueries);
|
||||
return { queries: allQueries };
|
||||
} catch (error) {
|
||||
console.error(`Error in ${TOOL_NAME}`, error);
|
||||
throw error;
|
||||
|
||||
@ -8,7 +8,7 @@ type BaseAction = {
|
||||
|
||||
export type SearchAction = BaseAction & {
|
||||
action: "search";
|
||||
searchQuery: string;
|
||||
searchRequests: string[];
|
||||
};
|
||||
|
||||
export type AnswerAction = BaseAction & {
|
||||
@ -19,6 +19,7 @@ export type AnswerAction = BaseAction & {
|
||||
url: string;
|
||||
}>;
|
||||
isFinal?: boolean;
|
||||
mdAnswer?: string;
|
||||
};
|
||||
|
||||
|
||||
|
||||
127
src/utils/text-tools.ts
Normal file
127
src/utils/text-tools.ts
Normal file
@ -0,0 +1,127 @@
|
||||
import {AnswerAction} from "../types";
|
||||
|
||||
/**
 * Renders an answer and its references as markdown with footnotes.
 *
 * Grouped markers such as `[^1, ^2]` are first split into individual
 * markers (`[^1], [^2]`). Then:
 *
 * - No references: every footnote marker, grouped or single, is removed
 *   from the answer.
 * - References but no markers: a `⁜[^1][^2]…` line citing all references is
 *   appended after the answer.
 * - More references than markers: a `⁜` line citing the unused references is
 *   appended.
 * - Markers that all repeat one number, or all point past the reference
 *   list, are renumbered sequentially (1, 2, 3, …).
 *
 * In every case with references, one `[^n]: quote [domain](url)` definition
 * per reference follows the answer.
 *
 * @param answer the answer action whose `answer` text and `references` are rendered
 * @returns the trimmed markdown string
 */
export function buildMdFromAnswer(answer: AnswerAction): string {
  // Standard footnote regex
  const footnoteRegex = /\[\^(\d+)]/g;

  // Grouped footnotes like [^1, ^2, ^3] or [^1,^2,^3]
  const groupedFootnoteRegex = /\[\^(\d+)(?:,\s*\^(\d+))+]/g;

  // Builds the footnote definitions, one per reference, numbered from 1.
  const formatReferences = (refs: typeof answer.references) => {
    return refs.map((ref, i) => {
      // Keep only letters, digits and whitespace, then collapse whitespace runs.
      const cleanQuote = ref.exactQuote
        .replace(/[^\p{L}\p{N}\s]/gu, ' ')
        .replace(/\s+/g, ' ');

      const citation = `[^${i + 1}]: ${cleanQuote}`;

      if (!ref.url?.startsWith('http')) return citation;

      let hostname: string;
      try {
        hostname = new URL(ref.url).hostname;
      } catch {
        // A malformed URL must not abort the whole render; cite without a link.
        return citation;
      }

      // Strip only a leading "www." — not the first "www." anywhere in the host.
      const domainName = hostname.replace(/^www\./, '');
      return `${citation} [${domainName}](${ref.url})`;
    }).join('\n\n');
  };

  // First case: no references - remove any footnote citations.
  // Grouped markers are deleted as a whole; expanding them first would leave
  // their ", " separators behind in the text.
  if (!answer.references?.length) {
    return answer.answer
      .replace(groupedFootnoteRegex, '')
      .replace(footnoteRegex, '');
  }

  const referenceCount = answer.references.length;

  // Fix grouped footnotes first: [^1, ^2] -> [^1], [^2]
  const processedAnswer = answer.answer.replace(groupedFootnoteRegex, (group) => {
    const numbers = group.match(/\d+/g) || [];
    return numbers.map(num => `[^${num}]`).join(', ');
  });

  // Footnote numbers (as strings) in order of appearance
  const footnotes = Array.from(processedAnswer.matchAll(footnoteRegex), m => m[1]);

  // No footnotes in answer but we have references - append them at the end
  if (footnotes.length === 0) {
    const appendedCitations = Array.from(
      {length: referenceCount},
      (_, i) => `[^${i + 1}]`
    ).join('');

    return [
      processedAnswer,
      `⁜${appendedCitations}`,
      formatReferences(answer.references),
    ].join('\n\n').trim();
  }

  // Check if correction is needed
  const allSame = footnotes.every(n => n === footnotes[0]);
  const needsCorrection =
    (footnotes.length === referenceCount && allSame) ||
    (allSame && Number.parseInt(footnotes[0], 10) > referenceCount) ||
    footnotes.every(n => Number.parseInt(n, 10) > referenceCount);

  // More references than footnotes: cite the unused references on a ⁜ line
  if (referenceCount > footnotes.length && !needsCorrection) {
    const usedIndices = new Set(footnotes.map(n => Number.parseInt(n, 10)));

    const unusedReferences = Array.from(
      {length: referenceCount},
      (_, i) => !usedIndices.has(i + 1) ? `[^${i + 1}]` : ''
    ).join('');

    return [
      processedAnswer,
      `⁜${unusedReferences}`,
      formatReferences(answer.references),
    ].join('\n\n').trim();
  }

  if (!needsCorrection) {
    return [processedAnswer, formatReferences(answer.references)].join('\n\n').trim();
  }

  // Apply correction: sequentially number the footnotes
  let currentIndex = 0;
  const correctedAnswer = processedAnswer.replace(footnoteRegex, () =>
    `[^${++currentIndex}]`
  );

  return [correctedAnswer, formatReferences(answer.references)].join('\n\n').trim();
}
|
||||
|
||||
/**
 * Collapses every run of two or more consecutive newline characters into
 * exactly two, so paragraphs end up separated by a single blank line.
 */
export const removeExtraLineBreaks = (text: string) => {
  const newlineRun = /\n{2,}/gm;
  const singleBlankLine = '\n\n';
  return text.replace(newlineRun, singleBlankLine);
};
|
||||
|
||||
/**
 * Randomly samples up to `k` entries of `a` without picking any position twice.
 *
 * The input is copied and the copy is shuffled with Fisher–Yates, so the
 * caller's array is never reordered and every ordering is equally likely.
 * (Sorting with a random comparator, as done previously, mutated the input
 * in place and gave a biased shuffle.)
 *
 * @param a candidate strings; left unmodified
 * @param k how many entries to keep; applied as `slice(0, k)`
 * @returns a new array holding the sampled entries
 */
export function chooseK(a: string[], k: number): string[] {
  const pool = [...a];
  for (let i = pool.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, k);
}
|
||||
|
||||
/**
 * Deletes every span that starts at `<` and runs through the next `>`.
 * The closing `>` is optional in the pattern, so an unclosed `<` removes
 * everything from that point to the end of the text.
 */
export function removeHTMLtags(text: string) {
  const tagPattern = /<[^>]*>?/gm;
  const withoutTags = text.replace(tagPattern, '');
  return withoutTags;
}
|
||||
@ -1,3 +1,5 @@
|
||||
import {SearchResult} from "../types";
|
||||
|
||||
export function normalizeUrl(urlString: string, debug = false): string {
|
||||
if (!urlString?.trim()) {
|
||||
throw new Error('Empty URL');
|
||||
@ -93,3 +95,10 @@ export function normalizeUrl(urlString: string, debug = false): string {
|
||||
throw new Error(`Invalid URL "${urlString}": ${error}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the search results whose URL key has not been visited yet.
 *
 * @param allURLs map from URL to the search result recorded for it
 * @param visitedURLs URLs to exclude
 * @returns the values of `allURLs` whose key is absent from `visitedURLs`,
 *          in the object's own key order
 */
export function getUnvisitedURLs(allURLs: Record<string, SearchResult>, visitedURLs: string[]): SearchResult[] {
  // A Set makes each membership test O(1) instead of rescanning the array per entry.
  const visited = new Set(visitedURLs);
  return Object.entries(allURLs)
    .filter(([url]) => !visited.has(url))
    .map(([, result]) => result);
}
|
||||
Loading…
x
Reference in New Issue
Block a user