// node-DeepResearch/src/tools/build-ref.ts
import {segmentText} from './segment';
import {JinaEmbeddingRequest, JinaEmbeddingResponse, Reference, TrackerContext, WebContent} from "../types";
import {Schemas} from "../utils/schemas";
import axios, {AxiosError} from 'axios';
import {JINA_API_KEY} from "../config";
import {cosineSimilarity, jaccardRank} from "./cosine";

const BATCH_SIZE = 2000;
const API_URL = "https://api.jina.ai/v1/embeddings";

// Get embeddings for a list of texts, batching requests to the Jina embeddings API
async function getEmbeddings(
  texts: string[],
  tokenTracker?: any
): Promise<{ embeddings: number[][], tokens: number }> {
  console.log(`[embeddings] Getting embeddings for ${texts.length} texts`);
  if (!JINA_API_KEY) {
    throw new Error('JINA_API_KEY is not set');
  }

  // Handle empty input case
  if (texts.length === 0) {
    return {embeddings: [], tokens: 0};
  }

  // Process in batches of BATCH_SIZE
  const allEmbeddings: number[][] = [];
  let totalTokens = 0;
  const batchCount = Math.ceil(texts.length / BATCH_SIZE);

  for (let i = 0; i < texts.length; i += BATCH_SIZE) {
    const batchTexts = texts.slice(i, i + BATCH_SIZE);
    const currentBatch = Math.floor(i / BATCH_SIZE) + 1;
    console.log(`[embeddings] Processing batch ${currentBatch}/${batchCount} (${batchTexts.length} texts)`);

    const request: JinaEmbeddingRequest = {
      model: "jina-embeddings-v3",
      task: "text-matching",
      late_chunking: false, // Late chunking is always turned off
      dimensions: 1024,
      embedding_type: "float",
      input: batchTexts,
      truncate: true
    };

    try {
      const response = await axios.post<JinaEmbeddingResponse>(
        API_URL,
        request,
        {
          headers: {
            "Content-Type": "application/json",
            "Authorization": `Bearer ${JINA_API_KEY}`
          }
        }
      );

      // Validate response format
      // Note: skipping a malformed batch leaves allEmbeddings shorter than the input,
      // so the indices of later embeddings shift relative to the input texts.
      if (!response.data.data || response.data.data.length !== batchTexts.length) {
        console.error('Invalid response from Jina API:', response.data);
        continue;
      }

      // Sort embeddings by index to maintain original order
      const batchEmbeddings = response.data.data
        .sort((a, b) => a.index - b.index)
        .map(item => item.embedding);

      allEmbeddings.push(...batchEmbeddings);
      totalTokens += response.data.usage.total_tokens;
      console.log(`[embeddings] Batch ${currentBatch} complete. Tokens used: ${response.data.usage.total_tokens}, total so far: ${totalTokens}`);
    } catch (error) {
      console.error('Error calling Jina Embeddings API:', error);
      // 402 Payment Required (quota exhausted): give up and return no embeddings
      if (error instanceof AxiosError && error.response?.status === 402) {
        return {embeddings: [], tokens: 0};
      }
      throw error;
    }
  }

  // Track token usage if tracker is provided
  if (tokenTracker) {
    tokenTracker.trackUsage('embeddings', {
      promptTokens: totalTokens,
      completionTokens: 0,
      totalTokens: totalTokens
    });
  }

  console.log(`[embeddings] Complete. Generated ${allEmbeddings.length} embeddings using ${totalTokens} tokens`);
  return {embeddings: allEmbeddings, tokens: totalTokens};
}
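
// A minimal usage sketch (illustrative only; requires JINA_API_KEY to be set,
// and the tracker argument is optional):
//   const {embeddings, tokens} = await getEmbeddings(['first text', 'second text']);
//   console.log(embeddings.length, tokens); // -> 2, <token count reported by the API>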

export async function buildReferences(
  answer: string,
  webContents: Record<string, WebContent>,
  context: TrackerContext,
  schema: Schemas,
  maxRef: number = 10,
  minChunkLength: number = 80,
  minRelScore: number = 0.75
): Promise<{ answer: string, references: Array<Reference> }> {
  console.log(`[buildReferences] Starting with maxRef=${maxRef}, minChunkLength=${minChunkLength}, minRelScore=${minRelScore}`);
  console.log(`[buildReferences] Answer length: ${answer.length} chars, Web content sources: ${Object.keys(webContents).length}`);

  // Step 1: Chunk the answer
  console.log(`[buildReferences] Step 1: Chunking answer text`);
  const {chunks: answerChunks, chunk_positions: answerChunkPositions} = await segmentText(answer, context);
  console.log(`[buildReferences] Answer segmented into ${answerChunks.length} chunks`);

  // Step 2: Prepare all web content chunks, filtering out those below minimum length
  console.log(`[buildReferences] Step 2: Preparing web content chunks and filtering by minimum length (${minChunkLength} chars)`);
  const allWebContentChunks: string[] = [];
  const chunkToSourceMap: any = {}; // Maps chunk index to source information
  const validWebChunkIndices = new Set<number>(); // Tracks indices of valid web chunks (those above minimum length)

  let chunkIndex = 0;
  for (const [url, content] of Object.entries(webContents)) {
    if (!content.chunks || content.chunks.length === 0) continue;

    for (let i = 0; i < content.chunks.length; i++) {
      const chunk = content.chunks[i];
      allWebContentChunks.push(chunk);
      chunkToSourceMap[chunkIndex] = {
        url,
        title: content.title || url,
        text: chunk,
      };

      // Track valid web chunks (above minimum length)
      if (chunk.length >= minChunkLength) {
        validWebChunkIndices.add(chunkIndex);
      }
      chunkIndex++;
    }
  }
  console.log(`[buildReferences] Collected ${allWebContentChunks.length} web chunks, ${validWebChunkIndices.size} above minimum length`);

  if (allWebContentChunks.length === 0) {
    console.log(`[buildReferences] No web content chunks available, returning without references`);
    return {answer, references: []};
  }

  // Step 3: Filter answer chunks by minimum length
  console.log(`[buildReferences] Step 3: Filtering answer chunks by minimum length`);
  const validAnswerChunks: string[] = [];
  const validAnswerChunkIndices: number[] = [];
  const validAnswerChunkPositions: [number, number][] = [];

  context.actionTracker.trackThink('cross_reference', schema.languageCode);

  for (let i = 0; i < answerChunks.length; i++) {
    const answerChunk = answerChunks[i];
    const answerChunkPosition = answerChunkPositions[i];

    // Skip empty chunks or chunks below minimum length
    if (!answerChunk.trim() || answerChunk.length < minChunkLength) continue;

    validAnswerChunks.push(answerChunk);
    validAnswerChunkIndices.push(i);
    validAnswerChunkPositions.push(answerChunkPosition);
  }
  console.log(`[buildReferences] Found ${validAnswerChunks.length}/${answerChunks.length} valid answer chunks above minimum length`);

  if (validAnswerChunks.length === 0) {
    console.log(`[buildReferences] No valid answer chunks, returning without references`);
    return {answer, references: []};
  }

  // Step 4: Get embeddings for BOTH answer chunks and valid web chunks in a single request
  console.log(`[buildReferences] Step 4: Getting embeddings for all chunks in a single request (only including web chunks above min length)`);

  // Create maps to track the original indices
  const chunkIndexMap = new Map<number, { type: 'answer' | 'web', originalIndex: number }>();

  // Combine all chunks into a single array for embedding
  const allChunks: string[] = [];

  // Add answer chunks first
  validAnswerChunks.forEach((chunk, index) => {
    allChunks.push(chunk);
    chunkIndexMap.set(allChunks.length - 1, {type: 'answer', originalIndex: index});
  });

  // Then add web chunks that meet minimum length requirement
  for (let i = 0; i < allWebContentChunks.length; i++) {
    // Only include valid web chunks (those above minimum length)
    if (validWebChunkIndices.has(i)) {
      allChunks.push(allWebContentChunks[i]);
      chunkIndexMap.set(allChunks.length - 1, {type: 'web', originalIndex: i});
    }
  }
  console.log(`[buildReferences] Requesting embeddings for ${allChunks.length} total chunks (${validAnswerChunks.length} answer + ${validWebChunkIndices.size} web)`);

  try {
    // Get embeddings for all chunks in one request
    const embeddingsResult = await getEmbeddings(allChunks, context.tokenTracker);
    const allEmbeddings = embeddingsResult.embeddings;

    // Separate the embeddings back into answer and web chunks
    const answerEmbeddings: number[][] = [];
    const webEmbeddingMap = new Map<number, number[]>(); // Maps original web chunk index to embedding

    // Sort embeddings back to their original collections
    for (let i = 0; i < allEmbeddings.length; i++) {
      const embedding = allEmbeddings[i];
      const mapping = chunkIndexMap.get(i);
      if (mapping) {
        if (mapping.type === 'answer') {
          answerEmbeddings[mapping.originalIndex] = embedding;
        } else {
          webEmbeddingMap.set(mapping.originalIndex, embedding);
        }
      }
    }
    console.log(`[buildReferences] Successfully generated and separated embeddings: ${answerEmbeddings.length} answer, ${webEmbeddingMap.size} web`);

    // Step 5: Compute pairwise cosine similarity between answer and web chunks.
    // Cosine similarity is (a · b) / (|a| |b|); higher scores mean more similar chunks.
    console.log(`[buildReferences] Step 5: Computing pairwise cosine similarity between answer and web chunks`);
    const allMatches = [];

    for (let i = 0; i < validAnswerChunks.length; i++) {
      const answerChunkIndex = validAnswerChunkIndices[i];
      const answerChunk = validAnswerChunks[i];
      const answerChunkPosition = validAnswerChunkPositions[i];
      const answerEmbedding = answerEmbeddings[i];

      const matchesForChunk = [];

      // Compute similarity with each valid web content chunk
      // All web chunks in webEmbeddingMap are already pre-filtered to be above minimum length
      for (const webChunkIndex of validWebChunkIndices) {
        const webEmbedding = webEmbeddingMap.get(webChunkIndex);
        if (webEmbedding) {
          const score = cosineSimilarity(answerEmbedding, webEmbedding);
          matchesForChunk.push({
            webChunkIndex,
            relevanceScore: score
          });
        }
      }

      // Sort by relevance score and take the top matches
      matchesForChunk.sort((a, b) => b.relevanceScore - a.relevanceScore);

      // Add the top matches to all matches with answerChunk information
      for (const match of matchesForChunk) {
        allMatches.push({
          webChunkIndex: match.webChunkIndex,
          answerChunkIndex: answerChunkIndex,
          relevanceScore: match.relevanceScore,
          answerChunk: answerChunk,
          answerChunkPosition: answerChunkPosition
        });
      }

      console.log(`[buildReferences] Processed answer chunk ${i + 1}/${validAnswerChunks.length}, top score: ${matchesForChunk[0]?.relevanceScore.toFixed(4)}`);
    }

    // Log statistics about relevance scores
    if (allMatches.length > 0) {
      const relevanceScores = allMatches.map(match => match.relevanceScore);
      const minRelevance = Math.min(...relevanceScores);
      const maxRelevance = Math.max(...relevanceScores);
      const sumRelevance = relevanceScores.reduce((sum, score) => sum + score, 0);
      const meanRelevance = sumRelevance / relevanceScores.length;

      console.log('Reference relevance statistics:', {
        min: minRelevance.toFixed(4),
        max: maxRelevance.toFixed(4),
        mean: meanRelevance.toFixed(4),
        count: relevanceScores.length
      });
    }

    // Step 6: Sort all matches by relevance
    allMatches.sort((a, b) => b.relevanceScore - a.relevanceScore);
    console.log(`[buildReferences] Step 6: Sorted ${allMatches.length} potential matches by relevance score`);

    // Step 7: Filter matches so each answer chunk and each web chunk is used at most once,
    // keeping only matches at or above the relevance threshold
    console.log(`[buildReferences] Step 7: Filtering matches to ensure uniqueness and threshold (min: ${minRelScore})`);
    const usedWebChunks = new Set();
    const usedAnswerChunks = new Set();
    const filteredMatches = [];

    for (const match of allMatches) {
      // Only consider matches with relevance score >= minRelScore
      if (match.relevanceScore < minRelScore) continue;

      if (!usedWebChunks.has(match.webChunkIndex) && !usedAnswerChunks.has(match.answerChunkIndex)) {
        filteredMatches.push(match);
        usedWebChunks.add(match.webChunkIndex);
        usedAnswerChunks.add(match.answerChunkIndex);

        // Break if we've reached the max number of references
        if (filteredMatches.length >= maxRef) break;
      }
    }
    console.log(`[buildReferences] Selected ${filteredMatches.length}/${allMatches.length} references after filtering`);

    return buildFinalResult(answer, filteredMatches, chunkToSourceMap);
  } catch (error) {
    console.error('Embedding failed, falling back to Jaccard similarity', error);
    console.log(`[buildReferences] Fallback: Using Jaccard similarity instead of embeddings`);

    // Process all chunks with Jaccard fallback
    const allMatches = [];
    for (let i = 0; i < validAnswerChunks.length; i++) {
      const answerChunk = validAnswerChunks[i];
      const answerChunkIndex = validAnswerChunkIndices[i];
      const answerChunkPosition = validAnswerChunkPositions[i];

      console.log(`[buildReferences] Processing answer chunk ${i + 1}/${validAnswerChunks.length} with Jaccard similarity`);
      const fallbackResult = await jaccardRank(answerChunk, allWebContentChunks);

      for (const match of fallbackResult.results) {
        if (validWebChunkIndices.has(match.index)) {
          allMatches.push({
            webChunkIndex: match.index,
            answerChunkIndex: answerChunkIndex,
            relevanceScore: match.relevance_score,
            answerChunk: answerChunk,
            answerChunkPosition: answerChunkPosition
          });
        }
      }
    }

    // Sort all matches by relevance and continue with the rest of the function
    allMatches.sort((a, b) => b.relevanceScore - a.relevanceScore);
    console.log(`[buildReferences] Fallback complete. Found ${allMatches.length} potential matches`);

    // Filter matches as in the main path
    const usedWebChunks = new Set();
    const usedAnswerChunks = new Set();
    const filteredMatches = [];

    for (const match of allMatches) {
      if (!usedWebChunks.has(match.webChunkIndex) && !usedAnswerChunks.has(match.answerChunkIndex)) {
        // Check if the relevance score meets the minimum threshold
        if (match.relevanceScore >= minRelScore) {
          filteredMatches.push(match);
          usedWebChunks.add(match.webChunkIndex);
          usedAnswerChunks.add(match.answerChunkIndex);

          // Break if we've reached the max number of references
          if (filteredMatches.length >= maxRef) break;
        }
      }
    }
    console.log(`[buildReferences] Selected ${filteredMatches.length} references using fallback method`);

    return buildFinalResult(answer, filteredMatches, chunkToSourceMap);
  }
}
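
// Illustrative example of the transformation buildFinalResult performs (not real data):
// given a filtered match whose answer chunk ends with "…Rayleigh scattering.", it injects
// a footnote marker at the end of that chunk,
//   "The sky appears blue because of Rayleigh scattering.[^1]"
// and returns a matching Reference whose exactQuote is the supporting web chunk.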

// Helper function to build the final result
function buildFinalResult(
  answer: string,
  filteredMatches: any[],
  chunkToSourceMap: any
): { answer: string, references: Array<Reference> } {
  console.log(`[buildFinalResult] Building final result with ${filteredMatches.length} references`);

  // Build reference objects
  const references: Reference[] = filteredMatches.map((match) => {
    const source = chunkToSourceMap[match.webChunkIndex];
    return {
      exactQuote: source.text,
      url: source.url,
      title: source.title,
      dateTime: source.dateTime, // chunkToSourceMap entries do not set dateTime, so this is currently undefined
      relevanceScore: match.relevanceScore,
      answerChunk: match.answerChunk,
      answerChunkPosition: match.answerChunkPosition
    };
  });

  // Inject reference markers ([^1], [^2], etc.) into the answer
  let modifiedAnswer = answer;

  // Sort references by position in the answer (to insert markers in correct order)
  // Note: markers are numbered by their position in the answer, while the returned
  // `references` array keeps the relevance-sorted order of filteredMatches.
  const referencesByPosition = [...references]
    .sort((a, b) => a.answerChunkPosition![0] - b.answerChunkPosition![0]);

  console.log(`[buildFinalResult] Injecting reference markers into answer`);

  // Insert markers from beginning to end, tracking offset
  let offset = 0;
  for (let i = 0; i < referencesByPosition.length; i++) {
    const ref = referencesByPosition[i];
    const marker = `[^${i + 1}]`;

    // Calculate position to insert the marker (end of the chunk + current offset)
    let insertPosition = ref.answerChunkPosition![1] + offset;

    // Look ahead to check if there's a list item coming next
    const textAfterInsert = modifiedAnswer.substring(insertPosition);
    const nextListItemMatch = textAfterInsert.match(/^\s*\n\s*\*/);

    // If we're at a position where the next content is a list item,
    // we need to adjust WHERE we place the footnote
    if (nextListItemMatch) {
      // Move the marker to right after the last content character,
      // but INSIDE any punctuation at the end of the content
      const beforeText = modifiedAnswer.substring(Math.max(0, insertPosition - 30), insertPosition);
      const lastPunctuation = beforeText.match(/[!。?!.?]$/);

      if (lastPunctuation) {
        // If there's punctuation at the end, insert the marker before it
        insertPosition--;
      }
    } else {
      // Otherwise, avoid placing the marker after trailing newlines or a table pipe
      const chunkEndText = modifiedAnswer.substring(Math.max(0, insertPosition - 5), insertPosition);
      const newlineMatch = chunkEndText.match(/\n+$/);
      const tableEndMatch = chunkEndText.match(/\s*\|\s*$/);

      if (newlineMatch) {
        // Move the insertion position before the newline(s)
        insertPosition -= newlineMatch[0].length;
      } else if (tableEndMatch) {
        // Move the insertion position before the table end pipe
        insertPosition -= tableEndMatch[0].length;
      }
    }

    // Insert the marker
    modifiedAnswer =
      modifiedAnswer.slice(0, insertPosition) +
      marker +
      modifiedAnswer.slice(insertPosition);

    // Update offset for subsequent insertions
    offset += marker.length;
  }

  console.log(`[buildFinalResult] Complete. Generated ${references.length} references`);
  return {
    answer: modifiedAnswer,
    references
  };
}
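
// A hedged usage sketch, not part of the module. It assumes the caller has already
// fetched and chunked web content (WebContent objects exposing `chunks` and `title`,
// the only fields read above) and holds TrackerContext and Schemas instances;
// `draftAnswer` is a hypothetical string.
//
//   const {answer: annotated, references} = await buildReferences(
//     draftAnswer,
//     webContents, // Record<string, WebContent>, keyed by source URL
//     context,
//     schema
//   );
//   // `annotated` contains [^n] markers; each entry of `references` carries
//   // url, title, exactQuote, and the relevance score of the match.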