fix: url datetime guessing

This commit is contained in:
Han Xiao
2025-03-06 17:15:46 +08:00
parent 7bb5b18f2e
commit dbeee0c8f5
3 changed files with 42 additions and 3 deletions

View File

@@ -33,7 +33,7 @@ import {
countUrlParts, countUrlParts,
getUnvisitedURLs, getUnvisitedURLs,
normalizeUrl, sampleMultinomial, normalizeUrl, sampleMultinomial,
weightedURLToString weightedURLToString, getLastModified
} from "./utils/url-tools"; } from "./utils/url-tools";
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools"; import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools";
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas"; import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
@@ -384,10 +384,17 @@ export async function getResponse(question?: string,
return { return {
exactQuote: ref?.exactQuote || '', exactQuote: ref?.exactQuote || '',
title: normalizedUrl ? (allURLs[normalizedUrl]?.title || '') : '', title: normalizedUrl ? (allURLs[normalizedUrl]?.title || '') : '',
url: normalizedUrl url: normalizedUrl,
} }
}); });
// Guess the datetime of every referenced URL in parallel
await Promise.all(thisStep.references.map(async ref => {
ref.dateTime = await getLastModified(ref.url) || ref?.dateTime || ''
}));
console.log('Updated references:', thisStep.references)
if (step === 1 && thisStep.references.length === 0) { if (step === 1 && thisStep.references.length === 0) {
// LLM is so confident and answer immediately, skip all evaluations // LLM is so confident and answer immediately, skip all evaluations
// however, if it does give any reference, it must be evaluated, case study: "How to configure a timeout when loading a huggingface dataset with python?" // however, if it does give any reference, it must be evaluated, case study: "How to configure a timeout when loading a huggingface dataset with python?"

View File

@@ -220,7 +220,7 @@ export class Schemas {
z.object({ z.object({
exactQuote: z.string().describe("Exact relevant quote from the document, must be a soundbite, short and to the point, no fluff").max(30), exactQuote: z.string().describe("Exact relevant quote from the document, must be a soundbite, short and to the point, no fluff").max(30),
url: z.string().describe("source URL; must be copy directly from existing knowledge real URLs, avoid example.com or any placeholder fake URLs").max(100), url: z.string().describe("source URL; must be copy directly from existing knowledge real URLs, avoid example.com or any placeholder fake URLs").max(100),
dateTime: z.string().describe("Apply this evidence hierarchy to determine the source timestamp: (1) Explicit dates in metadata/content, (2) Internal time references, (3) Contextual clues, (4) Version history if available. Format as YYYY-MM-DD when possible; otherwise provide narrowest defensible range with confidence level (High/Medium/Low).").max(16), dateTime: z.string().describe("Apply this evidence hierarchy to determine the source timestamp: (1) Explicit dates in metadata/content, (2) Internal time references, (3) Contextual clues, (4) Version history if available. Format as YYYY-MM-DD when possible; otherwise provide narrowest defensible range.").max(16),
}).required() }).required()
).describe("Required when action='answer'. Must be an array of references that support the answer, each reference must contain an exact quote and the URL of the document"), ).describe("Required when action='answer'. Must be an array of references that support the answer, each reference must contain an exact quote and the URL of the document"),
answer: z.string() answer: z.string()

View File

@@ -297,3 +297,35 @@ export function sampleMultinomial<T>(items: [T, number][]): T | null {
// Fallback (should rarely happen due to floating point precision) // Fallback (should rarely happen due to floating point precision)
return items[items.length - 1][0]; return items[items.length - 1][0];
} }
/**
 * Fetches the last modified date for a URL using the datetime detection API.
 *
 * This is a best-effort lookup: every failure (blank input, network error,
 * non-2xx status, malformed payload) is reported as `null` rather than thrown,
 * so callers can fall back to another source for the date.
 *
 * @param url The URL to check for last modified date
 * @returns Promise containing the API's `bestGuess` string, or null if the URL
 *          is blank, the request fails, or no usable string guess is returned
 */
export async function getLastModified(url: string): Promise<string | null> {
  // Callers may hand over an empty/unnormalized URL at runtime; do not waste a
  // request (or query the API for the literal text "undefined") in that case.
  if (typeof url !== 'string' || url.trim() === '') {
    return null;
  }

  try {
    // Call the API with proper encoding
    const apiUrl = `https://api-beta-datetime.jina.ai?url=${encodeURIComponent(url)}`;
    const response = await fetch(apiUrl);

    if (!response.ok) {
      throw new Error(`API returned ${response.status}`);
    }

    // The payload is external input: treat it as unknown and narrow it before use.
    const data: unknown = await response.json();
    const bestGuess =
      typeof data === 'object' && data !== null
        ? (data as { bestGuess?: unknown }).bestGuess
        : undefined;

    // Only a non-empty string honours the declared return type; anything else
    // (missing field, number, object, empty string) counts as "not found".
    if (typeof bestGuess === 'string' && bestGuess.length > 0) {
      return bestGuess;
    }

    return null;
  } catch (error) {
    console.error('Failed to fetch last modified date:', error);
    return null;
  }
}