fix: url datetime guessing

This commit is contained in:
Han Xiao
2025-03-10 17:35:34 +08:00
parent 43e22cbd8d
commit f264fc01a3
4 changed files with 26 additions and 19 deletions

View File

@@ -1,7 +1,7 @@
import {ZodObject} from 'zod';
import {CoreMessage} from 'ai';
import {SEARCH_PROVIDER, STEP_SLEEP} from "./config";
import {readUrl, removeAllLineBreaks} from "./tools/read";
import {readUrl} from "./tools/read";
import fs from 'fs/promises';
import {SafeSearchType, search as duckSearch} from "duck-duck-scrape";
import {braveSearch} from "./tools/brave-search";
@@ -35,7 +35,13 @@ import {
normalizeUrl, sampleMultinomial,
weightedURLToString, getLastModified
} from "./utils/url-tools";
import {buildMdFromAnswer, chooseK, removeExtraLineBreaks, removeHTMLtags} from "./utils/text-tools";
import {
buildMdFromAnswer,
chooseK,
removeAllLineBreaks,
removeExtraLineBreaks,
removeHTMLtags
} from "./utils/text-tools";
import {MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas} from "./utils/schemas";
async function sleep(ms: number) {
@@ -687,7 +693,7 @@ You decided to think out of the box or cut from a completely different angle.
const urlResults = await Promise.all(
uniqueURLs.map(async url => {
try {
const {response} = await readUrl(url, context.tokenTracker);
const {response} = await readUrl(url, true, context.tokenTracker);
const {data} = response;
const guessedTime = await getLastModified(url);
console.log('Guessed time for', url, guessedTime)

View File

@@ -1,8 +1,9 @@
import {GenerateObjectResult} from 'ai';
import {AnswerAction, EvaluationResponse, EvaluationType, PromptPair, TrackerContext} from '../types';
import {readUrl, removeAllLineBreaks} from "./read";
import {readUrl} from "./read";
import {ObjectGeneratorSafe} from "../utils/safe-generator";
import {Schemas} from "../utils/schemas";
import {removeAllLineBreaks} from "../utils/text-tools";
const TOOL_NAME = 'evaluator';
@@ -696,7 +697,7 @@ async function fetchSourceContent(urls: string[], trackers: TrackerContext, sche
const results = await Promise.all(
urls.map(async (url) => {
try {
const {response} = await readUrl(url, trackers.tokenTracker);
const {response} = await readUrl(url, false, trackers.tokenTracker);
const content = response?.data?.content || '';
return removeAllLineBreaks(content);
} catch (error) {

View File

@@ -3,7 +3,7 @@ import { TokenTracker } from "../utils/token-tracker";
import { ReadResponse } from '../types';
import { JINA_API_KEY } from "../config";
export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response: ReadResponse }> {
export function readUrl(url: string, withAllLinks?: boolean, tracker?: TokenTracker): Promise<{ response: ReadResponse }> {
return new Promise((resolve, reject) => {
if (!url.trim()) {
reject(new Error('URL cannot be empty'));
@@ -11,21 +11,22 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response
}
const data = JSON.stringify({ url });
const headers: Record<string, any> = {
'Accept': 'application/json',
'Authorization': `Bearer ${JINA_API_KEY}`,
'Content-Type': 'application/json',
'X-Retain-Images': 'none',
};
if (withAllLinks) {
headers['X-With-Links-Summary'] = 'all'
}
const options = {
hostname: 'r.jina.ai',
port: 443,
path: '/',
method: 'POST',
headers: {
'Accept': 'application/json',
'Authorization': `Bearer ${JINA_API_KEY}`,
'Content-Type': 'application/json',
'Content-Length': data.length,
'X-Retain-Images': 'none',
'X-With-Links-Summary': 'all',
'X-Timeout': '30'
}
headers
};
const req = https.request(options, (res) => {
@@ -96,8 +97,4 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response
req.write(data);
req.end();
});
}
/**
 * Replaces every line break in `text` with a single space.
 *
 * Recognised breaks are `\r\n`, a lone `\n`, and a lone `\r`. Because `\r\n`
 * is tried first, a Windows-style break produces one space, not two.
 * Back-to-back breaks are not collapsed; each one becomes its own space.
 *
 * @param text - Input that may span several lines.
 * @returns The input with each line break swapped for a space.
 */
export function removeAllLineBreaks(text: string) {
  const anyLineBreak = /(\r\n|\n|\r)/gm;
  const flattened = text.replace(anyLineBreak, " ");
  return flattened;
}

View File

@@ -149,6 +149,9 @@ export function removeHTMLtags(text: string) {
return text.replace(/<[^>]*>?/gm, '');
}
/**
 * Flattens text onto a single line.
 *
 * Each line break (`\r\n`, a lone `\n`, or a lone `\r`) is replaced by one
 * space. A `\r\n` pair counts as a single break, so it yields one space
 * rather than two. Consecutive breaks are not merged: every break yields
 * its own space.
 *
 * @param text - The text to flatten.
 * @returns The text with each line break replaced by a space.
 */
export function removeAllLineBreaks(text: string) {
  const lineBreakPattern = /(\r\n|\n|\r)/gm;
  return text.replace(lineBreakPattern, " ");
}
export function getI18nText(key: string, lang = 'en', params: Record<string, string> = {}) {
// 获取i18n数据