diff --git a/README.md b/README.md index b6036fd..eba2feb 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # DeepResearch -[UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [API](https://jina.ai/deepsearch) | [Evaluation](#evaluation) +[Official UI](https://search.jina.ai/) | [UI Code](https://github.com/jina-ai/deepsearch-ui) | [Official API](https://jina.ai/deepsearch) | [Evaluation](#evaluation) Keep searching, reading webpages, reasoning until an answer is found (or the token budget is exceeded). Useful for deeply investigating a query. @@ -54,15 +54,21 @@ export JINA_API_KEY=jina_... # free jina api key, get from https://jina.ai/read npm run dev $QUERY ``` +### Official Site + +You can try it on [our official site](https://search.jina.ai). + ### Official API -You can also use our official DeepSearch API, hosted and optimized by Jina AI: +You can also use [our official DeepSearch API](https://jina.ai/deepsearch): ``` https://deepsearch.jina.ai/v1/chat/completions ``` -You can use it with any OpenAI-compatible client. For the authentication Bearer, get your Jina API key from https://jina.ai +You can use it with any OpenAI-compatible client. + +For the authentication Bearer token, API key, and rate limits, see https://jina.ai/deepsearch. #### Client integration guidelines diff --git a/src/agent.ts b/src/agent.ts index 2e7e1c8..91fb82c 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -576,7 +576,7 @@ But then you realized you have asked them before. 
You decided to to think out of let {queries: keywordsQueries} = await rewriteQuery(thisStep, context.tokenTracker); // add the original query before rewrite to the keywordsQueries - keywordsQueries.push(thisStep.searchQuery) + keywordsQueries.push(question) const oldKeywords = keywordsQueries; // avoid exisitng searched queries diff --git a/src/tools/evaluator.ts b/src/tools/evaluator.ts index 45745fb..e68bdfd 100644 --- a/src/tools/evaluator.ts +++ b/src/tools/evaluator.ts @@ -21,9 +21,7 @@ const freshnessSchema = z.object({ ...baseSchema, type: z.literal('freshness'), freshness_analysis: z.object({ - likely_outdated: z.boolean().describe('Whether the answer content is likely outdated based on dates and current time'), - dates_mentioned: z.array(z.string()).describe('All dates mentioned in the answer'), - current_time: z.string().describe('Current system time when evaluation was performed'), + days_ago: z.number().describe('Inferred dates or timeframes mentioned in the answer and relative to the current time'), max_age_days: z.number().optional().describe('Maximum allowed age in days before content is considered outdated') }) }); @@ -32,8 +30,6 @@ const pluralitySchema = z.object({ ...baseSchema, type: z.literal('plurality'), plurality_analysis: z.object({ - expects_multiple: z.boolean().describe('Whether the question asks for multiple items'), - provides_multiple: z.boolean().describe('Whether the answer provides multiple items'), count_expected: z.number().optional().describe('Number of items expected if specified in question'), count_provided: z.number().describe('Number of items provided in answer') }) @@ -162,69 +158,61 @@ Answer: ${JSON.stringify(answer)}`; } function getFreshnessPrompt(question: string, answer: string, currentTime: string): string { - return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates and current time. 
+ return `You are an evaluator that analyzes if answer content is likely outdated based on mentioned dates (or implied datetime) and current system time: ${currentTime} -1. Date Analysis: - - Extract all dates mentioned in the answer - - Compare against current system time: ${currentTime} - - Consider content outdated if: - * It refers to a "latest" or "current" state from more than 30 days ago - * It mentions specific dates/events that have been superseded - * It contains time-sensitive information (e.g., "current CEO", "latest version") from more than 60 days ago - - For product versions, releases, or announcements, max age is 30 days - - For company positions, leadership, or general facts, max age is 90 days +Question-Answer Freshness Checker Guidelines -2. Context Hints: - - Words indicating recency: "latest", "current", "newest", "just released", "recently" - - Time-sensitive terms: "CEO", "price", "version", "release" - - Future dates should be ignored in outdated calculation +# Revised QA Type Maximum Age Table + +| QA Type | Max Age (Days) | Notes | +|--------------------------|--------------|-----------------------------------------------------------------------| +| Financial Data (Real-time)| 0.1 | Stock prices, exchange rates, crypto (real-time preferred) | +| Breaking News | 1 | Immediate coverage of major events | +| News/Current Events | 1 | Time-sensitive news, politics, or global events | +| Weather Forecasts | 1 | Accuracy drops significantly after 24 hours | +| Sports Scores/Events | 1 | Live updates required for ongoing matches | +| Security Advisories | 1 | Critical security updates and patches | +| Social Media Trends | 1 | Viral content, hashtags, memes | +| Cybersecurity Threats | 7 | Rapidly evolving vulnerabilities/patches | +| Tech News | 7 | Technology industry updates and announcements | +| Political Developments | 7 | Legislative changes, political statements | +| Political Elections | 7 | Poll results, candidate updates | +| 
Sales/Promotions | 7 | Limited-time offers and marketing campaigns | +| Travel Restrictions | 7 | Visa rules, pandemic-related policies | +| Entertainment News | 14 | Celebrity updates, industry announcements | +| Product Launches | 14 | New product announcements and releases | +| Market Analysis | 14 | Market trends and competitive landscape | +| Competitive Intelligence | 21 | Analysis of competitor activities and market position | +| Product Recalls | 30 | Safety alerts or recalls from manufacturers | +| Industry Reports | 30 | Sector-specific analysis and forecasting | +| Software Version Info | 30 | Updates, patches, and compatibility information | +| Legal/Regulatory Updates | 30 | Laws, compliance rules (jurisdiction-dependent) | +| Economic Forecasts | 30 | Macroeconomic predictions and analysis | +| Consumer Trends | 45 | Shifting consumer preferences and behaviors | +| Scientific Discoveries | 60 | New research findings and breakthroughs (includes all scientific research) | +| Healthcare Guidelines | 60 | Medical recommendations and best practices (includes medical guidelines)| +| Environmental Reports | 60 | Climate and environmental status updates | +| Best Practices | 90 | Industry standards and recommended procedures | +| API Documentation | 90 | Technical specifications and implementation guides | +| Tutorial Content | 180 | How-to guides and instructional materials (includes educational content)| +| Tech Product Info | 180 | Product specs, release dates, or pricing | +| Statistical Data | 180 | Demographic and statistical information | +| Reference Material | 180 | General reference information and resources | +| Historical Content | 365 | Events and information from the past year | +| Cultural Trends | 730 | Shifts in language, fashion, or social norms | +| Entertainment Releases | 730 | Movie/TV show schedules, media catalogs | +| Factual Knowledge | ∞ | Static facts (e.g., historical events, geography, physical constants) | + +### Implementation 
Notes: +1. **Contextual Adjustment**: Freshness requirements may change during crises or rapid developments in specific domains. +2. **Tiered Approach**: Consider implementing urgency levels (critical, important, standard) alongside age thresholds. +3. **User Preferences**: Allow customization of thresholds for specific query types or user needs. +4. **Source Reliability**: Pair freshness metrics with source credibility scores for better quality assessment. +5. **Domain Specificity**: Some specialized fields (medical research during pandemics, financial data during market volatility) may require dynamically adjusted thresholds. +6. **Geographic Relevance**: Regional considerations may alter freshness requirements for local regulations or events. - -Question: "What was Jina AI's closing stock price yesterday?" -Answer: "Jina AI's stock closed at $45.30 per share at yesterday's market close." -Current Time: "2024-03-07T14:30:00Z" -Evaluation: { - "pass": true, - "think": "The question specifically asks for yesterday's closing price, and the answer provides exactly that information. Since it's asking for a historical data point rather than current price, yesterday's closing price is the correct timeframe.", - "freshness_analysis": { - "likely_outdated": false, - "dates_mentioned": ["2024-03-06"], - "current_time": "2024-03-07T14:30:00Z", - "max_age_days": 1 - } -} - -Question: "What is Jina AI's latest embedding model?" -Answer: "The latest embedding model from Jina AI is jina-embeddings-v2, released on March 15, 2024." -Current Time: "2024-10-06T00:00:00Z" -Evaluation: { - "pass": false, - "think": "The answer refers to a 'latest' model release from over 6 months ago, which is likely outdated for product version information", - "freshness_analysis": { - "likely_outdated": true, - "dates_mentioned": ["2024-03-15"], - "current_time": "2024-10-06T00:00:00Z", - "max_age_days": 30 - } -} - -Question: "Who is OpenAI's CEO?" 
-Answer: "Sam Altman is the CEO of OpenAI as of December 2023." -Current Time: "2024-02-06T00:00:00Z" -Evaluation: { - "pass": true, - "think": "The answer is about company leadership and is within the 60-day threshold for such information", - "freshness_analysis": { - "likely_outdated": false, - "dates_mentioned": ["2023-12"], - "current_time": "2024-02-06T00:00:00Z", - "max_age_days": 90 - } -} - - Now evaluate this pair: Question: ${JSON.stringify(question)} Answer: ${JSON.stringify(answer)}`; @@ -234,77 +222,38 @@ function getPluralityPrompt(question: string, answer: string): string { return `You are an evaluator that analyzes if answers provide the appropriate number of items requested in the question. -1. Question Analysis: - - Check if question asks for multiple items using indicators like: - * Plural nouns: "companies", "people", "names" - * Quantifiers: "all", "many", "several", "various", "multiple" - * List requests: "list", "enumerate", "name all", "give me all" - * Numbers: "5 examples", "top 10" - - Otherwise skip the analysis and return pass to true +Question Type Reference Table -2. Answer Analysis: - - Count distinct items provided in the answer - - Check if answer uses limiting words like "only", "just", "single" - - Identify if answer acknowledges there are more items but only provides some - -3. Definitiveness Rules: - - If question asks for multiple items but answer provides only one → NOT definitive - - If question asks for specific number (e.g., "top 5") but answer provides fewer → NOT definitive - - If answer clearly states it's providing a partial list → NOT definitive - - If question asks for "all" or "every" but answer seems incomplete → NOT definitive +| Question Type | Expected Items | Evaluation Rules | +|---------------|----------------|------------------| +| Explicit Count | Exact match to number specified | Provide exactly the requested number of distinct, non-redundant items relevant to the query. 
| +| Numeric Range | Any number within specified range | Ensure count falls within given range with distinct, non-redundant items. For "at least N" queries, meet minimum threshold. | +| Implied Multiple | ≥ 2 | Provide multiple items (typically 2-4 unless context suggests more) with balanced detail and importance. | +| "Few" | 2-4 | Offer 2-4 substantive items prioritizing quality over quantity. | +| "Several" | 3-7 | Include 3-7 items with comprehensive yet focused coverage, each with brief explanation. | +| "Many" | 7+ | Present 7+ items demonstrating breadth, with concise descriptions per item. | +| "Most important" | Top 3-5 by relevance | Prioritize by importance, explain ranking criteria, and order items by significance. | +| "Top N" | Exactly N, ranked | Provide exactly N items ordered by importance/relevance with clear ranking criteria. | +| "Pros and Cons" | ≥ 2 of each category | Present balanced perspectives with at least 2 items per category addressing different aspects. | +| "Compare X and Y" | ≥ 3 comparison points | Address at least 3 distinct comparison dimensions with balanced treatment covering major differences/similarities. | +| "Steps" or "Process" | All essential steps | Include all critical steps in logical order without missing dependencies. | +| "Examples" | ≥ 3 unless specified | Provide at least 3 diverse, representative, concrete examples unless count specified. | +| "Comprehensive" | 10+ | Deliver extensive coverage (10+ items) across major categories/subcategories demonstrating domain expertise. | +| "Brief" or "Quick" | 1-3 | Present concise content (1-3 items) focusing on most important elements described efficiently. | +| "Complete" | All relevant items | Provide exhaustive coverage within reasonable scope without major omissions, using categorization if needed. | +| "Thorough" | 7-10 | Offer detailed coverage addressing main topics and subtopics with both breadth and depth. 
| +| "Overview" | 3-5 | Cover main concepts/aspects with balanced coverage focused on fundamental understanding. | +| "Summary" | 3-5 key points | Distill essential information capturing main takeaways concisely yet comprehensively. | +| "Main" or "Key" | 3-7 | Focus on most significant elements fundamental to understanding, covering distinct aspects. | +| "Essential" | 3-7 | Include only critical, necessary items without peripheral or optional elements. | +| "Basic" | 2-5 | Present foundational concepts accessible to beginners focusing on core principles. | +| "Detailed" | 5-10 with elaboration | Provide in-depth coverage with explanations beyond listing, including specific information and nuance. | +| "Common" | 4-8 most frequent | Focus on typical or prevalent items, ordered by frequency when possible, that are widely recognized. | +| "Primary" | 2-5 most important | Focus on dominant factors with explanation of their primacy and outsized impact. | +| "Secondary" | 3-7 supporting items | Present important but not critical items that complement primary factors and provide additional context. | +| Unspecified Analysis | 3-5 key points | Default to 3-5 main points covering primary aspects with balanced breadth and depth. | - -Question: "Who works in Jina AI's sales team?" -Answer: "John Smith is a sales representative at Jina AI." -Evaluation: { - "pass": true, - "think": "The question doesn't specifically ask for multiple team members, so a single name can be considered a definitive answer.", - "plurality_analysis": { - "expects_multiple": false, - "provides_multiple": false, - "count_provided": 1 - } -} - -Question: "List all the salespeople who work at Jina AI" -Answer: "John Smith is a sales representative at Jina AI." 
-Evaluation: { - "pass": false, - "think": "The question asks for 'all salespeople' but the answer only provides one name without indicating if this is the complete list.", - "plurality_analysis": { - "expects_multiple": true, - "provides_multiple": false, - "count_provided": 1 - } -} - -Question: "Name the top 3 products sold by Jina AI" -Answer: "Jina AI's product lineup includes DocArray and Jina." -Evaluation: { - "pass": false, - "think": "The question asks for top 3 products but only 2 are provided.", - "plurality_analysis": { - "expects_multiple": true, - "provides_multiple": true, - "count_expected": 3, - "count_provided": 2 - } -} - -Question: "List as many AI companies in Berlin as you can find" -Answer: "Here are several AI companies in Berlin: Ada Health, Merantix, DeepL, Understand.ai, and Zeitgold. There are many more AI companies in Berlin, but these are some notable examples." -Evaluation: { - "pass": false, - "think": "While the answer provides multiple companies, it explicitly states it's an incomplete list when the question asks to list as many as possible.", - "plurality_analysis": { - "expects_multiple": true, - "provides_multiple": true, - "count_provided": 5 - } -} - - Now evaluate this pair: Question: ${JSON.stringify(question)} Answer: ${JSON.stringify(answer)}`; diff --git a/src/tools/read.ts b/src/tools/read.ts index fe076be..89a8c29 100644 --- a/src/tools/read.ts +++ b/src/tools/read.ts @@ -23,7 +23,8 @@ export function readUrl(url: string, tracker?: TokenTracker): Promise<{ response 'Content-Type': 'application/json', 'Content-Length': data.length, 'X-Retain-Images': 'none', - 'X-Return-Format': 'markdown' + 'X-Return-Format': 'markdown', + 'X-Engine': 'direct' } }; diff --git a/src/types.ts b/src/types.ts index 89e797c..21d8a05 100644 --- a/src/types.ts +++ b/src/types.ts @@ -143,14 +143,10 @@ export type EvaluationResponse = { think: string; type?: 'definitive' | 'freshness' | 'plurality' | 'attribution'; freshness_analysis?: { - 
likely_outdated: boolean; - dates_mentioned: string[]; - current_time: string; + days_ago: number; max_age_days?: number; }; plurality_analysis?: { - expects_multiple: boolean; - provides_multiple: boolean; count_expected?: number; count_provided: number; };