feat: gather images to response (#98)

* feat: add image tools * rank images * add image dedup * wip * wip * remove rank functions * fix * add embeddings to image * move image object to agent * build image references * update * add with_images param * update dimensions for image tools * dedup images * save images to cloud storage * remove extra log * fix * remove test data * fix
2025-12-25 22:16:49 +08:00 · 2025-06-10 11:55:46 +08:00 · 2025-06-10 11:55:46 +08:00 · a768755783
commit a768755783
parent 77c96c07fa
11 changed files with 1433 additions and 75 deletions
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -28,6 +28,8 @@
  "dependencies": {
    "@ai-sdk/google": "^1.0.0",
    "@ai-sdk/openai": "^1.1.9",
+    "@google-cloud/storage": "^7.16.0",
+    "@napi-rs/canvas": "^0.1.68",
    "@types/jsdom": "^21.1.7",
    "ai": "^4.1.26",
    "axios": "^1.7.9",
--- a/src/agent.ts
+++ b/src/agent.ts
@ -16,7 +16,9 @@ import {
  KnowledgeItem,
  EvaluationType,
  BoostedSearchSnippet,
-  SearchSnippet, EvaluationResponse, Reference, SERPQuery, RepeatEvaluationType, UnNormalizedSearchSnippet, WebContent
+  SearchSnippet, EvaluationResponse, Reference, SERPQuery, RepeatEvaluationType, UnNormalizedSearchSnippet, WebContent,
+  ImageObject,
+  ImageReference
 } from "./types";
 import { TrackerContext } from "./types";
 import { search } from "./tools/jina-search";
@ -41,7 +43,7 @@ import {
 import { MAX_QUERIES_PER_STEP, MAX_REFLECT_PER_STEP, MAX_URLS_PER_STEP, Schemas } from "./utils/schemas";
 import { formatDateBasedOnType, formatDateRange } from "./utils/date-tools";
 import { reviseAnswer } from "./tools/md-fixer";
-import { buildReferences } from "./tools/build-ref";
+import { buildImageReferences, buildReferences } from "./tools/build-ref";

 async function sleep(ms: number) {
  const seconds = Math.ceil(ms / 1000);
@ -391,8 +393,9 @@ export async function getResponse(question?: string,
  minRelScore: number = 0.85,
  languageCode: string | undefined = undefined,
  searchLanguageCode?: string,
-  searchProvider?: string
-): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[] }> {
+  searchProvider?: string,
+  with_images: boolean = false
+): Promise<{ result: StepAction; context: TrackerContext; visitedURLs: string[], readURLs: string[], allURLs: string[], allImages?: string[], relatedImages?: string[] }> {

  let step = 0;
  let totalStep = 0;
@ -451,6 +454,7 @@ export async function getResponse(question?: string,
  const allWebContents: Record<string, WebContent> = {};
  const visitedURLs: string[] = [];
  const badURLs: string[] = [];
+  const imageObjects: ImageObject[] = [];
  const evaluationMetrics: Record<string, RepeatEvaluationType[]> = {};
  // reserve the 10% final budget for the beast mode
  const regularBudget = tokenBudget * 0.85;
@ -859,9 +863,11 @@ You decided to think out of the box or cut from a completely different angle.
          allURLs,
          visitedURLs,
          badURLs,
+          imageObjects,
          SchemaGen,
          currentQuestion,
-          allWebContents
+          allWebContents,
+          with_images
        );

        diaryContext.push(success
@ -1017,7 +1023,16 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
    answerStep.mdAnswer = buildMdFromAnswer(answerStep);
  }

-  console.log(thisStep)
+  let imageReferences: ImageReference[] = [];
+  if(imageObjects.length && with_images) {
+    try {
+      imageReferences = await buildImageReferences(answerStep.answer, imageObjects, context, SchemaGen);
+      console.log('Image references built:', imageReferences);
+    } catch (error) {
+      console.error('Error building image references:', error);
+      imageReferences = [];
+    }
+  }

  // max return 300 urls
  const returnedURLs = weightedURLs.slice(0, numReturnedURLs).map(r => r.url);
@ -1026,7 +1041,9 @@ But unfortunately, you failed to solve the issue. You need to think out of the b
    context,
    visitedURLs: returnedURLs,
    readURLs: visitedURLs.filter(url => !badURLs.includes(url)),
-    allURLs: weightedURLs.map(r => r.url)
+    allURLs: weightedURLs.map(r => r.url),
+    allImages: with_images ? imageObjects.map(i => i.url) : undefined,
+    relatedImages: with_images ? imageReferences.map(i => i.url) : undefined,
  };
 }

--- a/src/app.ts
+++ b/src/app.ts
@ -7,7 +7,7 @@ import {
  ChatCompletionResponse,
  ChatCompletionChunk,
  AnswerAction,
-  Model, StepAction, VisitAction
+  Model, StepAction, VisitAction,
 } from './types';
 import { TokenTracker } from "./utils/token-tracker";
 import { ActionTracker } from "./utils/action-tracker";
@ -522,7 +522,7 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
      // Add content to queue for both thinking steps and final answer
      if (step.action === 'visit') {
        // emit every url in the visit action in url field
-        ((step as VisitAction).URLTargets as string[]).forEach((url) => {
+        ((step as VisitAction).URLTargets as string[])?.forEach((url) => {
          const chunk: ChatCompletionChunk = {
            id: requestId,
            object: 'chat.completion.chunk',
@ -568,7 +568,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
      result: finalStep,
      visitedURLs,
      readURLs,
-      allURLs
+      allURLs,
+      allImages,
+      relatedImages,
    } = await getResponse(undefined,
      tokenBudget,
      maxBadAttempts,
@ -583,7 +585,8 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
      body.min_annotation_relevance,
      body.language_code,
      body.search_language_code,
-      body.search_provider
+      body.search_provider,
+      body.with_images
    )
    let finalAnswer = (finalStep as AnswerAction).mdAnswer;

@ -656,7 +659,8 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
        usage,
        visitedURLs,
        readURLs,
-        numURLs: allURLs.length
+        numURLs: allURLs.length,
+        relatedImages
      };
      res.write(`data: ${JSON.stringify(finalChunk)}\n\n`);
      res.end();
@ -682,7 +686,8 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
        usage,
        visitedURLs,
        readURLs,
-        numURLs: allURLs.length
+        numURLs: allURLs.length,
+        relatedImages,
      };

      // Log final response (excluding full content for brevity)
@ -693,7 +698,9 @@ app.post('/v1/chat/completions', (async (req: Request, res: Response) => {
        usage: response.usage,
        visitedURLs: response.visitedURLs,
        readURLs: response.readURLs,
-        numURLs: allURLs.length
+        numURLs: allURLs.length,
+        allImages: allImages?.length,
+        relatedImages: relatedImages?.length,
      });

      res.json(response);
--- a/src/tools/build-ref.ts
+++ b/src/tools/build-ref.ts
@ -1,8 +1,9 @@
 import {segmentText} from './segment';
-import {Reference, TrackerContext, WebContent} from "../types";
+import {ImageObject, ImageReference, Reference, TrackerContext, WebContent} from "../types";
 import {Schemas} from "../utils/schemas";
 import {cosineSimilarity, jaccardRank} from "./cosine";
 import {getEmbeddings} from "./embeddings";
+import { dedupImagesWithEmbeddings } from '../utils/image-tools';
 import {normalizeHostName} from '../utils/url-tools';

 export async function buildReferences(
@ -366,4 +367,184 @@ function buildFinalResult(
    answer: modifiedAnswer,
    references
  };
+}
+
+/**
+ * Pairs images gathered during research with the answer passages they best match.
+ *
+ * The answer is segmented into chunks, chunks shorter than `minChunkLength` are dropped,
+ * the remaining chunks are embedded with `jina-clip-v2`, and every chunk is scored against
+ * every (deduplicated) image embedding with cosine similarity. Matches are then picked
+ * greedily by descending score so that each image and each answer chunk is used at most once.
+ *
+ * @param answer         Final answer text to illustrate.
+ * @param imageObjects   Candidate images; the first vector of `embedding` is used for scoring.
+ * @param context        Tracker context (token accounting and action tracking).
+ * @param schema         Schema generator; only `languageCode` is read here.
+ * @param minChunkLength Minimum character length for an answer chunk to be considered.
+ * @param maxRef         Maximum number of image references to return.
+ * @param minRelScore    Minimum cosine similarity for a match to be kept.
+ * @returns Selected image references, best match first; an empty array when there is nothing
+ *          to match or when embedding the answer fails.
+ */
+export async function buildImageReferences(
+  answer: string,
+  imageObjects: ImageObject[],
+  context: TrackerContext,
+  schema: Schemas,
+  minChunkLength: number = 80,
+  maxRef: number = 10,
+  minRelScore: number = 0.35
+): Promise<Array<ImageReference>> {
+  type ImageMatch = {
+      imageIndex: number;
+      answerChunkIndex: number;
+      relevanceScore: number;
+      answerChunk: string;
+      answerChunkPosition: [number, number];
+  };
+
+  console.log(`[buildImageReferences] Starting with maxRef=${maxRef}, minChunkLength=${minChunkLength}, minRelScore=${minRelScore}`);
+  console.log(`[buildImageReferences] Answer length: ${answer.length} chars, Image sources: ${imageObjects.length}`);
+
+  // Step 1: Chunk the answer
+  console.log(`[buildImageReferences] Step 1: Chunking answer text`);
+  const {chunks: answerChunks, chunk_positions: answerChunkPositions} = await segmentText(answer, context);
+  console.log(`[buildImageReferences] Answer segmented into ${answerChunks.length} chunks`);
+
+  // Step 2: Prepare image content
+  console.log(`[buildImageReferences] Step 2: Preparing image content`);
+  const dedupedImages = dedupImagesWithEmbeddings(imageObjects, []);
+  // Only images that actually carry a vector can be scored; skip the rest up front.
+  const embeddedImages = dedupedImages.flatMap(img => {
+      const vector = img.embedding?.[0];
+      return vector?.length ? [{url: img.url, vector}] : [];
+  });
+
+  console.log(`[buildImageReferences] Collected ${embeddedImages.length} image embeddings`);
+
+  if (embeddedImages.length === 0) {
+      console.log(`[buildImageReferences] No image data available, returning empty array`);
+      return [];
+  }
+
+  // Step 3: Filter answer chunks by minimum length
+  console.log(`[buildImageReferences] Step 3: Filtering answer chunks by minimum length`);
+  context.actionTracker.trackThink('cross_reference', schema.languageCode);
+
+  // Text, original index and position travel together so they cannot drift apart after filtering.
+  const validChunks: {index: number; text: string; position: [number, number]}[] = [];
+  for (let i = 0; i < answerChunks.length; i++) {
+      const text = answerChunks[i];
+      if (!text.trim() || text.length < minChunkLength) continue;
+      validChunks.push({index: i, text, position: answerChunkPositions[i]});
+  }
+
+  console.log(`[buildImageReferences] Found ${validChunks.length}/${answerChunks.length} valid answer chunks above minimum length`);
+
+  if (validChunks.length === 0) {
+      console.log(`[buildImageReferences] No valid answer chunks, returning empty array`);
+      return [];
+  }
+
+  // Step 4: Get embeddings for answer chunks
+  console.log(`[buildImageReferences] Step 4: Getting embeddings for answer chunks`);
+
+  try {
+      // Same model and dimensionality as the image vectors so both live in one embedding space.
+      const {embeddings: answerEmbeddings} = await getEmbeddings(validChunks.map(chunk => chunk.text), context.tokenTracker, {
+          dimensions: 1024,
+          model: 'jina-clip-v2',
+      });
+
+      console.log(`[buildImageReferences] Got embeddings for ${answerEmbeddings.length} answer chunks`);
+
+      // Step 5: Compute pairwise cosine similarity
+      console.log(`[buildImageReferences] Step 5: Computing pairwise cosine similarity between answer and image embeddings`);
+      const allMatches: ImageMatch[] = [];
+
+      validChunks.forEach((chunk, i) => {
+          const answerEmbedding = answerEmbeddings[i];
+          if (!answerEmbedding) return;
+
+          const matchesForChunk = embeddedImages
+              .map((image, imageIndex) => ({
+                  imageIndex,
+                  relevanceScore: cosineSimilarity(answerEmbedding, image.vector)
+              }))
+              .sort((a, b) => b.relevanceScore - a.relevanceScore);
+
+          for (const match of matchesForChunk) {
+              allMatches.push({
+                  imageIndex: match.imageIndex,
+                  answerChunkIndex: chunk.index,
+                  relevanceScore: match.relevanceScore,
+                  answerChunk: chunk.text,
+                  // Position of THIS chunk in the answer, not of the i-th unfiltered chunk.
+                  answerChunkPosition: chunk.position
+              });
+          }
+
+          console.log(`[buildImageReferences] Processed answer chunk ${i + 1}/${validChunks.length}, top score: ${matchesForChunk[0]?.relevanceScore.toFixed(4)}`);
+      });
+
+      // Log statistics about relevance scores (single pass; spreading a large array into
+      // Math.min/Math.max can exceed the engine's argument limit).
+      if (allMatches.length > 0) {
+          let minRelevance = Infinity;
+          let maxRelevance = -Infinity;
+          let sumRelevance = 0;
+          for (const {relevanceScore} of allMatches) {
+              if (relevanceScore < minRelevance) minRelevance = relevanceScore;
+              if (relevanceScore > maxRelevance) maxRelevance = relevanceScore;
+              sumRelevance += relevanceScore;
+          }
+
+          console.log('Reference relevance statistics:', {
+              min: minRelevance.toFixed(4),
+              max: maxRelevance.toFixed(4),
+              mean: (sumRelevance / allMatches.length).toFixed(4),
+              count: allMatches.length
+          });
+      }
+
+      // Step 6: Sort all matches by relevance
+      allMatches.sort((a, b) => b.relevanceScore - a.relevanceScore);
+      console.log(`[buildImageReferences] Step 6: Sorted ${allMatches.length} potential matches by relevance score`);
+
+      // Step 7: Filter matches
+      console.log(`[buildImageReferences] Step 7: Filtering matches to ensure uniqueness and threshold (min: ${minRelScore})`);
+      const usedImages = new Set<number>();
+      const usedAnswerChunks = new Set<number>();
+      const filteredMatches: ImageMatch[] = [];
+
+      for (const match of allMatches) {
+          if (match.relevanceScore < minRelScore) continue;
+          if (usedImages.has(match.imageIndex) || usedAnswerChunks.has(match.answerChunkIndex)) continue;
+
+          filteredMatches.push(match);
+          usedImages.add(match.imageIndex);
+          usedAnswerChunks.add(match.answerChunkIndex);
+
+          if (filteredMatches.length >= maxRef) break;
+      }
+
+      console.log(`[buildImageReferences] Selected ${filteredMatches.length}/${allMatches.length} references after filtering`);
+
+      return filteredMatches.map((match): ImageReference => ({
+          url: embeddedImages[match.imageIndex].url,
+          relevanceScore: match.relevanceScore,
+          answerChunk: match.answerChunk,
+          answerChunkPosition: match.answerChunkPosition
+      }));
+
+  } catch (error) {
+      console.error('Embedding failed', error);
+      return [];
+  }
 }
--- a/src/tools/embeddings.ts
+++ b/src/tools/embeddings.ts
@ -8,13 +8,14 @@ const MAX_RETRIES = 3; // Maximum number of retries for missing embeddings

 // Modified to support different embedding tasks and dimensions
 export async function getEmbeddings(
-  texts: string[],
+  texts: string[] | Record<string, string>[],
  tokenTracker?: any,
  options: {
    task?: "text-matching" | "retrieval.passage" | "retrieval.query",
    dimensions?: number,
    late_chunking?: boolean,
-    embedding_type?: string
+    embedding_type?: string,
+    model?: string,
  } = {}
 ): Promise<{ embeddings: number[][], tokens: number }> {
  console.log(`[embeddings] Getting embeddings for ${texts.length} texts`);
@ -66,12 +67,13 @@ export async function getEmbeddings(

 // Helper function to get embeddings for a batch with retry logic for missing indices
 async function getBatchEmbeddingsWithRetry(
-  batchTexts: string[],
+  batchTexts: string[] | Record<string, string>[],
  options: {
    task?: "text-matching" | "retrieval.passage" | "retrieval.query",
    dimensions?: number,
    late_chunking?: boolean,
-    embedding_type?: string
+    embedding_type?: string,
+    model?: string,
  },
  currentBatch: number,
  batchCount: number
@ -89,12 +91,15 @@ async function getBatchEmbeddingsWithRetry(

  while (textsToProcess.length > 0 && retryCount < MAX_RETRIES) {
    const request: JinaEmbeddingRequest = {
-      model: "jina-embeddings-v3",
-      task: options.task || "text-matching",
-      input: textsToProcess,
-      truncate: true,
+      model: options.model || "jina-embeddings-v3",
+      input: textsToProcess as any,
    };

+    if (request.model === "jina-embeddings-v3") {
+      request.task = options.task || "text-matching";
+      request.truncate = true;
+    }
+
    // Add optional parameters if provided
    if (options.dimensions) request.dimensions = options.dimensions;
    if (options.late_chunking) request.late_chunking = options.late_chunking;
@ -110,7 +115,7 @@ async function getBatchEmbeddingsWithRetry(
            "Authorization": `Bearer ${JINA_API_KEY}`
          }
        }
-      );
+      ); 

      if (!response.data.data) {
        console.error('No data returned from Jina API');
@ -118,7 +123,7 @@ async function getBatchEmbeddingsWithRetry(
          // On last retry, create placeholder embeddings
          const dimensionSize = options.dimensions || 1024;
          const placeholderEmbeddings = textsToProcess.map(text => {
-            console.error(`Failed to get embedding after all retries: [${text.substring(0, 50)}...]`);
+            console.error(`Failed to get embedding after all retries: [${truncateInputString(text)}...]`);
            return new Array(dimensionSize).fill(0);
          });
          
@ -140,7 +145,7 @@ async function getBatchEmbeddingsWithRetry(
      
      // Process successful embeddings
      const successfulEmbeddings: number[][] = [];
-      const remainingTexts: string[] = [];
+      const remainingTexts: (string | Record<string, string>)[] = [];
      const newIndexMap = new Map<number, number>();
      
      for (let idx = 0; idx < textsToProcess.length; idx++) {
@ -160,7 +165,7 @@ async function getBatchEmbeddingsWithRetry(
          const newIndex = remainingTexts.length;
          newIndexMap.set(newIndex, indexMap.get(idx)!);
          remainingTexts.push(textsToProcess[idx]);
-          console.log(`Missing embedding for index ${idx}, will retry: [${textsToProcess[idx].substring(0, 50)}...]`);
+          console.log(`Missing embedding for index ${idx}, will retry: [${truncateInputString(textsToProcess[idx])}...]`);
        }
      }

@ -190,7 +195,7 @@ async function getBatchEmbeddingsWithRetry(
        const dimensionSize = options.dimensions || 1024;
        for (let idx = 0; idx < textsToProcess.length; idx++) {
          const originalIndex = indexMap.get(idx)!;
-          console.error(`Failed to get embedding after all retries for index ${originalIndex}: [${textsToProcess[idx].substring(0, 50)}...]`);
+          console.error(`Failed to get embedding after all retries for index ${originalIndex}: [${truncateInputString(textsToProcess[idx])}...]`);
          
          while (batchEmbeddings.length <= originalIndex) {
            batchEmbeddings.push([]);
@ -228,3 +233,11 @@ async function getBatchEmbeddingsWithRetry(
  
  return { batchEmbeddings, batchTokens };
 }
+
+/**
+ * Produces a short preview (at most 50 characters) of an embedding input for log messages.
+ *
+ * @param input Either a plain text input or a record input such as `{ image: <base64> }`;
+ *              for records the first value is previewed.
+ * @returns The preview, or an empty string when the record has no string value to show.
+ */
+function truncateInputString(input: string | Record<string, string>): string {
+  const text = typeof input === 'string' ? input : Object.values(input)[0];
+  // This only feeds log lines: an empty or malformed record must not turn logging into a crash.
+  return typeof text === 'string' ? text.slice(0, 50) : '';
+}
--- a/src/tools/jina-dedup.ts
+++ b/src/tools/jina-dedup.ts
@ -78,4 +78,4 @@ export async function dedupQueries(
      unique_queries: newQueries,
    };
  }
-}
+}
--- a/src/tools/read.ts
+++ b/src/tools/read.ts
@ -6,7 +6,8 @@ import axiosClient from "../utils/axios-client";
 export async function readUrl(
  url: string,
  withAllLinks?: boolean,
-  tracker?: TokenTracker
+  tracker?: TokenTracker,
+  withAllImages?: boolean
 ): Promise<{ response: ReadResponse }> {
  if (!url.trim()) {
    throw new Error('URL cannot be empty');
@ -20,7 +21,6 @@ export async function readUrl(
    'Accept': 'application/json',
    'Authorization': `Bearer ${JINA_API_KEY}`,
    'Content-Type': 'application/json',
-    'X-Retain-Images': 'none',
    'X-Md-Link-Style': 'discarded',
  };

@ -28,6 +28,12 @@ export async function readUrl(
    headers['X-With-Links-Summary'] = 'all';
  }

+  if (withAllImages) {
+    headers['X-With-Images-Summary'] = 'true'
+  } else {
+    headers['X-Retain-Images'] = 'none'
+  }
+
  try {
    // Use axios which handles encoding properly
    const { data } = await axiosClient.post<ReadResponse>(
--- a/src/types.ts
+++ b/src/types.ts
@ -27,6 +27,14 @@ export type Reference = {
  answerChunkPosition?: number[];
 }

+/** An image that buildImageReferences selected as a match for one passage of the answer. */
+export type ImageReference = {
+  /** URL of the matched image. */
+  url: string;
+  /** NOTE(review): not populated by any code in this change — intended meaning to be confirmed. */
+  dateTime?: string;
+  /** Cosine similarity between the answer chunk embedding and the image embedding. */
+  relevanceScore?: number;
+  /** Text of the answer chunk the image was matched to. */
+  answerChunk?: string;
+  /** Position of that chunk as reported by segmentText (chunk_positions entry). */
+  answerChunkPosition?: number[];
+}
+
 export type AnswerAction = BaseAction & {
  action: "answer";
  answer: string;
@ -53,6 +61,7 @@ export type ReflectAction = BaseAction & {
 export type VisitAction = BaseAction & {
  action: "visit";
  URLTargets: number[] | string[];
+  image?: ImageObject;
 };

 export type CodingAction = BaseAction & {
@ -155,6 +164,7 @@ export interface ReadResponse {
    content: string;
    usage: { tokens: number; };
    links: Array<[string, string]>; // [anchor, url]
+    images: Record<string, string>; // { image: url }
  };
  name?: string;
  message?: string;
@ -259,6 +269,8 @@ export interface ChatCompletionRequest {

  max_annotations?: number;
  min_annotation_relevance?: number;
+
+  with_images?: boolean;
  language_code?: string;
  search_language_code?: string;
  search_provider?: string;
@ -294,6 +306,8 @@ export interface ChatCompletionResponse {
  visitedURLs?: string[];
  readURLs?: string[];
  numURLs?: number;
+  allImages?: string[];
+  relatedImages?: string[];
 }

 export interface ChatCompletionChunk {
@ -318,6 +332,8 @@ export interface ChatCompletionChunk {
  visitedURLs?: string[];
  readURLs?: string[];
  numURLs?: number;
+  allImages?: string[];
+  relatedImages?: string[];
 }

 // Tracker Types
@ -336,11 +352,11 @@ export interface TrackerContext {
 // Interface definitions for Jina API
 export interface JinaEmbeddingRequest {
  model: string;
-  task: string;
+  task?: string;
  late_chunking?: boolean;
  dimensions?: number;
  embedding_type?: string;
-  input: string[];
+  input: string[] | Record<string, string>[];
  truncate?: boolean;
 }

@ -356,4 +372,10 @@ export interface JinaEmbeddingResponse {
    index: number;
    embedding: number[];
  }>;
+}
+
+/** An image collected while reading pages, together with its embedding. */
+export type ImageObject = {
+  /** Image URL; processImage stores the uploaded copy's URL when the upload succeeds, else the source URL. */
+  url: string;
+  /** Optional alt text; processImage does not set it. */
+  alt?: string;
+  /** Embeddings array as returned by getEmbeddings for this single image; consumers read `embedding[0]`. */
+  embedding: number[][];
 }
--- a/src/utils/image-tools.ts
+++ b/src/utils/image-tools.ts
@ -0,0 +1,233 @@
+import canvas from '@napi-rs/canvas';
+import { getEmbeddings } from '../tools/embeddings';
+import { TokenTracker } from './token-tracker';
+import { ImageObject } from '../types';
+import { cosineSimilarity } from '../tools/cosine';
+export type { Canvas, Image } from '@napi-rs/canvas';
+import { Storage } from '@google-cloud/storage';
+import { randomUUID } from 'crypto';
+
+/**
+ * Downloads a resource into memory.
+ *
+ * @param uri Absolute URL to fetch.
+ * @returns The raw bytes and the response's `content-type` header (null when absent).
+ * @throws Error when the response is not OK / has no body, or when the payload exceeds 100 MiB.
+ */
+export const downloadFile = async (uri: string) => {
+    const maxBytes = 1024 * 1024 * 100;
+
+    const resp = await fetch(uri);
+    if (!(resp.ok && resp.body)) {
+        throw new Error(`Unexpected response ${resp.statusText}`);
+    }
+
+    // Fast path: refuse early when the server announces an oversized payload.
+    const declaredLength = Number.parseInt(resp.headers.get('content-length') ?? '', 10);
+    if (Number.isFinite(declaredLength) && declaredLength > maxBytes) {
+        // Release the connection instead of leaving an unread body dangling.
+        await resp.body.cancel().catch(() => undefined);
+        throw new Error('File too large');
+    }
+
+    const buff = await resp.arrayBuffer();
+    // Content-Length may be missing (chunked transfer) or wrong, so check the real size as well.
+    if (buff.byteLength > maxBytes) {
+        throw new Error('File too large');
+    }
+
+    return { buff, contentType: resp.headers.get('content-type') };
+};
+
+/**
+ * Resolves an image source to decoded image data.
+ *
+ * Accepted inputs: a Buffer with encoded image bytes, a `data:` URI (base64 or
+ * percent-encoded), or an http(s) URL. URLs ending in `.svg` are rejected.
+ *
+ * @returns The decoded image (tagged with a `contentType` property), the raw bytes and the
+ *          content type (undefined for Buffer input).
+ * @throws Error 'Invalid input' when the source is none of the accepted forms,
+ *         'Unsupported image type' for `.svg` URLs, plus download/decoding errors.
+ */
+const _loadImage = async (input: string | Buffer) => {
+  let buff: Buffer | undefined;
+  let contentType: string | null | undefined;
+
+  if (Buffer.isBuffer(input)) {
+      // The signature promises Buffer support: use the bytes as they are.
+      buff = input;
+  } else if (input.startsWith('data:')) {
+      const firstComma = input.indexOf(',');
+      if (firstComma === -1) {
+          // A data URI without a payload separator cannot be decoded.
+          throw new Error('Invalid input');
+      }
+      const header = input.slice(0, firstComma);
+      const data = input.slice(firstComma + 1);
+      const [mediaType, encoding] = header.split(';');
+      contentType = mediaType.split(':')[1];
+      buff = encoding?.startsWith('base64')
+          ? Buffer.from(data, 'base64')
+          : Buffer.from(decodeURIComponent(data), 'utf-8');
+  } else if (input.startsWith('http')) {
+      if (input.endsWith('.svg')) {
+          throw new Error('Unsupported image type');
+      }
+      const downloaded = await downloadFile(input);
+      buff = Buffer.from(downloaded.buff);
+      contentType = downloaded.contentType;
+  }
+
+  if (!buff) {
+      throw new Error('Invalid input');
+  }
+
+  const img = await canvas.loadImage(buff);
+  // Attach the content type to the image object itself so it travels with it.
+  Reflect.set(img, 'contentType', contentType);
+
+  return {
+    img,
+    buff,
+    contentType,
+  };
+}
+
+/**
+ * Loads an image from a URL, data URI or Buffer (see `_loadImage`) and turns
+ * "unsupported format" failures into a uniform error.
+ *
+ * @throws Error `Unknown image format for …` when the source format is not supported;
+ *         any other failure is rethrown unchanged.
+ */
+export const loadImage = async (uri: string | Buffer) => {
+    try {
+        return await _loadImage(uri);
+    } catch (err: any) {
+        if (err?.message?.includes('Unsupported image type') || err?.message?.includes('unsupported')) {
+            // Never interpolate raw image bytes into the message: describe Buffers by size only.
+            const label = typeof uri === 'string' ? uri.slice(0, 128) : `<Buffer ${uri.length} bytes>`;
+            throw new Error(`Unknown image format for ${label}`);
+        }
+        throw err;
+    }
+}
+
+/**
+ * Draws `image` onto a new canvas that fits inside a `size` x `size` box.
+ * Images already within the box are copied at their own dimensions; larger ones are
+ * scaled down with their aspect ratio preserved (the longer side becomes `size`).
+ */
+export const fitImageToSquareBox = (image: canvas.Image | canvas.Canvas, size: number = 1024) => {
+    const exceedsBox = image.width > size || image.height > size;
+
+    if (!exceedsBox) {
+        const copy = canvas.createCanvas(image.width, image.height);
+        copy.getContext('2d').drawImage(image, 0, 0, image.width, image.height, 0, 0, copy.width, copy.height);
+
+        return copy;
+    }
+
+    const ratio = image.width / image.height;
+    const isLandscape = ratio > 1;
+    const targetWidth = Math.round(isLandscape ? size : size * ratio);
+    const targetHeight = Math.round(isLandscape ? size / ratio : size);
+
+    const scaled = canvas.createCanvas(targetWidth, targetHeight);
+    scaled.getContext('2d').drawImage(image, 0, 0, image.width, image.height, 0, 0, targetWidth, targetHeight);
+
+    return scaled;
+}
+
+
+/** Encodes the canvas as a data URL (PNG unless `mimeType` says otherwise). */
+export const canvasToDataUrl = (canvas: canvas.Canvas, mimeType?: 'image/png' | 'image/jpeg') => {
+    const format = mimeType || 'image/png';
+
+    return canvas.toDataURLAsync(format as 'image/png');
+}
+
+/** Encodes the canvas into an image buffer (PNG unless `mimeType` says otherwise). */
+export const canvasToBuffer = (canvas: canvas.Canvas, mimeType?: 'image/png' | 'image/jpeg') => {
+    const format = mimeType || 'image/png';
+
+    return canvas.toBuffer(format as 'image/png');
+}
+
+/**
+ * Turns an image URL into an ImageObject: loads it, skips images smaller than 256px on
+ * either side, embeds a copy fitted into a 512px box with `jina-clip-v2`, and stores the
+ * original bytes in cloud storage.
+ *
+ * This is best-effort: any failure is logged and yields `undefined`, so one broken image
+ * never aborts the caller.
+ *
+ * @param url     Image location (http(s) URL or data URI).
+ * @param tracker Token tracker passed on to the embedding call.
+ * @returns The image with its embedding, or `undefined` when it was skipped or failed.
+ */
+export const processImage = async (url: string, tracker: TokenTracker): Promise<ImageObject | undefined> => {
+  try {
+    const { img, buff, contentType } = await loadImage(url);
+    if (!img) {
+      return;
+    }
+
+    // Check if the image is smaller than 256x256
+    if (img.width < 256 || img.height < 256) {
+      return;
+    }
+
+    const resized = fitImageToSquareBox(img, 512);
+    // Only the base64 payload of the data URL is sent to the embedding call.
+    const base64Data = (await canvasToDataUrl(resized)).split(',')[1];
+
+    const {embeddings} = await getEmbeddings([{ image: base64Data }], tracker, {
+      dimensions: 1024,
+      model: 'jina-clip-v2',
+    });
+
+    // Upload only after the embedding step succeeded, so a failure there leaves no orphaned file.
+    const newUrl = await saveImageToFirebase(buff, contentType);
+
+    return {
+      url: newUrl ?? url,
+      embedding: embeddings,
+    };
+
+  } catch (error) {
+    // Still best-effort, but leave a trace; the URL is truncated because it may be a long data URI.
+    console.warn(`[processImage] Skipping image ${url.slice(0, 128)}:`, error);
+    return;
+  }
+}
+
+/**
+ * Filters `newImages` down to those that are not near-duplicates of an image in
+ * `existingImages` or of a new image accepted earlier in the same call.
+ * Two images count as duplicates when the cosine similarity of their first embedding
+ * vectors is at least `similarityThreshold`.
+ * If the comparison throws, the error is logged and `newImages` is returned unfiltered.
+ */
+export const dedupImagesWithEmbeddings = (
+  newImages: ImageObject[], // New images with embeddings
+  existingImages: ImageObject[], // Existing images with embeddings
+  similarityThreshold: number = 0.86, // Default similarity threshold
+): ImageObject[]  =>{
+  try {
+    // Nothing to compare a lone newcomer against.
+    if (existingImages.length === 0 && newImages.length === 1) {
+      return newImages;
+    }
+
+    // True when `candidate` is at least `similarityThreshold`-similar to some image in `pool`.
+    const resembles = (candidate: ImageObject, pool: ImageObject[]): boolean =>
+      pool.some(other =>
+        cosineSimilarity(candidate.embedding[0], other.embedding[0]) >= similarityThreshold
+      );
+
+    const accepted: ImageObject[] = [];
+    for (const candidate of newImages) {
+      // Known images are checked first, then the newcomers accepted so far.
+      if (resembles(candidate, existingImages) || resembles(candidate, accepted)) {
+        continue;
+      }
+      accepted.push(candidate);
+    }
+
+    return accepted;
+  } catch (error) {
+    console.error('Error in image deduplication analysis:', error);
+
+    // Fall back to the unfiltered input when the comparison fails.
+    return newImages;
+  }
+}
+
+/**
+ * Uploads image bytes to the project's default bucket (`<GCLOUD_PROJECT>.appspot.com`)
+ * as a public object under `readImages/`.
+ *
+ * @param buffer   Encoded image bytes.
+ * @param mimeType Content type of the image; defaults to `image/png` when missing.
+ * @returns The public URL of the stored object, or `undefined` when `GCLOUD_PROJECT` is
+ *          not set, the content type is not an image type, or the upload fails.
+ */
+export const saveImageToFirebase = async (
+  buffer: Buffer,
+  mimeType?: string | null,
+): Promise<string | undefined> => {
+  if (!process.env.GCLOUD_PROJECT) {
+    console.error('GCLOUD_PROJECT environment variable is not set');
+    return;
+  }
+
+  // Content-Type headers may carry parameters ("image/jpeg; charset=binary"): keep the media type only.
+  const finalMimeType = (mimeType || 'image/png').split(';')[0].trim().toLowerCase();
+  if (!finalMimeType.startsWith('image/')) {
+    return;
+  }
+
+  // "image/svg+xml" -> "svg": drop the structured-syntax suffix and fall back to "png"
+  // for anything that would not make a clean file extension.
+  const subtype = finalMimeType.slice('image/'.length).split('+')[0];
+  const extension = /^[a-z0-9.-]+$/.test(subtype) ? subtype : 'png';
+
+  try {
+    // Built inside the try so a client/credential error is logged like any upload error.
+    const firebaseDefaultBucket = new Storage().bucket(`${process.env.GCLOUD_PROJECT}.appspot.com`);
+    const fileName = `readImages/${randomUUID()}.${extension}`;
+    const file = firebaseDefaultBucket.file(fileName);
+
+    await file.save(buffer, {
+      contentType: finalMimeType,
+      public: true,
+    });
+
+    return file.publicUrl();
+  } catch (error) {
+    console.error('Error saving image to Firebase Storage:', error);
+    return;
+  }
+};
--- a/src/utils/url-tools.ts
+++ b/src/utils/url-tools.ts
@ -1,12 +1,13 @@
-import { BoostedSearchSnippet, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent } from "../types";
-import { getI18nText, smartMergeStrings } from "./text-tools";
-import { rerankDocuments } from "../tools/jina-rerank";
-import { readUrl } from "../tools/read";
-import { Schemas } from "./schemas";
-import { cherryPick } from "../tools/jina-latechunk";
-import { formatDateBasedOnType } from "./date-tools";
-import { classifyText } from "../tools/jina-classify-spam";
-import { segmentText } from "../tools/segment";
+import {BoostedSearchSnippet, ImageObject, KnowledgeItem, SearchSnippet, TrackerContext, VisitAction, WebContent} from "../types";
+import {getI18nText, smartMergeStrings} from "./text-tools";
+import {rerankDocuments} from "../tools/jina-rerank";
+import {readUrl} from "../tools/read";
+import {Schemas} from "./schemas";
+import {cherryPick} from "../tools/jina-latechunk";
+import {formatDateBasedOnType} from "./date-tools";
+import {classifyText} from "../tools/jina-classify-spam";
+import { processImage } from "./image-tools";
+import {segmentText} from "../tools/segment";
 import axiosClient from "./axios-client";

 export function normalizeUrl(urlString: string, debug = false, options = {
@ -460,9 +461,11 @@ export async function processURLs(
  allURLs: Record<string, SearchSnippet>,
  visitedURLs: string[],
  badURLs: string[],
+  imageObjects: ImageObject[],
  schemaGen: Schemas,
  question: string,
-  webContents: Record<string, WebContent>
+  webContents: Record<string, WebContent>,
+  withImages: boolean = false,
 ): Promise<{ urlResults: any[], success: boolean }> {
  // Skip if no URLs to process
  if (urls.length === 0) {
@ -491,8 +494,8 @@ export async function processURLs(
        // Store normalized URL for consistent reference
        url = normalizedUrl;

-        const { response } = await readUrl(url, true, context.tokenTracker);
-        const { data } = response;
+        const {response} = await readUrl(url, true, context.tokenTracker, withImages);
+        const {data} = response;
        const guessedTime = await getLastModified(url);
        if (guessedTime) {
          console.log('Guessed time for', url, guessedTime);
@ -554,7 +557,18 @@ export async function processURLs(
          }
        });

-        return { url, result: response };
+        // Process images
+        if (withImages && data.images) {
+          const imageEntries = Object.entries(data.images || {});
+          imageEntries.forEach(async ([alt, url]) => {
+            const imageObject = await processImage(url, context.tokenTracker);
+            if (imageObject && !imageObjects.find(i => i.url === imageObject.url)) {
+              imageObjects.push(imageObject);
+            }
+          });
+        }
+
+        return {url, result: response};
      } catch (error: any) {
        console.error('Error reading URL:', url, error);
        badURLs.push(url);