
Commit 54b8cfc

claude[bot] and MrOrz committed
Fix Gemini URL scraper based on review feedback
- Use correct urlContext tool configuration format
- Process multiple URLs in single LLM call for efficiency
- Switch to gemini-2.5-flash model
- Add canonical URL extraction capability
- Implement client reuse pattern for performance
- Add safety check for empty URL list in experiment script

Co-authored-by: Johnson Liang <[email protected]>
1 parent 4e9ac86 commit 54b8cfc

2 files changed: +118 -105 lines changed

src/scripts/experimentUrlScraper.ts

Lines changed: 5 additions & 1 deletion
@@ -41,12 +41,16 @@ async function main({
   if (single) {
     testUrls = [single];
   } else if (urls) {
-    testUrls = urls.split(',').map(url => url.trim());
+    testUrls = urls.split(',').map(url => url.trim()).filter(Boolean);
   } else {
     testUrls = DEFAULT_TEST_URLS;
     console.info('No URLs specified, using default test URLs:', testUrls);
   }
 
+  if (testUrls.length === 0) {
+    console.info('No valid URLs to process. Exiting.');
+    return;
+  }
   console.info(`Testing URL scraping with ${testUrls.length} URLs`);
 
   const trace = langfuse.trace({
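
The .filter(Boolean) addition is what makes the new empty-list guard reachable: stray commas in the comma-separated urls argument now yield an empty array rather than empty-string entries. A quick illustration of the edge case it closes (hypothetical input, not from this commit):

// Hypothetical input with a blank entry and a trailing comma:
const urls = 'https://a.example, ,https://b.example,';
const testUrls = urls.split(',').map(url => url.trim()).filter(Boolean);
// without .filter(Boolean): ['https://a.example', '', 'https://b.example', '']
// with it:                  ['https://a.example', 'https://b.example']
// An all-empty input such as ',' now yields [], which the new
// testUrls.length === 0 check turns into a clean early return.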

src/util/geminiUrlScraper.js

Lines changed: 113 additions & 104 deletions
@@ -2,6 +2,21 @@ import { GoogleGenAI } from '@google/genai';
 import { GoogleAuth } from 'google-auth-library';
 import rollbar from '../rollbarInstance';
 
+// Singleton client for reuse across requests
+let genAI;
+
+async function getGenAIClient() {
+  if (genAI) return genAI;
+
+  const project = await new GoogleAuth().getProjectId();
+  genAI = new GoogleGenAI({
+    vertexai: true,
+    project,
+    location: 'us-central1', // Using us-central1 for better availability
+  });
+  return genAI;
+}
+
 /**
  * Scrapes URL content using Gemini's urlContext tool
  *
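
Because the client is now cached at module scope, the GoogleAuth project lookup runs once per process and every later call reuses the same authenticated instance. A minimal sketch of the reuse semantics, assuming nothing beyond the accessor above:

// Both awaits resolve to the same GoogleGenAI instance; the
// GoogleAuth().getProjectId() lookup happens only on the first call.
const a = await getGenAIClient();
const b = await getGenAIClient();
console.assert(a === b);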
@@ -13,128 +28,122 @@ export default async function scrapeUrlsWithGemini(urls) {
     return [];
   }
 
-  const project = await new GoogleAuth().getProjectId();
-  const genAI = new GoogleGenAI({
-    vertexai: true,
-    project,
-    location: 'us-central1', // Using us-central1 for better availability
-  });
+  const genAIClient = await getGenAIClient();
 
-  const results = [];
+  try {
+    // Process all URLs in a single LLM call for efficiency
+    const urlList = urls.map((url, index) => `${index + 1}. ${url}`).join('\n');
+
+    const generateContentArgs = {
+      model: 'gemini-2.5-flash',
+      contents: [
+        {
+          role: 'user',
+          parts: [
+            {
+              text: `Please analyze the content at these URLs and extract information from each:
 
-  // Process URLs one by one to avoid overwhelming the API
-  for (const url of urls) {
-    try {
-      const generateContentArgs = {
-        model: 'gemini-2.0-flash-001',
-        contents: [
-          {
-            role: 'user',
-            parts: [
-              {
-                text: `Please analyze the content at this URL: ${url}
+${urlList}
 
-Extract and return ONLY a JSON object with the following structure (no markdown formatting, no extra text):
-{
-  "title": "The main title of the page",
-  "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes",
-  "topImageUrl": "URL of the most representative image on the page, or null if none exists"
-}
+For each URL, extract and return a JSON array with objects having the following structure (no markdown formatting, no extra text):
+[
+  {
+    "url": "original URL from the list",
+    "canonical": "canonical URL if different from original, or same as original",
+    "title": "The main title of the page",
+    "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes",
+    "topImageUrl": "URL of the most representative image on the page, or null if none exists"
+  }
+]
 
 Requirements:
+- url: Return the exact original URL from the input list
+- canonical: Extract the canonical URL from meta tags or use the original URL if no canonical is found
 - title: Extract the main page title
 - summary: Should be detailed enough for search and fact-checking, capturing all important claims and information
 - topImageUrl: Find the most representative image (not logos, ads, or decorative images), return null if no suitable image exists
-- Return valid JSON only, no markdown code blocks or explanations`,
-              },
-            ],
-          },
-        ],
-        tools: [
-          {
-            functionDeclarations: [
-              {
-                name: 'url_context',
-                description: 'Retrieves content from the specified URL',
-                parameters: {
-                  type: 'object',
-                  properties: {
-                    url: {
-                      type: 'string',
-                      description: 'The URL to retrieve content from',
-                    },
-                  },
-                  required: ['url'],
-                },
-              },
-            ],
-          },
-        ],
-        config: {
-          systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.',
-          responseModalities: ['TEXT'],
-          temperature: 0.1, // Low temperature for consistent extraction
-          maxOutputTokens: 2048,
+- Return valid JSON array only, no markdown code blocks or explanations
+- Process all URLs and return results for each, even if some fail`,
+            },
+          ],
         },
-      };
+      ],
+      config: {
+        tools: [{ urlContext: {} }],
+        systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.',
+        responseModalities: ['TEXT'],
+        temperature: 0.1, // Low temperature for consistent extraction
+        maxOutputTokens: 4096,
+      },
+    };
 
-      const response = await genAI.models.generateContent(generateContentArgs);
-
-      if (!response.candidates || !response.candidates[0]) {
-        throw new Error('No response candidates received');
-      }
+    const response = await genAIClient.models.generateContent(generateContentArgs);
+
+    if (!response.candidates || !response.candidates[0]) {
+      throw new Error('No response candidates received');
+    }
 
-      const responseText = response.candidates[0].content.parts[0].text;
-
-      // Parse the JSON response
-      let extractedData;
-      try {
-        // Clean the response text to extract JSON
-        const jsonMatch = responseText.match(/\{[\s\S]*\}/);
-        if (jsonMatch) {
-          extractedData = JSON.parse(jsonMatch[0]);
-        } else {
-          extractedData = JSON.parse(responseText);
-        }
-      } catch (parseError) {
-        console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText);
-        extractedData = {
-          title: null,
-          summary: responseText.trim() || 'Unable to extract structured content',
-          topImageUrl: null,
-        };
+    const responseText = response.candidates[0].content.parts[0].text;
+
+    // Parse the JSON response
+    let extractedDataArray;
+    try {
+      // Clean the response text to extract JSON array
+      const jsonMatch = responseText.match(/\[[\s\S]*\]/);
+      if (jsonMatch) {
+        extractedDataArray = JSON.parse(jsonMatch[0]);
+      } else {
+        extractedDataArray = JSON.parse(responseText);
       }
-
-      results.push({
-        url,
-        canonical: url, // Use original URL as canonical for now
-        title: extractedData.title || null,
-        summary: extractedData.summary || null,
-        topImageUrl: extractedData.topImageUrl || null,
-        html: '', // Leave empty as requested
-        status: 'SUCCESS',
-      });
-
-    } catch (error) {
-      console.error('[geminiUrlScraper] Error processing URL:', url, error);
-
-      rollbar.error('Gemini URL scraping error', {
-        url,
-        error: error.message,
-      });
-
-      results.push({
+    } catch (parseError) {
+      console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText);
+      // Fallback: create error results for all URLs
+      return urls.map(url => ({
        url,
        canonical: url,
        title: null,
-        summary: null,
+        summary: 'Unable to extract structured content',
        topImageUrl: null,
        html: '',
        status: 'ERROR',
-        error: error.message,
-      });
+        error: 'Failed to parse LLM response',
+      }));
     }
-  }
 
-  return results;
+    // Ensure we have results for all input URLs
+    const results = urls.map(url => {
+      const extracted = extractedDataArray.find(item => item.url === url) || {};
+      return {
+        url,
+        canonical: extracted.canonical || url,
+        title: extracted.title || null,
+        summary: extracted.summary || null,
+        topImageUrl: extracted.topImageUrl || null,
+        html: '', // Leave empty as requested
+        status: 'SUCCESS',
+      };
+    });
+
+    return results;
+
+  } catch (error) {
+    console.error('[geminiUrlScraper] Error processing URLs:', error);
+
+    rollbar.error('Gemini URL scraping error', {
+      urls,
+      error: error.message,
+    });
+
+    // Return error results for all URLs
+    return urls.map(url => ({
+      url,
+      canonical: url,
+      title: null,
+      summary: null,
+      topImageUrl: null,
+      html: '',
+      status: 'ERROR',
+      error: error.message,
+    }));
+  }
 }
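
Per the commit message, the urlContext tool is now enabled through the SDK's built-in tool config (tools: [{ urlContext: {} }] under config) instead of a hand-rolled functionDeclarations entry. A minimal usage sketch of the reworked export, assuming only the result shape established in the diff; the URLs shown are placeholders:

import scrapeUrlsWithGemini from '../util/geminiUrlScraper';

// Inside an async context: one Gemini call covers the whole batch, and the
// return value has exactly one entry per input URL, status 'SUCCESS' or 'ERROR'.
const results = await scrapeUrlsWithGemini([
  'https://example.com/article-1',
  'https://example.com/article-2',
]);

for (const { url, canonical, title, summary, status, error } of results) {
  if (status === 'ERROR') {
    console.warn(`Scrape failed for ${url}:`, error);
    continue;
  }
  console.log(canonical, title, summary);
}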
