
Commit 54b8cfc

claude[bot] and MrOrz committed
Fix Gemini URL scraper based on review feedback
- Use correct urlContext tool configuration format
- Process multiple URLs in single LLM call for efficiency
- Switch to gemini-2.5-flash model
- Add canonical URL extraction capability
- Implement client reuse pattern for performance
- Add safety check for empty URL list in experiment script

Co-authored-by: Johnson Liang <[email protected]>
1 parent 4e9ac86 commit 54b8cfc

2 files changed: +118 -105 lines changed

src/scripts/experimentUrlScraper.ts

Lines changed: 5 additions & 1 deletion
@@ -41,12 +41,16 @@ async function main({
   if (single) {
     testUrls = [single];
   } else if (urls) {
-    testUrls = urls.split(',').map(url => url.trim());
+    testUrls = urls.split(',').map(url => url.trim()).filter(Boolean);
   } else {
     testUrls = DEFAULT_TEST_URLS;
     console.info('No URLs specified, using default test URLs:', testUrls);
   }
 
+  if (testUrls.length === 0) {
+    console.info('No valid URLs to process. Exiting.');
+    return;
+  }
   console.info(`Testing URL scraping with ${testUrls.length} URLs`);
 
   const trace = langfuse.trace({
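
The .filter(Boolean) addition is what makes the new empty-list guard reachable: stray commas in the comma-separated urls argument now yield an empty array rather than empty-string entries. A quick illustration of the edge case it closes (hypothetical input, not from this commit):

// Hypothetical input with a blank entry and a trailing comma:
const urls = 'https://a.example, ,https://b.example,';
const testUrls = urls.split(',').map(url => url.trim()).filter(Boolean);
// without .filter(Boolean): ['https://a.example', '', 'https://b.example', '']
// with it:                  ['https://a.example', 'https://b.example']
// An all-empty input such as ',' now yields [], which the new
// testUrls.length === 0 check turns into a clean early return.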

src/util/geminiUrlScraper.js

Lines changed: 113 additions & 104 deletions
@@ -2,6 +2,21 @@ import { GoogleGenAI } from '@google/genai';
 import { GoogleAuth } from 'google-auth-library';
 import rollbar from '../rollbarInstance';
 
+// Singleton client for reuse across requests
+let genAI;
+
+async function getGenAIClient() {
+  if (genAI) return genAI;
+
+  const project = await new GoogleAuth().getProjectId();
+  genAI = new GoogleGenAI({
+    vertexai: true,
+    project,
+    location: 'us-central1', // Using us-central1 for better availability
+  });
+  return genAI;
+}
+
 /**
  * Scrapes URL content using Gemini's urlContext tool
  *
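
Because the client is now cached at module scope, the GoogleAuth project lookup runs once per process and every later call reuses the same authenticated instance. A minimal sketch of the reuse semantics, assuming nothing beyond the accessor above:

// Both awaits resolve to the same GoogleGenAI instance; the
// GoogleAuth().getProjectId() lookup happens only on the first call.
const a = await getGenAIClient();
const b = await getGenAIClient();
console.assert(a === b);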
@@ -13,128 +28,122 @@ export default async function scrapeUrlsWithGemini(urls) {
     return [];
   }
 
-  const project = await new GoogleAuth().getProjectId();
-  const genAI = new GoogleGenAI({
-    vertexai: true,
-    project,
-    location: 'us-central1', // Using us-central1 for better availability
-  });
+  const genAIClient = await getGenAIClient();
 
-  const results = [];
+  try {
+    // Process all URLs in a single LLM call for efficiency
+    const urlList = urls.map((url, index) => `${index + 1}. ${url}`).join('\n');
+
+    const generateContentArgs = {
+      model: 'gemini-2.5-flash',
+      contents: [
+        {
+          role: 'user',
+          parts: [
+            {
+              text: `Please analyze the content at these URLs and extract information from each:
 
-  // Process URLs one by one to avoid overwhelming the API
-  for (const url of urls) {
-    try {
-      const generateContentArgs = {
-        model: 'gemini-2.0-flash-001',
-        contents: [
-          {
-            role: 'user',
-            parts: [
-              {
-                text: `Please analyze the content at this URL: ${url}
+${urlList}
 
-Extract and return ONLY a JSON object with the following structure (no markdown formatting, no extra text):
-{
-  "title": "The main title of the page",
-  "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes",
-  "topImageUrl": "URL of the most representative image on the page, or null if none exists"
-}
+For each URL, extract and return a JSON array with objects having the following structure (no markdown formatting, no extra text):
+[
+  {
+    "url": "original URL from the list",
+    "canonical": "canonical URL if different from original, or same as original",
+    "title": "The main title of the page",
+    "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes",
+    "topImageUrl": "URL of the most representative image on the page, or null if none exists"
+  }
+]
 
 Requirements:
+- url: Return the exact original URL from the input list
+- canonical: Extract the canonical URL from meta tags or use the original URL if no canonical is found
 - title: Extract the main page title
 - summary: Should be detailed enough for search and fact-checking, capturing all important claims and information
 - topImageUrl: Find the most representative image (not logos, ads, or decorative images), return null if no suitable image exists
-- Return valid JSON only, no markdown code blocks or explanations`,
-              },
-            ],
-          },
-        ],
-        tools: [
-          {
-            functionDeclarations: [
-              {
-                name: 'url_context',
-                description: 'Retrieves content from the specified URL',
-                parameters: {
-                  type: 'object',
-                  properties: {
-                    url: {
-                      type: 'string',
-                      description: 'The URL to retrieve content from',
-                    },
-                  },
-                  required: ['url'],
-                },
-              },
-            ],
-          },
-        ],
-        config: {
-          systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.',
-          responseModalities: ['TEXT'],
-          temperature: 0.1, // Low temperature for consistent extraction
-          maxOutputTokens: 2048,
+- Return valid JSON array only, no markdown code blocks or explanations
+- Process all URLs and return results for each, even if some fail`,
+            },
+          ],
         },
-      };
+      ],
+      config: {
+        tools: [{ urlContext: {} }],
+        systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.',
+        responseModalities: ['TEXT'],
+        temperature: 0.1, // Low temperature for consistent extraction
+        maxOutputTokens: 4096,
+      },
+    };
 
-      const response = await genAI.models.generateContent(generateContentArgs);
-
-      if (!response.candidates || !response.candidates[0]) {
-        throw new Error('No response candidates received');
-      }
+    const response = await genAIClient.models.generateContent(generateContentArgs);
+
+    if (!response.candidates || !response.candidates[0]) {
+      throw new Error('No response candidates received');
+    }
 
-      const responseText = response.candidates[0].content.parts[0].text;
-
-      // Parse the JSON response
-      let extractedData;
-      try {
-        // Clean the response text to extract JSON
-        const jsonMatch = responseText.match(/\{[\s\S]*\}/);
-        if (jsonMatch) {
-          extractedData = JSON.parse(jsonMatch[0]);
-        } else {
-          extractedData = JSON.parse(responseText);
-        }
-      } catch (parseError) {
-        console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText);
-        extractedData = {
-          title: null,
-          summary: responseText.trim() || 'Unable to extract structured content',
-          topImageUrl: null,
-        };
+    const responseText = response.candidates[0].content.parts[0].text;
+
+    // Parse the JSON response
+    let extractedDataArray;
+    try {
+      // Clean the response text to extract JSON array
+      const jsonMatch = responseText.match(/\[[\s\S]*\]/);
+      if (jsonMatch) {
+        extractedDataArray = JSON.parse(jsonMatch[0]);
+      } else {
+        extractedDataArray = JSON.parse(responseText);
       }
-
-      results.push({
-        url,
-        canonical: url, // Use original URL as canonical for now
-        title: extractedData.title || null,
-        summary: extractedData.summary || null,
-        topImageUrl: extractedData.topImageUrl || null,
-        html: '', // Leave empty as requested
-        status: 'SUCCESS',
-      });
-
-    } catch (error) {
-      console.error('[geminiUrlScraper] Error processing URL:', url, error);
-
-      rollbar.error('Gemini URL scraping error', {
-        url,
-        error: error.message,
-      });
-
-      results.push({
+    } catch (parseError) {
+      console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText);
+      // Fallback: create error results for all URLs
+      return urls.map(url => ({
        url,
        canonical: url,
        title: null,
-        summary: null,
+        summary: 'Unable to extract structured content',
        topImageUrl: null,
        html: '',
        status: 'ERROR',
-        error: error.message,
-      });
+        error: 'Failed to parse LLM response',
+      }));
     }
-  }
 
-  return results;
+    // Ensure we have results for all input URLs
+    const results = urls.map(url => {
+      const extracted = extractedDataArray.find(item => item.url === url) || {};
+      return {
+        url,
+        canonical: extracted.canonical || url,
+        title: extracted.title || null,
+        summary: extracted.summary || null,
+        topImageUrl: extracted.topImageUrl || null,
+        html: '', // Leave empty as requested
+        status: 'SUCCESS',
+      };
+    });
+
+    return results;
+
+  } catch (error) {
+    console.error('[geminiUrlScraper] Error processing URLs:', error);
+
+    rollbar.error('Gemini URL scraping error', {
+      urls,
+      error: error.message,
+    });
+
+    // Return error results for all URLs
+    return urls.map(url => ({
+      url,
+      canonical: url,
+      title: null,
+      summary: null,
+      topImageUrl: null,
+      html: '',
+      status: 'ERROR',
+      error: error.message,
+    }));
+  }
 }
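
Per the commit message, the urlContext tool is now enabled through the SDK's built-in tool config (tools: [{ urlContext: {} }] under config) instead of a hand-rolled functionDeclarations entry. A minimal usage sketch of the reworked export, assuming only the result shape established in the diff; the URLs shown are placeholders:

import scrapeUrlsWithGemini from '../util/geminiUrlScraper';

// Inside an async context: one Gemini call covers the whole batch, and the
// return value has exactly one entry per input URL, status 'SUCCESS' or 'ERROR'.
const results = await scrapeUrlsWithGemini([
  'https://example.com/article-1',
  'https://example.com/article-2',
]);

for (const { url, canonical, title, summary, status, error } of results) {
  if (status === 'ERROR') {
    console.warn(`Scrape failed for ${url}:`, error);
    continue;
  }
  console.log(canonical, title, summary);
}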
