@@ -2,6 +2,21 @@ import { GoogleGenAI } from '@google/genai';
 import { GoogleAuth } from 'google-auth-library';
 import rollbar from '../rollbarInstance';
 
+// Singleton client for reuse across requests
+let genAI;
+
+async function getGenAIClient() {
+  if (genAI) return genAI;
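+  // Note: two concurrent first calls can both pass the guard above and build
+  // separate clients; the last assignment wins, assumed harmless here.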
+
+  const project = await new GoogleAuth().getProjectId();
+  genAI = new GoogleGenAI({
+    vertexai: true,
+    project,
+    location: 'us-central1', // Using us-central1 for better availability
+  });
+  return genAI;
+}
+
 /**
  * Scrapes URL content using Gemini's urlContext tool
  *
@@ -13,128 +28,122 @@ export default async function scrapeUrlsWithGemini(urls) {
     return [];
   }
 
-  const project = await new GoogleAuth().getProjectId();
-  const genAI = new GoogleGenAI({
-    vertexai: true,
-    project,
-    location: 'us-central1', // Using us-central1 for better availability
-  });
+  const genAIClient = await getGenAIClient();
 
-  const results = [];
+  try {
+    // Process all URLs in a single LLM call for efficiency
+    const urlList = urls.map((url, index) => `${index + 1}. ${url}`).join('\n');
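+    // e.g. "1. https://example.com/a\n2. https://example.com/b"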
+
+    const generateContentArgs = {
+      model: 'gemini-2.5-flash',
+      contents: [
+        {
+          role: 'user',
+          parts: [
+            {
+              text: `Please analyze the content at these URLs and extract information from each:
 
-  // Process URLs one by one to avoid overwhelming the API
-  for (const url of urls) {
-    try {
-      const generateContentArgs = {
-        model: 'gemini-2.0-flash-001',
-        contents: [
-          {
-            role: 'user',
-            parts: [
-              {
-                text: `Please analyze the content at this URL: ${url}
+${urlList}
 
-Extract and return ONLY a JSON object with the following structure (no markdown formatting, no extra text):
-{
-  "title": "The main title of the page",
-  "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes",
-  "topImageUrl": "URL of the most representative image on the page, or null if none exists"
-}
+For each URL, extract and return a JSON array with objects having the following structure (no markdown formatting, no extra text):
+[
+  {
+    "url": "original URL from the list",
+    "canonical": "canonical URL if different from original, or same as original",
+    "title": "The main title of the page",
+    "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes",
+    "topImageUrl": "URL of the most representative image on the page, or null if none exists"
+  }
+]
 
 Requirements:
+- url: Return the exact original URL from the input list
+- canonical: Extract the canonical URL from meta tags or use the original URL if no canonical is found
 - title: Extract the main page title
 - summary: Should be detailed enough for search and fact-checking, capturing all important claims and information
 - topImageUrl: Find the most representative image (not logos, ads, or decorative images), return null if no suitable image exists
-- Return valid JSON only, no markdown code blocks or explanations`,
-              },
-            ],
-          },
-        ],
-        tools: [
-          {
-            functionDeclarations: [
-              {
-                name: 'url_context',
-                description: 'Retrieves content from the specified URL',
-                parameters: {
-                  type: 'object',
-                  properties: {
-                    url: {
-                      type: 'string',
-                      description: 'The URL to retrieve content from',
-                    },
-                  },
-                  required: ['url'],
-                },
-              },
-            ],
-          },
-        ],
-        config: {
-          systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.',
-          responseModalities: ['TEXT'],
-          temperature: 0.1, // Low temperature for consistent extraction
-          maxOutputTokens: 2048,
+- Return valid JSON array only, no markdown code blocks or explanations
+- Process all URLs and return results for each, even if some fail`,
+            },
+          ],
         },
-      };
+      ],
+      config: {
+        tools: [{ urlContext: {} }],
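+        // urlContext is Gemini's built-in URL-fetching tool; it replaces the
+        // hand-rolled url_context function declaration deleted above.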
+        systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.',
+        responseModalities: ['TEXT'],
+        temperature: 0.1, // Low temperature for consistent extraction
+        maxOutputTokens: 4096,
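+        // Double the previous 2048 budget, since one response now covers the whole batch.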
+      },
+    };
 
-      const response = await genAI.models.generateContent(generateContentArgs);
-
-      if (!response.candidates || !response.candidates[0]) {
-        throw new Error('No response candidates received');
-      }
+    const response = await genAIClient.models.generateContent(generateContentArgs);
+
+    if (!response.candidates || !response.candidates[0]) {
+      throw new Error('No response candidates received');
+    }
 
-      const responseText = response.candidates[0].content.parts[0].text;
-
-      // Parse the JSON response
-      let extractedData;
-      try {
-        // Clean the response text to extract JSON
-        const jsonMatch = responseText.match(/\{[\s\S]*\}/);
-        if (jsonMatch) {
-          extractedData = JSON.parse(jsonMatch[0]);
-        } else {
-          extractedData = JSON.parse(responseText);
-        }
-      } catch (parseError) {
-        console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText);
-        extractedData = {
-          title: null,
-          summary: responseText.trim() || 'Unable to extract structured content',
-          topImageUrl: null,
-        };
+    const responseText = response.candidates[0].content.parts[0].text;
+
+    // Parse the JSON response
+    let extractedDataArray;
+    try {
+      // Clean the response text to extract JSON array
+      const jsonMatch = responseText.match(/\[[\s\S]*\]/);
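+      // Greedy match: spans the first '[' through the last ']', tolerating any
+      // prose the model wraps around the JSON array.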
+      if (jsonMatch) {
+        extractedDataArray = JSON.parse(jsonMatch[0]);
+      } else {
+        extractedDataArray = JSON.parse(responseText);
       }
-
-      results.push({
-        url,
-        canonical: url, // Use original URL as canonical for now
-        title: extractedData.title || null,
-        summary: extractedData.summary || null,
-        topImageUrl: extractedData.topImageUrl || null,
-        html: '', // Leave empty as requested
-        status: 'SUCCESS',
-      });
-
-    } catch (error) {
-      console.error('[geminiUrlScraper] Error processing URL:', url, error);
-
-      rollbar.error('Gemini URL scraping error', {
-        url,
-        error: error.message,
-      });
-
-      results.push({
+    } catch (parseError) {
+      console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText);
+      // Fallback: create error results for all URLs
+      return urls.map(url => ({
         url,
         canonical: url,
         title: null,
-        summary: null,
+        summary: 'Unable to extract structured content',
         topImageUrl: null,
         html: '',
         status: 'ERROR',
-        error: error.message,
-      });
+        error: 'Failed to parse LLM response',
+      }));
     }
-  }
 
-  return results;
+    // Ensure we have results for all input URLs
+    const results = urls.map(url => {
+      const extracted = extractedDataArray.find(item => item.url === url) || {};
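+      // Matching is by exact URL string; if the model rewrote a URL, `extracted`
+      // stays empty and the fields below fall back to null (status still SUCCESS).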
+      return {
+        url,
+        canonical: extracted.canonical || url,
+        title: extracted.title || null,
+        summary: extracted.summary || null,
+        topImageUrl: extracted.topImageUrl || null,
+        html: '', // Leave empty as requested
+        status: 'SUCCESS',
+      };
+    });
+
+    return results;
+
+  } catch (error) {
+    console.error('[geminiUrlScraper] Error processing URLs:', error);
+
+    rollbar.error('Gemini URL scraping error', {
+      urls,
+      error: error.message,
+    });
+
+    // Return error results for all URLs
+    return urls.map(url => ({
+      url,
+      canonical: url,
+      title: null,
+      summary: null,
+      topImageUrl: null,
+      html: '',
+      status: 'ERROR',
+      error: error.message,
+    }));
+  }
 }
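
For reference, a minimal caller sketch; the import path and the surrounding async context are assumptions, not part of this diff:

import scrapeUrlsWithGemini from './geminiUrlScraper'; // hypothetical module path

const results = await scrapeUrlsWithGemini([
  'https://example.com/article-1',
  'https://example.com/article-2',
]);

// Every input URL yields exactly one entry, status 'SUCCESS' or 'ERROR'.
for (const { url, status, title, summary, error } of results) {
  if (status === 'SUCCESS') console.log(url, title, summary);
  else console.warn(url, 'failed:', error);
}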