From 402be33c5d9df3e5a24ff68ffa4d5e9262be76e6 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <209825114+claude[bot]@users.noreply.github.com> Date: Tue, 12 Aug 2025 04:35:36 +0000 Subject: [PATCH 1/9] Replace gRPC URL resolver with Gemini urlContext implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Gemini-based URL scraping using @google/genai urlContext tool - Remove gRPC dependency from scrapUrls.js - Create experimental testing script for manual evaluation - Update tests to mock new Gemini scraper - Maintain backward compatibility with existing API - Leave html field empty as requested, focus on title/summary/topImageUrl 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Johnson Liang --- src/scripts/experimentUrlScraper.ts | 164 +++++++++++++++++++++++++ src/util/__mocks__/geminiUrlScraper.js | 4 + src/util/__tests__/scrapUrls.js | 30 ++--- src/util/geminiUrlScraper.js | 140 +++++++++++++++++++++ src/util/scrapUrls.js | 19 +-- 5 files changed, 328 insertions(+), 29 deletions(-) create mode 100644 src/scripts/experimentUrlScraper.ts create mode 100644 src/util/__mocks__/geminiUrlScraper.js create mode 100644 src/util/geminiUrlScraper.js diff --git a/src/scripts/experimentUrlScraper.ts b/src/scripts/experimentUrlScraper.ts new file mode 100644 index 00000000..508fbcac --- /dev/null +++ b/src/scripts/experimentUrlScraper.ts @@ -0,0 +1,164 @@ +/** + * Script to test URL scraping using Gemini urlContext tool. + * + * Usage: + * npx tsx src/scripts/experimentUrlScraper.ts \ + * --runName "url-scraper-experiment-1" \ + * [--urls "https://example.com,https://another-site.com"] \ + * [--single "https://single-test-url.com"] + * + * Required args: + * --runName: Name to identify this experiment run in Langfuse + * + * Optional args: + * --urls: Comma-separated list of URLs to test + * --single: Single URL to test (alternative to --urls) + */ +import 'dotenv/config'; +import yargs from 'yargs'; + +import scrapeUrlsWithGemini from '../util/geminiUrlScraper.js'; +import langfuse from 'util/langfuse'; + +// Default test URLs for experimentation +const DEFAULT_TEST_URLS = [ + 'https://www.cofacts.tw', + 'https://github.com/cofacts/rumors-api', + 'https://www.taiwannews.com.tw/en/news/5023456', +]; + +async function main({ + urls, + single, + runName, +}: { + urls?: string; + single?: string; + runName: string; +}) { + let testUrls: string[]; + + if (single) { + testUrls = [single]; + } else if (urls) { + testUrls = urls.split(',').map(url => url.trim()); + } else { + testUrls = DEFAULT_TEST_URLS; + console.info('No URLs specified, using default test URLs:', testUrls); + } + + console.info(`Testing URL scraping with ${testUrls.length} URLs`); + + const trace = langfuse.trace({ + name: `URL Scraper Experiment: ${runName}`, + input: testUrls, + metadata: { + experimentType: 'url-scraping', + tool: 'gemini-urlcontext', + urlCount: testUrls.length, + }, + }); + + try { + console.info('Starting URL scraping...'); + const startTime = Date.now(); + + const results = await scrapeUrlsWithGemini(testUrls); + + const endTime = Date.now(); + const duration = endTime - startTime; + + console.info(`\n=== RESULTS ===`); + console.info(`Processed ${results.length} URLs in ${duration}ms`); + console.info(`Average time per URL: ${Math.round(duration / results.length)}ms\n`); + + results.forEach((result, index) => { + console.info(`--- URL ${index + 1}: ${result.url} ---`); + console.info(`Status: ${result.status}`); + + if (result.status === 
'SUCCESS') { + console.info(`Title: ${result.title || 'N/A'}`); + console.info(`Summary: ${result.summary ? result.summary.substring(0, 200) + '...' : 'N/A'}`); + console.info(`Top Image: ${result.topImageUrl || 'N/A'}`); + } else { + console.info(`Error: ${result.error}`); + } + console.info(''); + }); + + // Count success/failure rates + const successCount = results.filter(r => r.status === 'SUCCESS').length; + const errorCount = results.filter(r => r.status === 'ERROR').length; + + console.info('=== SUMMARY ==='); + console.info(`Success rate: ${successCount}/${results.length} (${Math.round(successCount / results.length * 100)}%)`); + console.info(`Error rate: ${errorCount}/${results.length} (${Math.round(errorCount / results.length * 100)}%)`); + console.info(`Total processing time: ${duration}ms`); + + // Record results in Langfuse + trace.update({ + output: results, + metadata: { + successCount, + errorCount, + totalDuration: duration, + averageDurationPerUrl: Math.round(duration / results.length), + }, + }); + + // Score the experiment based on success rate + trace.score({ + name: 'success-rate', + value: successCount / results.length, + comment: `${successCount} successful out of ${results.length} URLs`, + }); + + // Score based on average processing time (lower is better, normalize to 0-1) + const avgTimePerUrl = duration / results.length; + const timeScore = Math.max(0, 1 - (avgTimePerUrl / 10000)); // Penalize if >10s per URL + trace.score({ + name: 'processing-speed', + value: timeScore, + comment: `Average ${Math.round(avgTimePerUrl)}ms per URL`, + }); + + } catch (error) { + console.error('Experiment failed:', error); + trace.update({ + output: { error: error.message }, + }); + trace.score({ + name: 'success-rate', + value: 0, + comment: `Experiment failed: ${error.message}`, + }); + } + + await langfuse.flushAsync(); +} + +/* istanbul ignore if */ +if (require.main === module) { + const argv = yargs + .options({ + runName: { + description: 'Name to identify this experiment run in Langfuse', + type: 'string', + demandOption: true, + }, + urls: { + description: 'Comma-separated list of URLs to test', + type: 'string', + }, + single: { + description: 'Single URL to test (alternative to --urls)', + type: 'string', + }, + }) + .help('help') + .parseSync(); + + main(argv).catch(console.error); +} + +export default main; \ No newline at end of file diff --git a/src/util/__mocks__/geminiUrlScraper.js b/src/util/__mocks__/geminiUrlScraper.js new file mode 100644 index 00000000..3800023d --- /dev/null +++ b/src/util/__mocks__/geminiUrlScraper.js @@ -0,0 +1,4 @@ +// Mock implementation for Gemini URL scraper +const scrapeUrlsWithGemini = jest.fn(); + +export default scrapeUrlsWithGemini; \ No newline at end of file diff --git a/src/util/__tests__/scrapUrls.js b/src/util/__tests__/scrapUrls.js index 67a47945..1e20e778 100644 --- a/src/util/__tests__/scrapUrls.js +++ b/src/util/__tests__/scrapUrls.js @@ -1,4 +1,4 @@ -jest.mock('../grpc'); +jest.mock('../geminiUrlScraper'); import MockDate from 'mockdate'; @@ -7,7 +7,7 @@ import fixtures from '../__fixtures__/scrapUrls'; import scrapUrls, { removeFBCLIDIfExist } from '../scrapUrls'; import DataLoaders from 'graphql/dataLoaders'; import client from 'util/client'; -import resolveUrl from '../grpc'; +import scrapeUrlsWithGemini from '../geminiUrlScraper'; describe('scrapping & storage', () => { afterAll(async () => { @@ -24,25 +24,25 @@ describe('scrapping & storage', () => { it('scraps from Internet and handles error', async () => { 
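
Side note on the mock swap below: the gRPC resolver could return results out of order (the removed fixture's comment even says it mimics that), which is why scrapUrls.js used to re-sort results through a urlToIndex map. scrapeUrlsWithGemini instead hands its output straight back to DataLoader, so it must resolve to exactly one result per input URL, in input order. A minimal sketch of that contract, not part of this patch: assertOrderPreserved and the sample URLs are placeholders, and the import path just follows the util/* alias used elsewhere in this repo.

    import scrapeUrlsWithGemini from 'util/geminiUrlScraper';

    // DataLoader matches batch results to batch keys by position, so the
    // scraper has to echo each input URL back at the same index.
    async function assertOrderPreserved(urls) {
      const results = await scrapeUrlsWithGemini(urls);
      results.forEach((result, i) => {
        // Under the new implementation, html is always '' and status is
        // either 'SUCCESS' or 'ERROR'; result.url echoes the input URL.
        console.assert(result.url === urls[i]);
      });
    }

    assertOrderPreserved(['http://example.com/a', 'http://example.com/b']);
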
MockDate.set(1485593157011); - resolveUrl.__addMockResponse([ - // Mimics the out-of-order nature of gRPC - { - url: 'http://example.com/not-found', - canonical: 'http://example.com/not-found', - title: '', - summary: 'Not Found', - topImageUrl: '', - html: 'Not Found', - status: 404, - }, + scrapeUrlsWithGemini.mockResolvedValue([ { url: 'http://example.com/index.html', canonical: 'http://example.com/index.html', title: 'Some title', summary: 'Some text as summary', topImageUrl: '', - html: 'Hello world', - status: 200, + html: '', + status: 'SUCCESS', + }, + { + url: 'http://example.com/not-found', + canonical: 'http://example.com/not-found', + title: '', + summary: 'Not Found', + topImageUrl: '', + html: '', + status: 'ERROR', + error: 'Not Found', }, ]); diff --git a/src/util/geminiUrlScraper.js b/src/util/geminiUrlScraper.js new file mode 100644 index 00000000..15a74066 --- /dev/null +++ b/src/util/geminiUrlScraper.js @@ -0,0 +1,140 @@ +import { GoogleGenAI } from '@google/genai'; +import { GoogleAuth } from 'google-auth-library'; +import rollbar from '../rollbarInstance'; + +/** + * Scrapes URL content using Gemini's urlContext tool + * + * @param {string[]} urls - Array of URLs to scrape + * @returns {Promise>} + */ +export default async function scrapeUrlsWithGemini(urls) { + if (!urls || urls.length === 0) { + return []; + } + + const project = await new GoogleAuth().getProjectId(); + const genAI = new GoogleGenAI({ + vertexai: true, + project, + location: 'us-central1', // Using us-central1 for better availability + }); + + const results = []; + + // Process URLs one by one to avoid overwhelming the API + for (const url of urls) { + try { + const generateContentArgs = { + model: 'gemini-2.0-flash-001', + contents: [ + { + role: 'user', + parts: [ + { + text: `Please analyze the content at this URL: ${url} + +Extract and return ONLY a JSON object with the following structure (no markdown formatting, no extra text): +{ + "title": "The main title of the page", + "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes", + "topImageUrl": "URL of the most representative image on the page, or null if none exists" +} + +Requirements: +- title: Extract the main page title +- summary: Should be detailed enough for search and fact-checking, capturing all important claims and information +- topImageUrl: Find the most representative image (not logos, ads, or decorative images), return null if no suitable image exists +- Return valid JSON only, no markdown code blocks or explanations`, + }, + ], + }, + ], + tools: [ + { + functionDeclarations: [ + { + name: 'url_context', + description: 'Retrieves content from the specified URL', + parameters: { + type: 'object', + properties: { + url: { + type: 'string', + description: 'The URL to retrieve content from', + }, + }, + required: ['url'], + }, + }, + ], + }, + ], + config: { + systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.', + responseModalities: ['TEXT'], + temperature: 0.1, // Low temperature for consistent extraction + maxOutputTokens: 2048, + }, + }; + + const response = await genAI.models.generateContent(generateContentArgs); + + if (!response.candidates || !response.candidates[0]) { + throw new Error('No response candidates received'); + } + + const responseText = response.candidates[0].content.parts[0].text; + + // Parse the JSON response + let extractedData; + try { + // Clean the response text to 
extract JSON + const jsonMatch = responseText.match(/\{[\s\S]*\}/); + if (jsonMatch) { + extractedData = JSON.parse(jsonMatch[0]); + } else { + extractedData = JSON.parse(responseText); + } + } catch (parseError) { + console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText); + extractedData = { + title: null, + summary: responseText.trim() || 'Unable to extract structured content', + topImageUrl: null, + }; + } + + results.push({ + url, + canonical: url, // Use original URL as canonical for now + title: extractedData.title || null, + summary: extractedData.summary || null, + topImageUrl: extractedData.topImageUrl || null, + html: '', // Leave empty as requested + status: 'SUCCESS', + }); + + } catch (error) { + console.error('[geminiUrlScraper] Error processing URL:', url, error); + + rollbar.error('Gemini URL scraping error', { + url, + error: error.message, + }); + + results.push({ + url, + canonical: url, + title: null, + summary: null, + topImageUrl: null, + html: '', + status: 'ERROR', + error: error.message, + }); + } + } + + return results; +} \ No newline at end of file diff --git a/src/util/scrapUrls.js b/src/util/scrapUrls.js index 1d407c97..a3ce9936 100644 --- a/src/util/scrapUrls.js +++ b/src/util/scrapUrls.js @@ -3,7 +3,7 @@ import urlRegex from 'url-regex'; import DataLoader from 'dataloader'; import url from 'url'; -import resolveUrl from './grpc'; +import scrapeUrlsWithGemini from './geminiUrlScraper'; /** * Extracts urls from a string. @@ -33,17 +33,8 @@ async function scrapUrls( const normalizedUrls = removeFBCLIDIfExist(originalUrls); const scrapLoader = new DataLoader(async (urls) => { - const urlToIndex = urls.reduce((map, url, i) => { - map[url] = i; - return map; - }, {}); - const unorderedFetchResults = await resolveUrl(urls); - const orderedFetchResults = []; - unorderedFetchResults.forEach( - (fetchResult) => - (orderedFetchResults[urlToIndex[fetchResult.url]] = fetchResult) - ); - return orderedFetchResults; + // Use Gemini-based URL scraping instead of gRPC + return await scrapeUrlsWithGemini(urls); }); // result: list of ScrapResult, with its `url` being the url in text, @@ -82,8 +73,8 @@ async function scrapUrls( scrappingCount += 1; return scrapLoader.load(result).then((scrapped) => ({ ...scrapped, - url: originalUrls[i], - normalizedUrl: scrapped.url, + url: originalUrls[i], // Use original URL from text + normalizedUrl: result, // Use normalized URL for caching })); }) ); From 5194dc425c5222023d3d15cc26c0beb5c021b6e1 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <209825114+claude[bot]@users.noreply.github.com> Date: Tue, 12 Aug 2025 07:50:46 +0000 Subject: [PATCH 2/9] Fix Gemini URL scraper based on review feedback - Use correct urlContext tool configuration format - Process multiple URLs in single LLM call for efficiency - Switch to gemini-2.5-flash model - Add canonical URL extraction capability - Implement client reuse pattern for performance - Add safety check for empty URL list in experiment script Co-authored-by: Johnson Liang --- src/scripts/experimentUrlScraper.ts | 6 +- src/util/geminiUrlScraper.js | 217 +++++++++++++++------------- 2 files changed, 118 insertions(+), 105 deletions(-) diff --git a/src/scripts/experimentUrlScraper.ts b/src/scripts/experimentUrlScraper.ts index 508fbcac..07713bb2 100644 --- a/src/scripts/experimentUrlScraper.ts +++ b/src/scripts/experimentUrlScraper.ts @@ -41,12 +41,16 @@ async function main({ if (single) { testUrls = [single]; } else if (urls) { - testUrls = urls.split(',').map(url => 
url.trim()); + testUrls = urls.split(',').map(url => url.trim()).filter(Boolean); } else { testUrls = DEFAULT_TEST_URLS; console.info('No URLs specified, using default test URLs:', testUrls); } + if (testUrls.length === 0) { + console.info('No valid URLs to process. Exiting.'); + return; + } console.info(`Testing URL scraping with ${testUrls.length} URLs`); const trace = langfuse.trace({ diff --git a/src/util/geminiUrlScraper.js b/src/util/geminiUrlScraper.js index 15a74066..d4e26b6e 100644 --- a/src/util/geminiUrlScraper.js +++ b/src/util/geminiUrlScraper.js @@ -2,6 +2,21 @@ import { GoogleGenAI } from '@google/genai'; import { GoogleAuth } from 'google-auth-library'; import rollbar from '../rollbarInstance'; +// Singleton client for reuse across requests +let genAI; + +async function getGenAIClient() { + if (genAI) return genAI; + + const project = await new GoogleAuth().getProjectId(); + genAI = new GoogleGenAI({ + vertexai: true, + project, + location: 'us-central1', // Using us-central1 for better availability + }); + return genAI; +} + /** * Scrapes URL content using Gemini's urlContext tool * @@ -13,128 +28,122 @@ export default async function scrapeUrlsWithGemini(urls) { return []; } - const project = await new GoogleAuth().getProjectId(); - const genAI = new GoogleGenAI({ - vertexai: true, - project, - location: 'us-central1', // Using us-central1 for better availability - }); + const genAIClient = await getGenAIClient(); - const results = []; + try { + // Process all URLs in a single LLM call for efficiency + const urlList = urls.map((url, index) => `${index + 1}. ${url}`).join('\n'); + + const generateContentArgs = { + model: 'gemini-2.5-flash', + contents: [ + { + role: 'user', + parts: [ + { + text: `Please analyze the content at these URLs and extract information from each: - // Process URLs one by one to avoid overwhelming the API - for (const url of urls) { - try { - const generateContentArgs = { - model: 'gemini-2.0-flash-001', - contents: [ - { - role: 'user', - parts: [ - { - text: `Please analyze the content at this URL: ${url} +${urlList} -Extract and return ONLY a JSON object with the following structure (no markdown formatting, no extra text): -{ - "title": "The main title of the page", - "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes", - "topImageUrl": "URL of the most representative image on the page, or null if none exists" -} +For each URL, extract and return a JSON array with objects having the following structure (no markdown formatting, no extra text): +[ + { + "url": "original URL from the list", + "canonical": "canonical URL if different from original, or same as original", + "title": "The main title of the page", + "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes", + "topImageUrl": "URL of the most representative image on the page, or null if none exists" + } +] Requirements: +- url: Return the exact original URL from the input list +- canonical: Extract the canonical URL from meta tags or use the original URL if no canonical is found - title: Extract the main page title - summary: Should be detailed enough for search and fact-checking, capturing all important claims and information - topImageUrl: Find the most representative image (not logos, ads, or decorative images), return null if no suitable image exists -- Return valid JSON only, no markdown code blocks or explanations`, - }, - ], - }, - ], - tools: [ - { - 
functionDeclarations: [ - { - name: 'url_context', - description: 'Retrieves content from the specified URL', - parameters: { - type: 'object', - properties: { - url: { - type: 'string', - description: 'The URL to retrieve content from', - }, - }, - required: ['url'], - }, - }, - ], - }, - ], - config: { - systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.', - responseModalities: ['TEXT'], - temperature: 0.1, // Low temperature for consistent extraction - maxOutputTokens: 2048, +- Return valid JSON array only, no markdown code blocks or explanations +- Process all URLs and return results for each, even if some fail`, + }, + ], }, - }; + ], + config: { + tools: [{ urlContext: {} }], + systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.', + responseModalities: ['TEXT'], + temperature: 0.1, // Low temperature for consistent extraction + maxOutputTokens: 4096, + }, + }; - const response = await genAI.models.generateContent(generateContentArgs); - - if (!response.candidates || !response.candidates[0]) { - throw new Error('No response candidates received'); - } + const response = await genAIClient.models.generateContent(generateContentArgs); + + if (!response.candidates || !response.candidates[0]) { + throw new Error('No response candidates received'); + } - const responseText = response.candidates[0].content.parts[0].text; - - // Parse the JSON response - let extractedData; - try { - // Clean the response text to extract JSON - const jsonMatch = responseText.match(/\{[\s\S]*\}/); - if (jsonMatch) { - extractedData = JSON.parse(jsonMatch[0]); - } else { - extractedData = JSON.parse(responseText); - } - } catch (parseError) { - console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText); - extractedData = { - title: null, - summary: responseText.trim() || 'Unable to extract structured content', - topImageUrl: null, - }; + const responseText = response.candidates[0].content.parts[0].text; + + // Parse the JSON response + let extractedDataArray; + try { + // Clean the response text to extract JSON array + const jsonMatch = responseText.match(/\[[\s\S]*\]/); + if (jsonMatch) { + extractedDataArray = JSON.parse(jsonMatch[0]); + } else { + extractedDataArray = JSON.parse(responseText); } - - results.push({ - url, - canonical: url, // Use original URL as canonical for now - title: extractedData.title || null, - summary: extractedData.summary || null, - topImageUrl: extractedData.topImageUrl || null, - html: '', // Leave empty as requested - status: 'SUCCESS', - }); - - } catch (error) { - console.error('[geminiUrlScraper] Error processing URL:', url, error); - - rollbar.error('Gemini URL scraping error', { - url, - error: error.message, - }); - - results.push({ + } catch (parseError) { + console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText); + // Fallback: create error results for all URLs + return urls.map(url => ({ url, canonical: url, title: null, - summary: null, + summary: 'Unable to extract structured content', topImageUrl: null, html: '', status: 'ERROR', - error: error.message, - }); + error: 'Failed to parse LLM response', + })); } - } - return results; + // Ensure we have results for all input URLs + const results = urls.map(url => { + const extracted = extractedDataArray.find(item => item.url === url) || {}; + return { + url, + canonical: extracted.canonical || url, + title: extracted.title 
|| null, + summary: extracted.summary || null, + topImageUrl: extracted.topImageUrl || null, + html: '', // Leave empty as requested + status: 'SUCCESS', + }; + }); + + return results; + + } catch (error) { + console.error('[geminiUrlScraper] Error processing URLs:', error); + + rollbar.error('Gemini URL scraping error', { + urls, + error: error.message, + }); + + // Return error results for all URLs + return urls.map(url => ({ + url, + canonical: url, + title: null, + summary: null, + topImageUrl: null, + html: '', + status: 'ERROR', + error: error.message, + })); + } } \ No newline at end of file From 1402d393a586938f7676b17adeb3a791dab0bcef Mon Sep 17 00:00:00 2001 From: MrOrz Date: Sun, 24 Aug 2025 13:10:52 +0800 Subject: [PATCH 3/9] fix: lint --- src/scripts/experimentUrlScraper.ts | 48 +++++++++++++++++++---------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/src/scripts/experimentUrlScraper.ts b/src/scripts/experimentUrlScraper.ts index 07713bb2..a4d870e5 100644 --- a/src/scripts/experimentUrlScraper.ts +++ b/src/scripts/experimentUrlScraper.ts @@ -37,11 +37,14 @@ async function main({ runName: string; }) { let testUrls: string[]; - + if (single) { testUrls = [single]; } else if (urls) { - testUrls = urls.split(',').map(url => url.trim()).filter(Boolean); + testUrls = urls + .split(',') + .map((url) => url.trim()) + .filter(Boolean); } else { testUrls = DEFAULT_TEST_URLS; console.info('No URLs specified, using default test URLs:', testUrls); @@ -56,7 +59,7 @@ async function main({ const trace = langfuse.trace({ name: `URL Scraper Experiment: ${runName}`, input: testUrls, - metadata: { + metadata: { experimentType: 'url-scraping', tool: 'gemini-urlcontext', urlCount: testUrls.length, @@ -66,23 +69,29 @@ async function main({ try { console.info('Starting URL scraping...'); const startTime = Date.now(); - + const results = await scrapeUrlsWithGemini(testUrls); - + const endTime = Date.now(); const duration = endTime - startTime; console.info(`\n=== RESULTS ===`); console.info(`Processed ${results.length} URLs in ${duration}ms`); - console.info(`Average time per URL: ${Math.round(duration / results.length)}ms\n`); + console.info( + `Average time per URL: ${Math.round(duration / results.length)}ms\n` + ); results.forEach((result, index) => { console.info(`--- URL ${index + 1}: ${result.url} ---`); console.info(`Status: ${result.status}`); - + if (result.status === 'SUCCESS') { console.info(`Title: ${result.title || 'N/A'}`); - console.info(`Summary: ${result.summary ? result.summary.substring(0, 200) + '...' : 'N/A'}`); + console.info( + `Summary: ${ + result.summary ? result.summary.substring(0, 200) + '...' 
: 'N/A' + }` + ); console.info(`Top Image: ${result.topImageUrl || 'N/A'}`); } else { console.info(`Error: ${result.error}`); @@ -91,12 +100,20 @@ async function main({ }); // Count success/failure rates - const successCount = results.filter(r => r.status === 'SUCCESS').length; - const errorCount = results.filter(r => r.status === 'ERROR').length; - + const successCount = results.filter((r) => r.status === 'SUCCESS').length; + const errorCount = results.filter((r) => r.status === 'ERROR').length; + console.info('=== SUMMARY ==='); - console.info(`Success rate: ${successCount}/${results.length} (${Math.round(successCount / results.length * 100)}%)`); - console.info(`Error rate: ${errorCount}/${results.length} (${Math.round(errorCount / results.length * 100)}%)`); + console.info( + `Success rate: ${successCount}/${results.length} (${Math.round( + (successCount / results.length) * 100 + )}%)` + ); + console.info( + `Error rate: ${errorCount}/${results.length} (${Math.round( + (errorCount / results.length) * 100 + )}%)` + ); console.info(`Total processing time: ${duration}ms`); // Record results in Langfuse @@ -119,13 +136,12 @@ async function main({ // Score based on average processing time (lower is better, normalize to 0-1) const avgTimePerUrl = duration / results.length; - const timeScore = Math.max(0, 1 - (avgTimePerUrl / 10000)); // Penalize if >10s per URL + const timeScore = Math.max(0, 1 - avgTimePerUrl / 10000); // Penalize if >10s per URL trace.score({ name: 'processing-speed', value: timeScore, comment: `Average ${Math.round(avgTimePerUrl)}ms per URL`, }); - } catch (error) { console.error('Experiment failed:', error); trace.update({ @@ -165,4 +181,4 @@ if (require.main === module) { main(argv).catch(console.error); } -export default main; \ No newline at end of file +export default main; From 125194b32b26cc7683ffd7e2f3302b5058679650 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Sun, 24 Aug 2025 13:12:07 +0800 Subject: [PATCH 4/9] fix: improve error logging in URL scraper experiment --- src/scripts/experimentUrlScraper.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/experimentUrlScraper.ts b/src/scripts/experimentUrlScraper.ts index a4d870e5..ec249508 100644 --- a/src/scripts/experimentUrlScraper.ts +++ b/src/scripts/experimentUrlScraper.ts @@ -145,12 +145,12 @@ async function main({ } catch (error) { console.error('Experiment failed:', error); trace.update({ - output: { error: error.message }, + output: { error }, }); trace.score({ name: 'success-rate', value: 0, - comment: `Experiment failed: ${error.message}`, + comment: `Experiment failed: ${error}`, }); } From 2faeb47f491a47fb0a83bb47d83187d35848e0ae Mon Sep 17 00:00:00 2001 From: MrOrz Date: Sun, 24 Aug 2025 13:27:41 +0800 Subject: [PATCH 5/9] fix: format --- src/util/geminiUrlScraper.js | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/src/util/geminiUrlScraper.js b/src/util/geminiUrlScraper.js index d4e26b6e..a3e42e46 100644 --- a/src/util/geminiUrlScraper.js +++ b/src/util/geminiUrlScraper.js @@ -19,7 +19,7 @@ async function getGenAIClient() { /** * Scrapes URL content using Gemini's urlContext tool - * + * * @param {string[]} urls - Array of URLs to scrape * @returns {Promise>} */ @@ -33,7 +33,7 @@ export default async function scrapeUrlsWithGemini(urls) { try { // Process all URLs in a single LLM call for efficiency const urlList = urls.map((url, index) => `${index + 1}. 
${url}`).join('\n'); - + const generateContentArgs = { model: 'gemini-2.5-flash', contents: [ @@ -70,21 +70,24 @@ Requirements: ], config: { tools: [{ urlContext: {} }], - systemInstruction: 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.', + systemInstruction: + 'You are a web content analyzer that extracts structured information from web pages for fact-checking purposes.', responseModalities: ['TEXT'], temperature: 0.1, // Low temperature for consistent extraction maxOutputTokens: 4096, }, }; - const response = await genAIClient.models.generateContent(generateContentArgs); - + const response = await genAIClient.models.generateContent( + generateContentArgs + ); + if (!response.candidates || !response.candidates[0]) { throw new Error('No response candidates received'); } const responseText = response.candidates[0].content.parts[0].text; - + // Parse the JSON response let extractedDataArray; try { @@ -96,9 +99,12 @@ Requirements: extractedDataArray = JSON.parse(responseText); } } catch (parseError) { - console.warn('[geminiUrlScraper] Failed to parse JSON response:', responseText); + console.warn( + '[geminiUrlScraper] Failed to parse JSON response:', + responseText + ); // Fallback: create error results for all URLs - return urls.map(url => ({ + return urls.map((url) => ({ url, canonical: url, title: null, @@ -111,8 +117,9 @@ Requirements: } // Ensure we have results for all input URLs - const results = urls.map(url => { - const extracted = extractedDataArray.find(item => item.url === url) || {}; + const results = urls.map((url) => { + const extracted = + extractedDataArray.find((item) => item.url === url) || {}; return { url, canonical: extracted.canonical || url, @@ -125,17 +132,16 @@ Requirements: }); return results; - } catch (error) { console.error('[geminiUrlScraper] Error processing URLs:', error); - + rollbar.error('Gemini URL scraping error', { urls, error: error.message, }); // Return error results for all URLs - return urls.map(url => ({ + return urls.map((url) => ({ url, canonical: url, title: null, @@ -146,4 +152,4 @@ Requirements: error: error.message, })); } -} \ No newline at end of file +} From ef1ca5f2c436fb4f8986296756c036d6b8a26a75 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <209825114+claude[bot]@users.noreply.github.com> Date: Sun, 24 Aug 2025 14:35:18 +0000 Subject: [PATCH 6/9] fix: update Gemini URL scraper prompt to extract exact content instead of summaries - Changed prompt to explicitly request original text content without rewriting - Added clear instructions to preserve all claims and statements as written - This ensures content can be properly indexed for fact-checking purposes Co-authored-by: Johnson Liang --- src/util/geminiUrlScraper.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/util/geminiUrlScraper.js b/src/util/geminiUrlScraper.js index a3e42e46..c50135e0 100644 --- a/src/util/geminiUrlScraper.js +++ b/src/util/geminiUrlScraper.js @@ -41,7 +41,7 @@ export default async function scrapeUrlsWithGemini(urls) { role: 'user', parts: [ { - text: `Please analyze the content at these URLs and extract information from each: + text: `Please analyze the content at these URLs and extract the exact original text content from each: ${urlList} @@ -51,7 +51,7 @@ For each URL, extract and return a JSON array with objects having the following "url": "original URL from the list", "canonical": "canonical URL if different from original, or same as original", "title": "The main 
title of the page", - "summary": "A comprehensive summary of the content that captures the key information for fact-checking purposes", + "summary": "The exact original text content from the page without any rewriting, summarization, or paraphrasing - preserve all claims, statements, and information as written in the source", "topImageUrl": "URL of the most representative image on the page, or null if none exists" } ] @@ -59,8 +59,8 @@ For each URL, extract and return a JSON array with objects having the following Requirements: - url: Return the exact original URL from the input list - canonical: Extract the canonical URL from meta tags or use the original URL if no canonical is found -- title: Extract the main page title -- summary: Should be detailed enough for search and fact-checking, capturing all important claims and information +- title: Extract the main page title exactly as it appears +- summary: Extract the complete original text content from the page - DO NOT summarize, rewrite, or paraphrase. Keep all original claims, statements, facts, and information exactly as written in the source material. This text will be used for indexing and search purposes. - topImageUrl: Find the most representative image (not logos, ads, or decorative images), return null if no suitable image exists - Return valid JSON array only, no markdown code blocks or explanations - Process all URLs and return results for each, even if some fail`, From 97064147a28f8551afad2ff87389744b42206d30 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 25 Aug 2025 01:36:49 +0800 Subject: [PATCH 7/9] fix: show full summary logging in URL scraper experiments --- src/scripts/experimentUrlScraper.ts | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/scripts/experimentUrlScraper.ts b/src/scripts/experimentUrlScraper.ts index ec249508..880168e9 100644 --- a/src/scripts/experimentUrlScraper.ts +++ b/src/scripts/experimentUrlScraper.ts @@ -87,11 +87,7 @@ async function main({ if (result.status === 'SUCCESS') { console.info(`Title: ${result.title || 'N/A'}`); - console.info( - `Summary: ${ - result.summary ? result.summary.substring(0, 200) + '...' : 'N/A' - }` - ); + console.info(`Summary: ${result.summary || 'N/A'}`); console.info(`Top Image: ${result.topImageUrl || 'N/A'}`); } else { console.info(`Error: ${result.error}`); From 9d5f6973b3f40152e323887ec3026ea05736b8b2 Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 25 Aug 2025 01:47:07 +0800 Subject: [PATCH 8/9] fix: refine summary extraction criteria in Gemini URL scraper --- src/util/geminiUrlScraper.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/geminiUrlScraper.js b/src/util/geminiUrlScraper.js index c50135e0..cdba4cf0 100644 --- a/src/util/geminiUrlScraper.js +++ b/src/util/geminiUrlScraper.js @@ -60,10 +60,10 @@ Requirements: - url: Return the exact original URL from the input list - canonical: Extract the canonical URL from meta tags or use the original URL if no canonical is found - title: Extract the main page title exactly as it appears -- summary: Extract the complete original text content from the page - DO NOT summarize, rewrite, or paraphrase. Keep all original claims, statements, facts, and information exactly as written in the source material. This text will be used for indexing and search purposes. +- summary: Extract the main textual content of the page, such as articles, posts, or video descriptions. 
Exclude navigational elements, cookie banners, ads, and system messages (e.g., 'Tap to unmute', 'Your browser can't play this video'). The goal is to capture the substance of the content for fact-checking and search indexing. DO NOT summarize, rewrite, or paraphrase. Keep all original claims, statements, facts, and information exactly as written in the source material. - topImageUrl: Find the most representative image (not logos, ads, or decorative images), return null if no suitable image exists - Return valid JSON array only, no markdown code blocks or explanations -- Process all URLs and return results for each, even if some fail`, +- Process all URLs and return results for each. If a URL cannot be accessed or results in an error, return an object for it with "title" and "summary" set to null.`, }, ], }, From c24425bd3f3aae68383088758975973435d559ac Mon Sep 17 00:00:00 2001 From: MrOrz Date: Mon, 25 Aug 2025 01:53:42 +0800 Subject: [PATCH 9/9] feat: add YouTube video transcription function using Gemini model --- src/graphql/util.js | 109 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/src/graphql/util.js b/src/graphql/util.js index 8381a73a..1e2b8161 100644 --- a/src/graphql/util.js +++ b/src/graphql/util.js @@ -856,6 +856,115 @@ function extractTextFromFullTextAnnotation(fullTextAnnotation) { .join(''); } +/** + * Transcribes a YouTube video using Gemini model. + * + * @param {object} params + * @param {string} params.youtubeUrl - The URL of the YouTube video. + * @param {import('@langfuse/langfuse').Trace} params.langfuseTrace - Langfuse trace object + * @returns {Promise<{text: string, usage: {promptTokens?: number, completionTokens?: number, totalTokens?: number}}>} + */ +export async function transcribeYoutube({ youtubeUrl, langfuseTrace }) { + const modelName = 'gemini-2.5-flash'; + const location = 'global'; + + const project = await new GoogleAuth().getProjectId(); + const genAI = new GoogleGenAI({ + vertexai: true, + project, + location, + }); + + /**@type {import('@google/genai').GenerateContentParameters} */ + const generateContentArgs = { + model: modelName, + contents: [ + { + role: 'user', + parts: [ + { + fileData: { + fileUri: youtubeUrl, + mimeType: 'video/*', + }, + }, + ], + }, + ], + config: { + systemInstruction: `You are helping a fact checker to fact-check a YouTube video. Your task is to provide an accurate transcript of the video. The transcript will be used by human fact-checkers and for indexing in a hoax database. + +Please follow these rules carefully: +- Transcribe the exact text shown visually and spoken in the audio. +- If the verbal and visual text are the same, include it only once. +- Do not miss any text or spoken words. +- Output only the transcript. Do not include timestamps or any explanations. +- Use the same language as the video. For Chinese, use the character set (Traditional or Simplified) that appears in the video's text. If there is no text, prefer Traditional Chinese. 
+- Group sentences into paragraphs with appropriate punctuation to improve readability.`, + responseModalities: ['TEXT'], + temperature: 0.5, + maxOutputTokens: 65536, + thinkingConfig: { thinkingBudget: 0 }, + mediaResolution: 'MEDIA_RESOLUTION_LOW', + safetySettings: [ + { + category: 'HARM_CATEGORY_HATE_SPEECH', + threshold: 'OFF', + }, + { + category: 'HARM_CATEGORY_DANGEROUS_CONTENT', + threshold: 'OFF', + }, + { + category: 'HARM_CATEGORY_SEXUALLY_EXPLICIT', + threshold: 'OFF', + }, + { + category: 'HARM_CATEGORY_HARASSMENT', + threshold: 'OFF', + }, + ], + }, + }; + + const generation = langfuseTrace.generation({ + name: 'gemini-youtube-transcript', + modelParameters: { + temperature: generateContentArgs.config.temperature, + maxOutputTokens: generateContentArgs.config.maxOutputTokens, + thinkingBudget: generateContentArgs.config.thinkingConfig?.thinkingBudget, + safetySettings: JSON.stringify(generateContentArgs.config.safetySettings), + mediaResolution: generateContentArgs.config.mediaResolution, + }, + input: JSON.stringify({ + systemInstruction: generateContentArgs.config.systemInstruction, + contents: generateContentArgs.contents, + }), + }); + + const response = await genAI.models.generateContent(generateContentArgs); + console.log('[transcribeYoutube]', JSON.stringify(response)); + + const output = response.candidates[0].content.parts[0].text; + const usage = { + promptTokens: response.usageMetadata?.promptTokenCount, + completionTokens: response.usageMetadata?.candidatesTokenCount, + totalTokens: + (response.usageMetadata?.promptTokenCount || 0) + + (response.usageMetadata?.candidatesTokenCount || 0), + }; + + langfuseTrace.update({ output }); + generation.end({ + output: JSON.stringify(response), + usage, + model: modelName, + modelParameters: { location }, + }); + + return { text: output, usage }; +} + /** * Transcribes audio/video content using Gemini model *
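
Two usage notes on the series, both sketches rather than tested code.

First, patch 2's move from a functionDeclarations stub to tools: [{ urlContext: {} }] treats URL retrieval as a built-in tool of the @google/genai SDK instead of a user-defined function the model could describe but never execute. A stripped-down call shape, with the prompt trimmed from the real implementation and the example URL a placeholder:

    // genAIClient comes from getGenAIClient() in src/util/geminiUrlScraper.js.
    async function summarizeOneUrl(genAIClient) {
      const response = await genAIClient.models.generateContent({
        model: 'gemini-2.5-flash',
        contents: [
          { role: 'user', parts: [{ text: 'Summarize https://example.com' }] },
        ],
        config: { tools: [{ urlContext: {} }] },
      });
      // Same response shape the scraper relies on.
      return response.candidates[0].content.parts[0].text;
    }

Second, a minimal caller for the new transcribeYoutube helper. The video URL is a placeholder, and the trace wiring simply mirrors what experimentUrlScraper.ts already does with util/langfuse:

    import langfuse from 'util/langfuse';
    import { transcribeYoutube } from 'graphql/util';

    async function demo() {
      const trace = langfuse.trace({ name: 'youtube-transcript-demo' });
      const { text, usage } = await transcribeYoutube({
        youtubeUrl: 'https://www.youtube.com/watch?v=PLACEHOLDER',
        langfuseTrace: trace,
      });
      // usage.totalTokens is promptTokens + completionTokens; with
      // thinkingBudget: 0 there is no separate reasoning-token count to add.
      console.info(text.slice(0, 200));
      console.info(usage);
      await langfuse.flushAsync();
    }

    demo().catch(console.error);
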