184 changes: 184 additions & 0 deletions src/scripts/experimentUrlScraper.ts
@@ -0,0 +1,184 @@
/**
 * Script to test URL scraping using Gemini urlContext tool.
 *
 * Usage:
 *   npx tsx src/scripts/experimentUrlScraper.ts \
 *     --runName "url-scraper-experiment-1" \
 *     [--urls "https://example.com,https://another-site.com"] \
 *     [--single "https://single-test-url.com"]
 *
 * Required args:
 *   --runName: Name to identify this experiment run in Langfuse
 *
 * Optional args:
 *   --urls: Comma-separated list of URLs to test
 *   --single: Single URL to test (alternative to --urls)
 */
import 'dotenv/config';
import yargs from 'yargs';

import scrapeUrlsWithGemini from '../util/geminiUrlScraper.js';
import langfuse from 'util/langfuse';

// Default test URLs for experimentation
const DEFAULT_TEST_URLS = [
  'https://www.cofacts.tw',
  'https://github.com/cofacts/rumors-api',
  'https://www.taiwannews.com.tw/en/news/5023456',
];

async function main({
  urls,
  single,
  runName,
}: {
  urls?: string;
  single?: string;
  runName: string;
}) {
  let testUrls: string[];

  if (single) {
    testUrls = [single];
  } else if (urls) {
    testUrls = urls
      .split(',')
      .map((url) => url.trim())
      .filter(Boolean);
  } else {
    testUrls = DEFAULT_TEST_URLS;
    console.info('No URLs specified, using default test URLs:', testUrls);
  }

  if (testUrls.length === 0) {
    console.info('No valid URLs to process. Exiting.');
    return;
  }
  console.info(`Testing URL scraping with ${testUrls.length} URLs`);
Contributor review comment (severity: high):

If the urls argument results in an empty list of URLs after trimming and filtering, the script could encounter division-by-zero errors later when calculating statistics. It's safer to add a check to exit early if testUrls is empty:

  if (testUrls.length === 0) {
    console.info('No valid URLs to process. Exiting.');
    return;
  }
  console.info(`Testing URL scraping with ${testUrls.length} URLs`);
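
For context, division by zero does not throw in JavaScript, so without the early return the failure mode would be nonsense statistics later in the script rather than a crash:

  Math.round(duration / results.length)              // -> Infinity when results.length === 0
  Math.round((successCount / results.length) * 100)  // -> NaN (0 / 0)

The early-return guard shown in the suggestion (and in the code above) sidesteps both.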


  const trace = langfuse.trace({
    name: `URL Scraper Experiment: ${runName}`,
    input: testUrls,
    metadata: {
      experimentType: 'url-scraping',
      tool: 'gemini-urlcontext',
      urlCount: testUrls.length,
    },
  });

  try {
    console.info('Starting URL scraping...');
    const startTime = Date.now();

    const results = await scrapeUrlsWithGemini(testUrls);

    const endTime = Date.now();
    const duration = endTime - startTime;

    console.info(`\n=== RESULTS ===`);
    console.info(`Processed ${results.length} URLs in ${duration}ms`);
    console.info(
      `Average time per URL: ${Math.round(duration / results.length)}ms\n`
    );

    results.forEach((result, index) => {
      console.info(`--- URL ${index + 1}: ${result.url} ---`);
      console.info(`Status: ${result.status}`);

      if (result.status === 'SUCCESS') {
        console.info(`Title: ${result.title || 'N/A'}`);
        console.info(
          `Summary: ${
            result.summary ? result.summary.substring(0, 200) + '...' : 'N/A'
          }`
        );
        console.info(`Top Image: ${result.topImageUrl || 'N/A'}`);
      } else {
        console.info(`Error: ${result.error}`);
      }
      console.info('');
    });

    // Count success/failure rates
    const successCount = results.filter((r) => r.status === 'SUCCESS').length;
    const errorCount = results.filter((r) => r.status === 'ERROR').length;

    console.info('=== SUMMARY ===');
    console.info(
      `Success rate: ${successCount}/${results.length} (${Math.round(
        (successCount / results.length) * 100
      )}%)`
    );
    console.info(
      `Error rate: ${errorCount}/${results.length} (${Math.round(
        (errorCount / results.length) * 100
      )}%)`
    );
    console.info(`Total processing time: ${duration}ms`);

    // Record results in Langfuse
    trace.update({
      output: results,
      metadata: {
        successCount,
        errorCount,
        totalDuration: duration,
        averageDurationPerUrl: Math.round(duration / results.length),
      },
    });

    // Score the experiment based on success rate
    trace.score({
      name: 'success-rate',
      value: successCount / results.length,
      comment: `${successCount} successful out of ${results.length} URLs`,
    });

    // Score based on average processing time (lower is better, normalize to 0-1)
    const avgTimePerUrl = duration / results.length;
    const timeScore = Math.max(0, 1 - avgTimePerUrl / 10000); // Penalize if >10s per URL
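    // For example, an average of 3,000 ms per URL gives 1 - 3000 / 10000 = 0.7,
    // while anything at or above 10,000 ms per URL bottoms out at 0.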
    trace.score({
      name: 'processing-speed',
      value: timeScore,
      comment: `Average ${Math.round(avgTimePerUrl)}ms per URL`,
    });
  } catch (error) {
    console.error('Experiment failed:', error);
    trace.update({
      output: { error },
    });
    trace.score({
      name: 'success-rate',
      value: 0,
      comment: `Experiment failed: ${error}`,
    });
  }

  await langfuse.flushAsync();
}

/* istanbul ignore if */
if (require.main === module) {
  const argv = yargs
    .options({
      runName: {
        description: 'Name to identify this experiment run in Langfuse',
        type: 'string',
        demandOption: true,
      },
      urls: {
        description: 'Comma-separated list of URLs to test',
        type: 'string',
      },
      single: {
        description: 'Single URL to test (alternative to --urls)',
        type: 'string',
      },
    })
    .help('help')
    .parseSync();

  main(argv).catch(console.error);
}

export default main;
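
The script never declares the shape of what scrapeUrlsWithGemini resolves to, but the result loop above and the test fixtures below read the same fields. A minimal sketch of the result type this implies — the names here are inferred, and the actual declaration in src/util/geminiUrlScraper.ts may differ:

  // Hypothetical type, reconstructed from the fields the script and tests use.
  interface GeminiScrapeResult {
    url: string;                  // the URL that was scraped
    canonical?: string;           // canonical URL, as seen in the test fixtures
    status: 'SUCCESS' | 'ERROR';  // the script branches on these two values
    title?: string;               // populated on SUCCESS
    summary?: string;             // populated on SUCCESS; logged truncated to 200 chars
    topImageUrl?: string;         // populated on SUCCESS
    html?: string;                // the test fixtures set this to ''
    error?: string;               // populated on ERROR
  }

  // Assumed signature of the default export the script awaits:
  // scrapeUrlsWithGemini(urls: string[]): Promise<GeminiScrapeResult[]>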
4 changes: 4 additions & 0 deletions src/util/__mocks__/geminiUrlScraper.js
@@ -0,0 +1,4 @@
// Mock implementation for Gemini URL scraper
const scrapeUrlsWithGemini = jest.fn();

export default scrapeUrlsWithGemini;

Check failure on line 4 in src/util/__mocks__/geminiUrlScraper.js (GitHub Actions / install-and-test): Insert `⏎`
30 changes: 15 additions & 15 deletions src/util/__tests__/scrapUrls.js
@@ -1,4 +1,4 @@
jest.mock('../grpc');
jest.mock('../geminiUrlScraper');

import MockDate from 'mockdate';

@@ -7,7 +7,7 @@
import scrapUrls, { removeFBCLIDIfExist } from '../scrapUrls';
import DataLoaders from 'graphql/dataLoaders';
import client from 'util/client';
import resolveUrl from '../grpc';
import scrapeUrlsWithGemini from '../geminiUrlScraper';

describe('scrapping & storage', () => {
  afterAll(async () => {
@@ -24,25 +24,25 @@
  it('scraps from Internet and handles error', async () => {
    MockDate.set(1485593157011);

    resolveUrl.__addMockResponse([
      // Mimics the out-of-order nature of gRPC
      {
        url: 'http://example.com/not-found',
        canonical: 'http://example.com/not-found',
        title: '',
        summary: 'Not Found',
        topImageUrl: '',
        html: '<html><head></head><body>Not Found</body></html>',
        status: 404,
      },
    scrapeUrlsWithGemini.mockResolvedValue([
      {
        url: 'http://example.com/index.html',
        canonical: 'http://example.com/index.html',
        title: 'Some title',
        summary: 'Some text as summary',
        topImageUrl: '',
        html: '<html><head></head><body>Hello world</body></html>',
        status: 200,
        html: '',
        status: 'SUCCESS',
      },
      {
        url: 'http://example.com/not-found',
        canonical: 'http://example.com/not-found',
        title: '',
        summary: 'Not Found',
        topImageUrl: '',
        html: '',
        status: 'ERROR',
        error: 'Not Found',
      },
    ]);

@@ -56,8 +56,8 @@
    );
    MockDate.reset();

    expect(resolveUrl.__getRequests()).toMatchSnapshot('GraphQL requests');

Check failure on line 59 in src/util/__tests__/scrapUrls.js (GitHub Actions / install-and-test): 'resolveUrl' is not defined
    resolveUrl.__reset();

Check failure on line 60 in src/util/__tests__/scrapUrls.js (GitHub Actions / install-and-test): 'resolveUrl' is not defined

    expect(foundResult).toMatchSnapshot('foundResult');
    expect(notFoundResult).toMatchSnapshot('notFoundResult');
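
The two check failures above indicate that resolveUrl is no longer imported (it appears to have been replaced by the scrapeUrlsWithGemini import), while the test still calls its __getRequests() and __reset() helpers — helpers the new manual mock, a bare jest.fn(), does not provide. One way the failing lines could be rewritten against the new mock, assuming the goal is still to snapshot the requested URLs and reset call history between tests:

  // Sketch only: jest.fn() records arguments on `.mock.calls`; mockClear() resets them.
  expect(scrapeUrlsWithGemini.mock.calls).toMatchSnapshot('Gemini scraper requests');
  scrapeUrlsWithGemini.mockClear();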