Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ on:
branches:
- main

# These permissions are needed to interact with AWS S3 via GitHub's OIDC Token endpoint
permissions:
id-token: write
contents: read
pull-requests: read

jobs:
unit-tests:
runs-on: ${{ matrix.os }}
Expand Down Expand Up @@ -58,6 +64,32 @@ jobs:
- name: Install cdx_toolkit
run: pip install .[test]

- name: Configure AWS credentials from OIDC (disabled for forks)
if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push'
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::837454214164:role/GitHubActions-Role
aws-region: us-east-1

- name: Disable S3 unit tests for Python 3.8 (boto3 requires Python 3.9+)
if: ${{ startsWith(matrix.python-version, '3.8') }}
uses: actions/github-script@v7
with:
script: |
core.exportVariable('CDXT_DISABLE_S3_TESTS', '1')
- name: Set environment variables for faster unit tests (requests are mocked)
uses: actions/github-script@v7
with:
script: |
core.exportVariable('CDXT_MAX_ERRORS', '2')
core.exportVariable('CDXT_WARNING_AFTER_N_ERRORS', '2')
core.exportVariable('CDXT_DEFAULT_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_CC_INDEX_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_CC_DATA_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('CDXT_IA_MIN_RETRY_INTERVAL', '0.01')
core.exportVariable('DISABLE_ATHENA_TESTS', '1')
core.exportVariable('LOGLEVEL', 'DEBUG')

- name: Lint code
run: |
make lint
Expand Down
53 changes: 9 additions & 44 deletions cdx_toolkit/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
import os

import cdx_toolkit
from cdx_toolkit.commoncrawl import normalize_crawl

from cdx_toolkit.utils import get_version, setup


LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -151,49 +153,6 @@ def set_loglevel(cmd):
LOGGER.info('set loglevel to %s', str(loglevel))


def get_version():
    """Return the ``__version__`` attribute of the ``cdx_toolkit`` package."""
    version = cdx_toolkit.__version__
    return version


def setup(cmd):
kwargs = {}
kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None
if kwargs['source'] is None:
raise ValueError('must specify --cc, --ia, or a --source')
if cmd.wb:
kwargs['wb'] = cmd.wb
if cmd.cc_mirror:
kwargs['cc_mirror'] = cmd.cc_mirror
if cmd.crawl:
kwargs['crawl'] = normalize_crawl([cmd.crawl]) # currently a string, not a list
if getattr(cmd, 'warc_download_prefix', None) is not None:
kwargs['warc_download_prefix'] = cmd.warc_download_prefix

cdx = cdx_toolkit.CDXFetcher(**kwargs)

kwargs = {}
if cmd.limit:
kwargs['limit'] = cmd.limit
if 'from' in vars(cmd) and vars(cmd)['from']: # python, uh, from is a reserved word
kwargs['from_ts'] = vars(cmd)['from']
if cmd.to:
kwargs['to'] = cmd.to
if cmd.closest:
if not cmd.get: # pragma: no cover
LOGGER.info('note: --closest works best with --get')
kwargs['closest'] = cmd.closest
if cmd.filter:
kwargs['filter'] = cmd.filter

if cmd.cmd == 'warc' and cmd.size:
kwargs['size'] = cmd.size

if cmd.cmd == 'size' and cmd.details:
kwargs['details'] = cmd.details

return cdx, kwargs


def winnow_fields(cmd, fields, obj):
if cmd.all_fields:
printme = obj
Expand Down Expand Up @@ -275,9 +234,15 @@ def warcer(cmd, cmdline):
LOGGER.warning('revisit record being resolved for url %s %s', url, timestamp)
writer.write_record(record)

writer.close()


def sizer(cmd, cmdline):
    """Print the size estimate the fetcher reports for ``cmd.url``.

    The fetcher and the query keyword arguments both come from ``setup(cmd)``;
    ``cmdline`` is accepted but not used here.
    """
    fetcher, query_args = setup(cmd)
    print(fetcher.get_size_estimate(cmd.url, **query_args))


if __name__ == "__main__":
main()
12 changes: 9 additions & 3 deletions cdx_toolkit/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import json
import logging

from cdx_toolkit.settings import get_mock_time

from .myrequests import myrequests_get
from .timeutils import (
time_to_timestamp,
Expand Down Expand Up @@ -128,9 +130,13 @@ def apply_cc_defaults(params, crawl_present=False, now=None):
LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts'])
else:
if not now:
# now is passed in by tests. if not set, use actual now.
# XXX could be changed to mock
now = time.time()
# Check for test/override time first
mock_time = get_mock_time()
if mock_time:
now = mock_time
else:
# now is passed in by tests. if not set, use actual now.
now = time.time()
params['from_ts'] = time_to_timestamp(now - year)
LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts'])
else:
Expand Down
27 changes: 21 additions & 6 deletions cdx_toolkit/myrequests.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
from typing import Optional
import requests
import logging
import time
from urllib.parse import urlparse

from . import __version__
from .settings import (
DEFAULT_MIN_RETRY_INTERVAL,
CC_DATA_MIN_RETRY_INTERVAL,
CC_INDEX_MIN_RETRY_INTERVAL,
IA_MIN_RETRY_INTERVAL,
MAX_ERRORS,
WARNING_AFTER_N_ERRORS,
)

LOGGER = logging.getLogger(__name__)

Expand All @@ -23,19 +32,19 @@ def dns_fatal(hostname):
# Per-host request pacing state, keyed by hostname.
# Each entry holds 'next_fetch' (initialised to 0) and 'minimum_interval',
# whose value comes from the corresponding settings constant.
# NOTE(review): the 'default' entry is presumably the fallback for hostnames
# not listed here -- confirm against get_retries().
retry_info = {
    'default': {
        'next_fetch': 0,
        'minimum_interval': DEFAULT_MIN_RETRY_INTERVAL,
    },
    'index.commoncrawl.org': {
        'next_fetch': 0,
        'minimum_interval': CC_INDEX_MIN_RETRY_INTERVAL,
    },
    'data.commoncrawl.org': {
        'next_fetch': 0,
        'minimum_interval': CC_DATA_MIN_RETRY_INTERVAL,
    },
    'web.archive.org': {
        'next_fetch': 0,
        'minimum_interval': IA_MIN_RETRY_INTERVAL,
    },
}

Expand All @@ -60,12 +69,18 @@ def myrequests_get(
headers=None,
cdx=False,
allow404=False,
raise_error_after_n_errors: int = 100,
raise_warning_after_n_errors: int = 10,
raise_error_after_n_errors: Optional[int] = None,
raise_warning_after_n_errors: Optional[int] = None,
retry_max_sec: int = 60,
):
t = time.time()

if raise_error_after_n_errors is None:
raise_error_after_n_errors = MAX_ERRORS

if raise_warning_after_n_errors is None:
raise_warning_after_n_errors = WARNING_AFTER_N_ERRORS

hostname = urlparse(url).hostname
next_fetch, minimum_interval = get_retries(hostname)

Expand Down
15 changes: 15 additions & 0 deletions cdx_toolkit/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import os

# Error-count thresholds. Each can be overridden through the named
# environment variable; the value is read once, when this module is imported.
MAX_ERRORS = int(os.getenv('CDXT_MAX_ERRORS', '100'))
WARNING_AFTER_N_ERRORS = int(os.getenv('CDXT_WARNING_AFTER_N_ERRORS', '10'))

# Minimum retry intervals, likewise overridable via environment variables.
DEFAULT_MIN_RETRY_INTERVAL = float(os.getenv('CDXT_DEFAULT_MIN_RETRY_INTERVAL', '3.0'))
CC_INDEX_MIN_RETRY_INTERVAL = float(os.getenv('CDXT_CC_INDEX_MIN_RETRY_INTERVAL', '1.0'))
CC_DATA_MIN_RETRY_INTERVAL = float(os.getenv('CDXT_CC_DATA_MIN_RETRY_INTERVAL', '0.55'))
IA_MIN_RETRY_INTERVAL = float(os.getenv('CDXT_IA_MIN_RETRY_INTERVAL', '6.0'))


def get_mock_time():
    """Return ``CDXT_MOCK_TIME`` as a float, or ``None`` if unset or empty.

    The environment is consulted on every call, so changes made after import
    are picked up.
    """
    raw_value = os.environ.get('CDXT_MOCK_TIME')
    if not raw_value:
        return None
    return float(raw_value)
49 changes: 49 additions & 0 deletions cdx_toolkit/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import cdx_toolkit
from cdx_toolkit.commoncrawl import normalize_crawl

import logging

LOGGER = logging.getLogger(__name__)


def get_version():
    """Return the ``__version__`` attribute of the ``cdx_toolkit`` package."""
    version = cdx_toolkit.__version__
    return version


def setup(cmd):
kwargs = {}
kwargs['source'] = 'cc' if cmd.crawl else cmd.cc or cmd.ia or cmd.source or None
if kwargs['source'] is None:
raise ValueError('must specify --cc, --ia, or a --source')
if cmd.wb:
kwargs['wb'] = cmd.wb
if cmd.cc_mirror:
kwargs['cc_mirror'] = cmd.cc_mirror
if cmd.crawl:
kwargs['crawl'] = normalize_crawl([cmd.crawl]) # currently a string, not a list
if getattr(cmd, 'warc_download_prefix', None) is not None:
kwargs['warc_download_prefix'] = cmd.warc_download_prefix

cdx = cdx_toolkit.CDXFetcher(**kwargs)

kwargs = {}
if cmd.limit:
kwargs['limit'] = cmd.limit
if 'from' in vars(cmd) and vars(cmd)['from']: # python, uh, from is a reserved word
kwargs['from_ts'] = vars(cmd)['from']
if cmd.to:
kwargs['to'] = cmd.to
if cmd.closest:
if not cmd.get: # pragma: no cover
LOGGER.info('note: --closest works best with --get')
kwargs['closest'] = cmd.closest
if cmd.filter:
kwargs['filter'] = cmd.filter

if cmd.cmd == 'warc' and cmd.size:
kwargs['size'] = cmd.size

if cmd.cmd == 'size' and cmd.details:
kwargs['details'] = cmd.details

return cdx, kwargs
Loading