Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion scrapy_deltafetch/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,15 @@
logger = logging.getLogger(__name__)


class DeltaFetchPseudoItem(dict):
    """Marker item that flags a page as processed without emitting an item.

    A pseudo item is to be used when:

    - no actual item shall be generated from a page, and
    - the page shall be skipped in future runs.

    ``DeltaFetch.process_spider_output`` recognises instances of this class
    with ``isinstance`` and does not yield them any further. It also reads
    an optional ``'reason'`` key (falling back to ``'pseudo_item'``) and
    uses it as the suffix of the ``deltafetch/stored/<reason>`` stats
    counter.

    The class adds nothing to ``dict``; it exists purely as a type marker,
    so it can be built like any dict, e.g.
    ``DeltaFetchPseudoItem(reason='skip')``.
    """


class DeltaFetch(object):
"""
This is a spider middleware to ignore requests to pages containing items
Expand Down Expand Up @@ -86,7 +95,11 @@ def process_spider_output(self, response, result, spider):
self.db[key] = str(time.time())
if self.stats:
self.stats.inc_value('deltafetch/stored', spider=spider)
yield r
if isinstance(r, DeltaFetchPseudoItem):
reason = r.get('reason', 'pseudo_item')
self.stats.inc_value('deltafetch/stored/%s' % reason, spider=spider)
if not isinstance(r, DeltaFetchPseudoItem):
yield r

def _get_key(self, request):
key = request.meta.get('deltafetch_key') or request_fingerprint(request)
Expand Down
18 changes: 17 additions & 1 deletion tests/test_deltafetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from scrapy.statscollectors import StatsCollector
from scrapy.utils.test import get_crawler

from scrapy_deltafetch.middleware import DeltaFetch
from scrapy_deltafetch.middleware import DeltaFetch, DeltaFetchPseudoItem


dbmodule = None
Expand Down Expand Up @@ -201,6 +201,22 @@ def test_process_spider_output(self):
b'test_key_2']))
assert mw.db[b'key']

def test_process_spider_output_pseudo_item(self):
    """A pseudo item is swallowed by the middleware, yet its key is stored."""
    self._create_test_db()
    middleware = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
    middleware.spider_opened(self.spider)

    # The response only needs to carry the request whose key gets recorded.
    request = Request('http://url', meta={'deltafetch_key': 'key'})
    response = mock.Mock()
    response.request = request

    pseudo_items = [DeltaFetchPseudoItem(reason='skip')]
    emitted = list(
        middleware.process_spider_output(response, pseudo_items, self.spider)
    )

    # Nothing is passed on to the rest of the pipeline ...
    self.assertEqual(emitted, [])

    # ... but the request key now sits in the db next to the entries that
    # were already there (presumably seeded by _create_test_db -- confirm).
    expected_keys = {b'key', b'test_key_1', b'test_key_2'}
    self.assertEqual(set(middleware.db.keys()), expected_keys)
    assert middleware.db[b'key']

def test_process_spider_output_dict(self):
self._create_test_db()
mw = self.mwcls(self.temp_dir, reset=False, stats=self.stats)
Expand Down