diff --git a/scrapy_splash/__init__.py b/scrapy_splash/__init__.py index cbdc7cf..d16777d 100644 --- a/scrapy_splash/__init__.py +++ b/scrapy_splash/__init__.py @@ -9,5 +9,7 @@ ) from .dupefilter import SplashAwareDupeFilter, splash_request_fingerprint from .cache import SplashAwareFSCacheStorage -from .response import SplashResponse, SplashTextResponse, SplashJsonResponse +from .response import ( + SplashResponse, SplashTextResponse, SplashHtmlResponse, SplashJsonResponse, +) from .request import SplashRequest, SplashFormRequest diff --git a/scrapy_splash/middleware.py b/scrapy_splash/middleware.py index 24ab23a..5ec2c04 100644 --- a/scrapy_splash/middleware.py +++ b/scrapy_splash/middleware.py @@ -14,6 +14,7 @@ from scrapy.exceptions import NotConfigured from scrapy.http.headers import Headers from scrapy.http.response.text import TextResponse +from scrapy.http.response.html import HtmlResponse from scrapy import signals from scrapy_splash.responsetypes import responsetypes @@ -397,19 +398,24 @@ def process_response(self, request, response, spider): return response def _change_response_class(self, request, response): - from scrapy_splash import SplashResponse, SplashTextResponse - if not isinstance(response, (SplashResponse, SplashTextResponse)): + from scrapy_splash import SplashResponse + from scrapy_splash.response import splash_scrapy_text_responses + splash_text_response_types = tuple(x for x, y in splash_scrapy_text_responses) + + if not isinstance(response, (SplashResponse, splash_text_response_types)): # create a custom Response subclass based on response Content-Type # XXX: usually request is assigned to response only when all # downloader middlewares are executed. Here it is set earlier. # Does it have any negative consequences? respcls = responsetypes.from_args(headers=response.headers) - if isinstance(response, TextResponse) and respcls is SplashResponse: + for splash_cls, scrapy_cls in splash_scrapy_text_responses: # Even if the headers say it's binary, it has already - # been detected as a text response by scrapy (for example + # been detected as a text/html response by scrapy (for example # because it was decoded successfully), so we should not # convert it to SplashResponse. - respcls = SplashTextResponse + if isinstance(response, scrapy_cls) and respcls is SplashResponse: + respcls = splash_cls + break response = response.replace(cls=respcls, request=request) return response diff --git a/scrapy_splash/response.py b/scrapy_splash/response.py index e5250c2..4871b4c 100644 --- a/scrapy_splash/response.py +++ b/scrapy_splash/response.py @@ -5,7 +5,7 @@ import base64 import re -from scrapy.http import Response, TextResponse +from scrapy.http import Response, TextResponse, HtmlResponse from scrapy import Selector from scrapy_splash.utils import headers_to_scrapy @@ -87,6 +87,15 @@ def replace(self, *args, **kwargs): return _SplashResponseMixin.replace(self, *args, **kwargs) +class SplashHtmlResponse(SplashTextResponse, HtmlResponse): + """ + This HtmlResponse subclass sets response.url to the URL of a remote website + instead of an URL of Splash server. "Real" response URL is still available + as ``response.real_url``. + """ + pass + + class SplashJsonResponse(SplashResponse): """ Splash Response with JSON data. It provides a convenient way to access @@ -185,3 +194,9 @@ def _load_from_json(self): # response.headers if 'headers' in self.data: self.headers = headers_to_scrapy(self.data['headers']) + + +splash_scrapy_text_responses = ( + (SplashTextResponse, TextResponse), + (SplashHtmlResponse, HtmlResponse), +) diff --git a/scrapy_splash/responsetypes.py b/scrapy_splash/responsetypes.py index 04e9264..71fb242 100644 --- a/scrapy_splash/responsetypes.py +++ b/scrapy_splash/responsetypes.py @@ -9,12 +9,12 @@ class SplashResponseTypes(ResponseTypes): CLASSES = { - 'text/html': 'scrapy_splash.response.SplashTextResponse', + 'text/html': 'scrapy_splash.response.SplashHtmlResponse', 'application/atom+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rdf+xml': 'scrapy_splash.response.SplashTextResponse', 'application/rss+xml': 'scrapy_splash.response.SplashTextResponse', - 'application/xhtml+xml': 'scrapy_splash.response.SplashTextResponse', - 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashTextResponse', + 'application/xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse', + 'application/vnd.wap.xhtml+xml': 'scrapy_splash.response.SplashHtmlResponse', 'application/xml': 'scrapy_splash.response.SplashTextResponse', 'application/json': 'scrapy_splash.response.SplashJsonResponse', 'application/x-json': 'scrapy_splash.response.SplashJsonResponse', diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 66b79ce..66ab83d 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -3,11 +3,12 @@ import copy import json import base64 +import pytest import scrapy from scrapy.core.engine import ExecutionEngine from scrapy.utils.test import get_crawler -from scrapy.http import Response, TextResponse +from scrapy.http import Response, TextResponse, HtmlResponse from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware import scrapy_splash @@ -18,6 +19,13 @@ SlotPolicy, SplashCookiesMiddleware, SplashDeduplicateArgsMiddleware, + SplashHtmlResponse, + SplashTextResponse, +) + +splash_scrapy_content_types = ( + (SplashTextResponse, TextResponse, b'text/*'), + (SplashHtmlResponse, HtmlResponse, b'text/html'), ) @@ -60,7 +68,11 @@ def test_nosplash(): assert response3.url == "http://example.com" -def test_splash_request(): +@pytest.mark.parametrize( + 'splash_response_type, scrapy_response_type, content_type', + splash_scrapy_content_types, +) +def test_splash_request(splash_response_type, scrapy_response_type, content_type): mw = _get_mw() cookie_mw = _get_cookie_mw() @@ -82,25 +94,26 @@ def test_splash_request(): assert json.loads(to_native_str(req2.body)) == expected_body # check response post-processing - response = TextResponse("http://127.0.0.1:8050/render.html", + response = scrapy_response_type( + url="http://127.0.0.1:8050/render.html", # Scrapy doesn't pass request to constructor # request=req2, - headers={b'Content-Type': b'text/html'}, - body=b"Hello") + headers={b'Content-Type': content_type}, + body=b"Hello", ) response2 = mw.process_response(req2, response, None) response2 = cookie_mw.process_response(req2, response2, None) - assert isinstance(response2, scrapy_splash.SplashTextResponse) + assert isinstance(response2, splash_response_type) assert response2 is not response assert response2.real_url == req2.url assert response2.url == req.url assert response2.body == b"Hello" assert response2.css("body").extract_first() == "Hello" - assert response2.headers == {b'Content-Type': [b'text/html']} + assert response2.headers == {b'Content-Type': [content_type, ]} # check .replace method response3 = response2.replace(status=404) assert response3.status == 404 - assert isinstance(response3, scrapy_splash.SplashTextResponse) + assert isinstance(response3, splash_response_type) for attr in ['url', 'real_url', 'headers', 'body']: assert getattr(response3, attr) == getattr(response2, attr) diff --git a/tests/test_response.py b/tests/test_response.py new file mode 100644 index 0000000..8756e17 --- /dev/null +++ b/tests/test_response.py @@ -0,0 +1,10 @@ +from scrapy.http import HtmlResponse, TextResponse, Response +from scrapy_splash.response import ( + SplashTextResponse, SplashHtmlResponse, SplashResponse, +) + + +def test_response_types(): + assert issubclass(SplashResponse, Response) + assert issubclass(SplashTextResponse, TextResponse) + assert issubclass(SplashHtmlResponse, HtmlResponse)