Reference (Gold): scrapy
Pytest summary for the `tests` suite
| status | count |
| --- | --- |
| passed | 2858 |
| failed | 33 |
| skipped | 371 |
| xfailed | 21 |
| error | 13 |
| total | 3296 |
| collected | 3296 |
Failed pytests:
test_commands.py::GenspiderCommandTest::test_template
self =
tplname = 'crawl'

    def test_template(self, tplname="crawl"):
        args = [f"--template={tplname}"] if tplname else []
        spname = "test_spider"
        spmodule = f"{self.project_name}.spiders.{spname}"
        p, out, err = self.proc("genspider", spname, "test.com", *args)
        self.assertIn(
            f"Created spider {spname!r} using template {tplname!r} in module:{os.linesep} {spmodule}",
            out,
        )
        self.assertTrue(Path(self.proj_mod_path, "spiders", "test_spider.py").exists())
        modify_time_before = (
            Path(self.proj_mod_path, "spiders", "test_spider.py").stat().st_mtime
        )
        p, out, err = self.proc("genspider", spname, "test.com", *args)
        self.assertIn(f"Spider {spname!r} already exists in module", out)
        modify_time_after = (
            Path(self.proj_mod_path, "spiders", "test_spider.py").stat().st_mtime
        )
>       self.assertEqual(modify_time_after, modify_time_before)

/testbed/tests/test_commands.py:472:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/.venv/lib/python3.12/site-packages/twisted/trial/_synctest.py:444: in assertEqual
    super().assertEqual(first, second, msg)
E   twisted.trial.unittest.FailTest: 1727385204.784021 != 1727385204.785175
test_commands.py::GenspiderCommandTest::test_template_basic
self =

    def test_template_basic(self):
>       self.test_template("basic")

/testbed/tests/test_commands.py:475:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/tests/test_commands.py:472: in test_template
    self.assertEqual(modify_time_after, modify_time_before)
/testbed/.venv/lib/python3.12/site-packages/twisted/trial/_synctest.py:444: in assertEqual
    super().assertEqual(first, second, msg)
E   twisted.trial.unittest.FailTest: 1727385206.4120243 != 1727385206.4133556
test_commands.py::GenspiderCommandTest::test_template_csvfeed
self =

    def test_template_csvfeed(self):
>       self.test_template("csvfeed")

/testbed/tests/test_commands.py:478:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/tests/test_commands.py:472: in test_template
    self.assertEqual(modify_time_after, modify_time_before)
/testbed/.venv/lib/python3.12/site-packages/twisted/trial/_synctest.py:444: in assertEqual
    super().assertEqual(first, second, msg)
E   twisted.trial.unittest.FailTest: 1727385208.0210276 != 1727385208.0219028
test_commands.py::GenspiderCommandTest::test_template_xmlfeed
self =

    def test_template_xmlfeed(self):
>       self.test_template("xmlfeed")

/testbed/tests/test_commands.py:481:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/tests/test_commands.py:472: in test_template
    self.assertEqual(modify_time_after, modify_time_before)
/testbed/.venv/lib/python3.12/site-packages/twisted/trial/_synctest.py:444: in assertEqual
    super().assertEqual(first, second, msg)
E   twisted.trial.unittest.FailTest: 1727385210.770033 != 1727385210.7708642
test_commands.py::GenspiderStandaloneCommandTest::test_same_name_as_existing_file
self =
force = False

    def test_same_name_as_existing_file(self, force=False):
        file_name = "example"
        file_path = Path(self.temp_path, file_name + ".py")
        p, out, err = self.proc("genspider", file_name, "example.com")
        self.assertIn(f"Created spider {file_name!r} using template 'basic' ", out)
        assert file_path.exists()
        modify_time_before = file_path.stat().st_mtime
        file_contents_before = file_path.read_text(encoding="utf-8")

        if force:
            # use different template to ensure contents were changed
            p, out, err = self.proc(
                "genspider", "--force", "-t", "crawl", file_name, "example.com"
            )
            self.assertIn(f"Created spider {file_name!r} using template 'crawl' ", out)
            modify_time_after = file_path.stat().st_mtime
            self.assertNotEqual(modify_time_after, modify_time_before)
            file_contents_after = file_path.read_text(encoding="utf-8")
            self.assertNotEqual(file_contents_after, file_contents_before)
        else:
            p, out, err = self.proc("genspider", file_name, "example.com")
            self.assertIn(
                f"{Path(self.temp_path, file_name + '.py').resolve()} already exists",
                out,
            )
            modify_time_after = file_path.stat().st_mtime
>           self.assertEqual(modify_time_after, modify_time_before)

/testbed/tests/test_commands.py:641:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/.venv/lib/python3.12/site-packages/twisted/trial/_synctest.py:444: in assertEqual
    super().assertEqual(first, second, msg)
E   twisted.trial.unittest.FailTest: 1727385214.8590412 != 1727385214.8595073
test_crawl.py::CrawlSpiderTestCase::test_response_ssl_certificate_empty_response
self =

    @mark.xfail(reason="Responses with no body return early and contain no certificate")
    @defer.inlineCallbacks
    def test_response_ssl_certificate_empty_response(self):
        crawler = get_crawler(SingleRequestSpider)
        url = self.mockserver.url("/status?n=200", is_secure=True)
        yield crawler.crawl(seed=url, mockserver=self.mockserver)
        cert = crawler.spider.meta["responses"][0].certificate
>       self.assertIsInstance(cert, Certificate)

/testbed/tests/test_crawl.py:629:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/.venv/lib/python3.12/site-packages/twisted/trial/_synctest.py:666: in assertIsInstance
    self.fail(f"{instance!r} is not an instance of {classOrTuple}{suffix}")
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self =
msg = "None is not an instance of "

    def fail(self, msg: Optional[object] = None) -> NoReturn:
        """
        Absolutely fail the test. Do not pass go, do not collect $200.

        @param msg: the message that will be displayed as the reason for the failure
        """
>       raise self.failureException(msg)
E       twisted.trial.unittest.FailTest: None is not an instance of

/testbed/.venv/lib/python3.12/site-packages/twisted/trial/_synctest.py:381: FailTest
test_downloader_handlers.py::HttpTestCase::test_download_head
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::HttpTestCase::test_get_duplicate_header
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::HttpTestCase::test_host_header_not_in_request_headers
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::HttpTestCase::test_payload
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::HttpTestCase::test_response_header_content_length
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https10TestCase::test_content_length_zero_bodyless_post_request_headers
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https10TestCase::test_payload
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https10TestCase::test_protocol
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https10TestCase::test_redirect_status
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https10TestCase::test_redirect_status_head
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https10TestCase::test_response_class_from_body
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https10TestCase::test_response_header_content_length
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11TestCase::test_content_length_zero_bodyless_post_only_one
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11TestCase::test_download_head
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11TestCase::test_download_with_maxsize
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11TestCase::test_download_with_maxsize_per_req
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11TestCase::test_download_with_small_maxsize_per_spider
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11TestCase::test_host_header_not_in_request_headers
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11TestCase::test_response_class_choosing_request
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11TestCase::test_response_header_content_length
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https11WrongHostnameTestCase::test_download_broken_chunked_content_cause_data_loss
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Https11InvalidDNSPattern::test_timeout_download_from_spider_nodata_rcvd
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::Http11ProxyTestCase::test_download_with_proxy_without_http_scheme
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers.py::BaseFTPTestCase::test_ftp_download_nonexistent
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers.py::BaseFTPTestCase::test_ftp_download_success
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers.py::BaseFTPTestCase::test_ftp_local_filename
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers.py::BaseFTPTestCase::test_response_class_from_body
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers.py::FTPTestCase::test_ftp_download_path_with_spaces
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers.py::FTPTestCase::test_ftp_download_success
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers.py::FTPTestCase::test_response_class_from_url
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers.py::AnonymousFTPTestCase::test_ftp_download_nonexistent
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers.py::AnonymousFTPTestCase::test_response_class_from_url
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_downloader_handlers_http2.py::Http11ProxyTestCase::test_download_with_proxy
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers_http2.py::Http11ProxyTestCase::test_download_without_proxy
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers_http2.py::Https11TestCase::test_download_broken_chunked_content_cause_data_loss
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloader_handlers_http2.py::Https11TestCase::test_timeout_download_from_spider_nodata_rcvd
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. Selectables:
test_downloadermiddleware_cookies.py::CookiesMiddlewareTest::test_keep_cookie_from_default_request_headers_middleware
self =

    @pytest.mark.xfail(reason="Cookie header is not currently being processed")
    def test_keep_cookie_from_default_request_headers_middleware(self):
        DEFAULT_REQUEST_HEADERS = dict(Cookie="default=value; asdf=qwerty")
        mw_default_headers = DefaultHeadersMiddleware(DEFAULT_REQUEST_HEADERS.items())
        # overwrite with values from 'cookies' request argument
        req1 = Request("http://example.org", cookies={"default": "something"})
        assert mw_default_headers.process_request(req1, self.spider) is None
        assert self.mw.process_request(req1, self.spider) is None
>       self.assertCookieValEqual(
            req1.headers["Cookie"], b"default=something; asdf=qwerty"
        )

/testbed/tests/test_downloadermiddleware_cookies.py:329:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/tests/test_downloadermiddleware_cookies.py:59: in assertCookieValEqual
    return self.assertEqual(split_cookies(first), split_cookies(second), msg=msg)
E   AssertionError: Lists differ: [b'default=something'] != [b'asdf=qwerty', b'default=something']
E
E   First differing element 0:
E   b'default=something'
E   b'asdf=qwerty'
E
E   Second list contains 1 additional elements.
E   First extra element 1:
E   b'default=something'
E
E   - [b'default=something']
E   + [b'asdf=qwerty', b'default=something']
test_downloadermiddleware_cookies.py::CookiesMiddlewareTest::test_keep_cookie_header
self =

    @pytest.mark.xfail(reason="Cookie header is not currently being processed")
    def test_keep_cookie_header(self):
        # keep only cookies from 'Cookie' request header
        req1 = Request("http://scrapytest.org", headers={"Cookie": "a=b; c=d"})
        assert self.mw.process_request(req1, self.spider) is None
>       self.assertCookieValEqual(req1.headers["Cookie"], "a=b; c=d")

/testbed/tests/test_downloadermiddleware_cookies.py:345:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/scrapy/http/headers.py:49: in __getitem__
    return super().__getitem__(key)[-1]
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = {}, key = 'Cookie'

    def __getitem__(self, key):
>       return dict.__getitem__(self, self.normkey(key))
E       KeyError: b'Cookie'

/testbed/scrapy/utils/datatypes.py:41: KeyError
test_downloadermiddleware_cookies.py::CookiesMiddlewareTest::test_request_headers_cookie_encoding
self =

    @pytest.mark.xfail(reason="Cookie header is not currently being processed")
    def test_request_headers_cookie_encoding(self):
        # 1) UTF8-encoded bytes
        req1 = Request("http://example.org", headers={"Cookie": "a=á".encode("utf8")})
        assert self.mw.process_request(req1, self.spider) is None
>       self.assertCookieValEqual(req1.headers["Cookie"], b"a=\xc3\xa1")

/testbed/tests/test_downloadermiddleware_cookies.py:382:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/scrapy/http/headers.py:49: in __getitem__
    return super().__getitem__(key)[-1]
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = {}, key = 'Cookie'

    def __getitem__(self, key):
>       return dict.__getitem__(self, self.normkey(key))
E       KeyError: b'Cookie'

/testbed/scrapy/utils/datatypes.py:41: KeyError
test_engine.py::EngineTest::test_crawler_dupefilter
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_engine_stop_download_bytes.py::EngineTest::test_crawler_change_close_reason_on_idle
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_engine_stop_download_bytes.py::EngineTest::test_crawler_itemerror
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_engine_stop_download_bytes.py::BytesReceivedEngineTest::test_crawler_itemerror
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
test_engine_stop_download_headers.py::EngineTest::test_crawler_change_close_reason_on_idle
'NoneType' object is not iterable During handling of the above exception, another exception occurred: NOTE: Incompatible Exception Representation, displaying natively: twisted.trial.util.DirtyReactorAggregateError: Reactor was unclean. DelayedCalls: (set twisted.internet.base.DelayedCall.debug = True to debug)
__init__.py::BaseSettingsTest::test_update_iterable
self =

    @pytest.mark.xfail(
        raises=AttributeError,
        reason="BaseSettings.update doesn't support iterable input",
    )
    def test_update_iterable(self):
        settings = BaseSettings({"key": 0})
>       settings.update([("key", 1)])

/testbed/tests/test_settings/__init__.py:217:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self =
values = [('key', 1)], priority = 'project'

    def update(self, values: _SettingsInputT, priority: Union[int, str] = "project") -> None:  # type: ignore[override]
        """
        Store key/value pairs with a given priority.

        This is a helper function that calls
        :meth:`~scrapy.settings.BaseSettings.set` for every item of ``values``
        with the provided ``priority``.

        If ``values`` is a string, it is assumed to be JSON-encoded and parsed
        into a dict with ``json.loads()`` first. If it is a
        :class:`~scrapy.settings.BaseSettings` instance, the per-key priorities
        will be used and the ``priority`` parameter ignored. This allows
        inserting/updating settings with different priorities with a single
        command.

        :param values: the settings names and values
        :type values: dict or string or :class:`~scrapy.settings.BaseSettings`

        :param priority: the priority of the settings. Should be a key of
            :attr:`~scrapy.settings.SETTINGS_PRIORITIES` or an integer
        :type priority: str or int
        """
        self._assert_mutability()
        if isinstance(values, str):
            values = cast(dict, json.loads(values))
        if values is not None:
            if isinstance(values, BaseSettings):
                for name, value in values.items():
                    self.set(name, value, cast(int, values.getpriority(name)))
            else:
>               for name, value in values.items():
E               AttributeError: 'list' object has no attribute 'items'

/testbed/scrapy/settings/__init__.py:421: AttributeError
__init__.py::BaseSettingsTest::test_update_kwargs
self =

    @pytest.mark.xfail(
        raises=TypeError, reason="BaseSettings.update doesn't support kwargs input"
    )
    def test_update_kwargs(self):
        settings = BaseSettings({"key": 0})
>       settings.update(key=1)  # pylint: disable=unexpected-keyword-arg
E       TypeError: BaseSettings.update() got an unexpected keyword argument 'key'

/testbed/tests/test_settings/__init__.py:209: TypeError
test_squeues.py::MarshalFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::ChunkSize1MarshalFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::ChunkSize2MarshalFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::ChunkSize3MarshalFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::ChunkSize4MarshalFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::PickleFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::ChunkSize1PickleFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::ChunkSize2PickleFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::ChunkSize3PickleFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::ChunkSize4PickleFifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::MarshalLifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_squeues.py::PickleLifoDiskQueueTest::test_non_bytes_raises_typeerror
self =

    @pytest.mark.xfail(
        reason="Reenable once Scrapy.squeues stops extending from this testsuite"
    )
    def test_non_bytes_raises_typeerror(self):
        q = self.queue()
>       self.assertRaises(TypeError, q.push, 0)
E       AssertionError: TypeError not raised by push

/testbed/.venv/lib/python3.12/site-packages/queuelib/tests/test_queue.py:223: AssertionError
test_utils_defer.py::AsyncDefTestsuiteTest::test_deferred_f_from_coro_f_xfail
self =

    @mark.xfail(reason="Checks that the test is actually executed", strict=True)
    @deferred_f_from_coro_f
    async def test_deferred_f_from_coro_f_xfail(self):
>       raise Exception("This is expected to be raised")
E       Exception: This is expected to be raised

/testbed/tests/test_utils_defer.py:171: Exception
test_utils_request.py::RequestFingerprintTest::test_part_separation
self =

    @pytest.mark.xfail(reason="known bug kept for backward compatibility", strict=True)
    def test_part_separation(self):
>       super().test_part_separation()

/testbed/tests/test_utils_request.py:325:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/tests/test_utils_request.py:224: in test_part_separation
    self.assertNotEqual(fp1, fp2)
E   AssertionError: '4e38b5ad81c4739738db8a4e3573c22aba5c5c28' == '4e38b5ad81c4739738db8a4e3573c22aba5c5c28'
test_utils_request.py::RequestFingerprintAsBytesTest::test_part_separation
self =

    @pytest.mark.xfail(reason="known bug kept for backward compatibility", strict=True)
    def test_part_separation(self):
>       super().test_part_separation()

/testbed/tests/test_utils_request.py:361:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/testbed/tests/test_utils_request.py:224: in test_part_separation
    self.assertNotEqual(fp1, fp2)
E   AssertionError: b'N8\xb5\xad\x81\xc4s\x978\xdb\x8aN5s\xc2*\xba\\\\(' == b'N8\xb5\xad\x81\xc4s\x978\xdb\x8aN5s\xc2*\xba\\\\('
Patch diff
diff --git a/scrapy/addons.py b/scrapy/addons.py
index b20d143a9..9060d4f3f 100644
--- a/scrapy/addons.py
+++ b/scrapy/addons.py
@@ -1,28 +1,53 @@
import logging
from typing import TYPE_CHECKING, Any, List
+
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
from scrapy.utils.conf import build_component_list
from scrapy.utils.misc import create_instance, load_object
+
if TYPE_CHECKING:
from scrapy.crawler import Crawler
+
logger = logging.getLogger(__name__)
class AddonManager:
"""This class facilitates loading and storing :ref:`topics-addons`."""
- def __init__(self, crawler: 'Crawler') ->None:
- self.crawler: 'Crawler' = crawler
+ def __init__(self, crawler: "Crawler") -> None:
+ self.crawler: "Crawler" = crawler
self.addons: List[Any] = []
- def load_settings(self, settings: Settings) ->None:
+ def load_settings(self, settings: Settings) -> None:
"""Load add-ons and configurations from a settings object and apply them.
This will load the add-on for every add-on path in the
``ADDONS`` setting and execute their ``update_settings`` methods.
- :param settings: The :class:`~scrapy.settings.Settings` object from which to read the add-on configuration
+ :param settings: The :class:`~scrapy.settings.Settings` object from \
+ which to read the add-on configuration
:type settings: :class:`~scrapy.settings.Settings`
"""
- pass
+ for clspath in build_component_list(settings["ADDONS"]):
+ try:
+ addoncls = load_object(clspath)
+ addon = create_instance(
+ addoncls, settings=settings, crawler=self.crawler
+ )
+ addon.update_settings(settings)
+ self.addons.append(addon)
+ except NotConfigured as e:
+ if e.args:
+ logger.warning(
+ "Disabled %(clspath)s: %(eargs)s",
+ {"clspath": clspath, "eargs": e.args[0]},
+ extra={"crawler": self.crawler},
+ )
+ logger.info(
+ "Enabled addons:\n%(addons)s",
+ {
+ "addons": self.addons,
+ },
+ extra={"crawler": self.crawler},
+ )
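
For context on what `load_settings` above consumes, here is a minimal sketch of an add-on it could load. The class, the setting names, and the `"addon"` priority string are illustrative assumptions, not part of the patch; the only hooks exercised by the code above are an `update_settings(settings)` method and the option to raise `NotConfigured` to opt out.

```python
from scrapy.exceptions import NotConfigured


class MyAddon:
    """Hypothetical add-on; AddonManager only requires update_settings()."""

    def update_settings(self, settings):
        # Raising NotConfigured disables the add-on; passing a message makes
        # load_settings() log a warning instead of silently skipping it.
        if not settings.getbool("MYADDON_ENABLED", True):
            raise NotConfigured("MYADDON_ENABLED is False")
        settings.set("CONCURRENT_REQUESTS", 8, priority="addon")


# Enabled through the ADDONS setting (import path -> priority), which is what
# build_component_list(settings["ADDONS"]) iterates over:
# ADDONS = {"myproject.addons.MyAddon": 100}
```
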
diff --git a/scrapy/cmdline.py b/scrapy/cmdline.py
index bf63f266a..6580ba9ce 100644
--- a/scrapy/cmdline.py
+++ b/scrapy/cmdline.py
@@ -4,6 +4,7 @@ import inspect
import os
import sys
from importlib.metadata import entry_points
+
import scrapy
from scrapy.commands import BaseRunSpiderCommand, ScrapyCommand, ScrapyHelpFormatter
from scrapy.crawler import CrawlerProcess
@@ -14,11 +15,175 @@ from scrapy.utils.python import garbage_collect
class ScrapyArgumentParser(argparse.ArgumentParser):
- pass
+ def _parse_optional(self, arg_string):
+ # if starts with -: it means that is a parameter not a argument
+ if arg_string[:2] == "-:":
+ return None
+
+ return super()._parse_optional(arg_string)
+
+
+def _iter_command_classes(module_name):
+ # TODO: add `name` attribute to commands and merge this function with
+ # scrapy.utils.spider.iter_spider_classes
+ for module in walk_modules(module_name):
+ for obj in vars(module).values():
+ if (
+ inspect.isclass(obj)
+ and issubclass(obj, ScrapyCommand)
+ and obj.__module__ == module.__name__
+ and obj not in (ScrapyCommand, BaseRunSpiderCommand)
+ ):
+ yield obj
+
+
+def _get_commands_from_module(module, inproject):
+ d = {}
+ for cmd in _iter_command_classes(module):
+ if inproject or not cmd.requires_project:
+ cmdname = cmd.__module__.split(".")[-1]
+ d[cmdname] = cmd()
+ return d
+
+
+def _get_commands_from_entry_points(inproject, group="scrapy.commands"):
+ cmds = {}
+ if sys.version_info >= (3, 10):
+ eps = entry_points(group=group)
+ else:
+ eps = entry_points().get(group, ())
+ for entry_point in eps:
+ obj = entry_point.load()
+ if inspect.isclass(obj):
+ cmds[entry_point.name] = obj()
+ else:
+ raise Exception(f"Invalid entry point {entry_point.name}")
+ return cmds
+
+
+def _get_commands_dict(settings, inproject):
+ cmds = _get_commands_from_module("scrapy.commands", inproject)
+ cmds.update(_get_commands_from_entry_points(inproject))
+ cmds_module = settings["COMMANDS_MODULE"]
+ if cmds_module:
+ cmds.update(_get_commands_from_module(cmds_module, inproject))
+ return cmds
+
+
+def _pop_command_name(argv):
+ i = 0
+ for arg in argv[1:]:
+ if not arg.startswith("-"):
+ del argv[i]
+ return arg
+ i += 1
+
+
+def _print_header(settings, inproject):
+ version = scrapy.__version__
+ if inproject:
+ print(f"Scrapy {version} - active project: {settings['BOT_NAME']}\n")
+
+ else:
+ print(f"Scrapy {version} - no active project\n")
+
+
+def _print_commands(settings, inproject):
+ _print_header(settings, inproject)
+ print("Usage:")
+ print(" scrapy <command> [options] [args]\n")
+ print("Available commands:")
+ cmds = _get_commands_dict(settings, inproject)
+ for cmdname, cmdclass in sorted(cmds.items()):
+ print(f" {cmdname:<13} {cmdclass.short_desc()}")
+ if not inproject:
+ print()
+ print(" [ more ] More commands available when run from project directory")
+ print()
+ print('Use "scrapy <command> -h" to see more info about a command')
+
+
+def _print_unknown_command(settings, cmdname, inproject):
+ _print_header(settings, inproject)
+ print(f"Unknown command: {cmdname}\n")
+ print('Use "scrapy" to see available commands')
+
+
+def _run_print_help(parser, func, *a, **kw):
+ try:
+ func(*a, **kw)
+ except UsageError as e:
+ if str(e):
+ parser.error(str(e))
+ if e.print_help:
+ parser.print_help()
+ sys.exit(2)
+
+
+def execute(argv=None, settings=None):
+ if argv is None:
+ argv = sys.argv
+
+ if settings is None:
+ settings = get_project_settings()
+ # set EDITOR from environment if available
+ try:
+ editor = os.environ["EDITOR"]
+ except KeyError:
+ pass
+ else:
+ settings["EDITOR"] = editor
+
+ inproject = inside_project()
+ cmds = _get_commands_dict(settings, inproject)
+ cmdname = _pop_command_name(argv)
+ if not cmdname:
+ _print_commands(settings, inproject)
+ sys.exit(0)
+ elif cmdname not in cmds:
+ _print_unknown_command(settings, cmdname, inproject)
+ sys.exit(2)
+
+ cmd = cmds[cmdname]
+ parser = ScrapyArgumentParser(
+ formatter_class=ScrapyHelpFormatter,
+ usage=f"scrapy {cmdname} {cmd.syntax()}",
+ conflict_handler="resolve",
+ description=cmd.long_desc(),
+ )
+ settings.setdict(cmd.default_settings, priority="command")
+ cmd.settings = settings
+ cmd.add_options(parser)
+ opts, args = parser.parse_known_args(args=argv[1:])
+ _run_print_help(parser, cmd.process_options, args, opts)
+
+ cmd.crawler_process = CrawlerProcess(settings)
+ _run_print_help(parser, _run_command, cmd, args, opts)
+ sys.exit(cmd.exitcode)
+
+
+def _run_command(cmd, args, opts):
+ if opts.profile:
+ _run_command_profiled(cmd, args, opts)
+ else:
+ cmd.run(args, opts)
+
+
+def _run_command_profiled(cmd, args, opts):
+ if opts.profile:
+ sys.stderr.write(f"scrapy: writing cProfile stats to {opts.profile!r}\n")
+ loc = locals()
+ p = cProfile.Profile()
+ p.runctx("cmd.run(args, opts)", globals(), loc)
+ if opts.profile:
+ p.dump_stats(opts.profile)
-if __name__ == '__main__':
+if __name__ == "__main__":
try:
execute()
finally:
+ # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() on exit:
+ # http://doc.pypy.org/en/latest/cpython_differences.html
+ # ?highlight=gc.collect#differences-related-to-garbage-collection-strategies
garbage_collect()
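
The only non-obvious piece of `ScrapyArgumentParser` above is the `_parse_optional` override. As a standalone sketch (an assumed example, not taken from the patch), it lets a value token that starts with `-:` — such as the stdout feed URI in `scrapy crawl spider -o -:json` — be consumed as an option value rather than mistaken for a flag:

```python
import argparse


class ScrapyArgumentParser(argparse.ArgumentParser):
    def _parse_optional(self, arg_string):
        # tokens beginning with "-:" are values, not option flags
        if arg_string[:2] == "-:":
            return None
        return super()._parse_optional(arg_string)


parser = ScrapyArgumentParser()
parser.add_argument("-o", "--output")
opts, _ = parser.parse_known_args(["-o", "-:json"])
print(opts.output)  # "-:json" -- a stock ArgumentParser would reject this token
```
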
diff --git a/scrapy/commands/bench.py b/scrapy/commands/bench.py
index 1c049d02b..e1ccdc451 100644
--- a/scrapy/commands/bench.py
+++ b/scrapy/commands/bench.py
@@ -2,23 +2,34 @@ import subprocess
import sys
import time
from urllib.parse import urlencode
+
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.linkextractors import LinkExtractor
class Command(ScrapyCommand):
- default_settings = {'LOG_LEVEL': 'INFO', 'LOGSTATS_INTERVAL': 1,
- 'CLOSESPIDER_TIMEOUT': 10}
+ default_settings = {
+ "LOG_LEVEL": "INFO",
+ "LOGSTATS_INTERVAL": 1,
+ "CLOSESPIDER_TIMEOUT": 10,
+ }
+ def short_desc(self):
+ return "Run quick benchmark test"
-class _BenchServer:
+ def run(self, args, opts):
+ with _BenchServer():
+ self.crawler_process.crawl(_BenchSpider, total=100000)
+ self.crawler_process.start()
+
+class _BenchServer:
def __enter__(self):
from scrapy.utils.test import get_testenv
- pargs = [sys.executable, '-u', '-m', 'scrapy.utils.benchserver']
- self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE, env=
- get_testenv())
+
+ pargs = [sys.executable, "-u", "-m", "scrapy.utils.benchserver"]
+ self.proc = subprocess.Popen(pargs, stdout=subprocess.PIPE, env=get_testenv())
self.proc.stdout.readline()
def __exit__(self, exc_type, exc_value, traceback):
@@ -29,8 +40,18 @@ class _BenchServer:
class _BenchSpider(scrapy.Spider):
"""A spider that follows all links"""
- name = 'follow'
+
+ name = "follow"
total = 10000
show = 20
- baseurl = 'http://localhost:8998'
+ baseurl = "http://localhost:8998"
link_extractor = LinkExtractor()
+
+ def start_requests(self):
+ qargs = {"total": self.total, "show": self.show}
+ url = f"{self.baseurl}?{urlencode(qargs, doseq=True)}"
+ return [scrapy.Request(url, dont_filter=True)]
+
+ def parse(self, response):
+ for link in self.link_extractor.extract_links(response):
+ yield scrapy.Request(link.url, callback=self.parse)
diff --git a/scrapy/commands/check.py b/scrapy/commands/check.py
index 7d6b7e3ed..de54ca4d3 100644
--- a/scrapy/commands/check.py
+++ b/scrapy/commands/check.py
@@ -2,6 +2,7 @@ import time
from collections import defaultdict
from unittest import TextTestResult as _TextTestResult
from unittest import TextTestRunner
+
from scrapy.commands import ScrapyCommand
from scrapy.contracts import ContractsManager
from scrapy.utils.conf import build_component_list
@@ -9,9 +10,99 @@ from scrapy.utils.misc import load_object, set_environ
class TextTestResult(_TextTestResult):
- pass
+ def printSummary(self, start, stop):
+ write = self.stream.write
+ writeln = self.stream.writeln
+
+ run = self.testsRun
+ plural = "s" if run != 1 else ""
+
+ writeln(self.separator2)
+ writeln(f"Ran {run} contract{plural} in {stop - start:.3f}s")
+ writeln()
+
+ infos = []
+ if not self.wasSuccessful():
+ write("FAILED")
+ failed, errored = map(len, (self.failures, self.errors))
+ if failed:
+ infos.append(f"failures={failed}")
+ if errored:
+ infos.append(f"errors={errored}")
+ else:
+ write("OK")
+
+ if infos:
+ writeln(f" ({', '.join(infos)})")
+ else:
+ write("\n")
class Command(ScrapyCommand):
requires_project = True
- default_settings = {'LOG_ENABLED': False}
+ default_settings = {"LOG_ENABLED": False}
+
+ def syntax(self):
+ return "[options] <spider>"
+
+ def short_desc(self):
+ return "Check spider contracts"
+
+ def add_options(self, parser):
+ ScrapyCommand.add_options(self, parser)
+ parser.add_argument(
+ "-l",
+ "--list",
+ dest="list",
+ action="store_true",
+ help="only list contracts, without checking them",
+ )
+ parser.add_argument(
+ "-v",
+ "--verbose",
+ dest="verbose",
+ default=False,
+ action="store_true",
+ help="print contract tests for all spiders",
+ )
+
+ def run(self, args, opts):
+ # load contracts
+ contracts = build_component_list(self.settings.getwithbase("SPIDER_CONTRACTS"))
+ conman = ContractsManager(load_object(c) for c in contracts)
+ runner = TextTestRunner(verbosity=2 if opts.verbose else 1)
+ result = TextTestResult(runner.stream, runner.descriptions, runner.verbosity)
+
+ # contract requests
+ contract_reqs = defaultdict(list)
+
+ spider_loader = self.crawler_process.spider_loader
+
+ with set_environ(SCRAPY_CHECK="true"):
+ for spidername in args or spider_loader.list():
+ spidercls = spider_loader.load(spidername)
+ spidercls.start_requests = lambda s: conman.from_spider(s, result)
+
+ tested_methods = conman.tested_methods_from_spidercls(spidercls)
+ if opts.list:
+ for method in tested_methods:
+ contract_reqs[spidercls.name].append(method)
+ elif tested_methods:
+ self.crawler_process.crawl(spidercls)
+
+ # start checks
+ if opts.list:
+ for spider, methods in sorted(contract_reqs.items()):
+ if not methods and not opts.verbose:
+ continue
+ print(spider)
+ for method in sorted(methods):
+ print(f" * {method}")
+ else:
+ start = time.time()
+ self.crawler_process.start()
+ stop = time.time()
+
+ result.printErrors()
+ result.printSummary(start, stop)
+ self.exitcode = int(not result.wasSuccessful())
diff --git a/scrapy/commands/crawl.py b/scrapy/commands/crawl.py
index 2348fd64d..2f0f1c7b9 100644
--- a/scrapy/commands/crawl.py
+++ b/scrapy/commands/crawl.py
@@ -4,3 +4,34 @@ from scrapy.exceptions import UsageError
class Command(BaseRunSpiderCommand):
requires_project = True
+
+ def syntax(self):
+ return "[options] <spider>"
+
+ def short_desc(self):
+ return "Run a spider"
+
+ def run(self, args, opts):
+ if len(args) < 1:
+ raise UsageError()
+ elif len(args) > 1:
+ raise UsageError(
+ "running 'scrapy crawl' with more than one spider is not supported"
+ )
+ spname = args[0]
+
+ crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)
+
+ if getattr(crawl_defer, "result", None) is not None and issubclass(
+ crawl_defer.result.type, Exception
+ ):
+ self.exitcode = 1
+ else:
+ self.crawler_process.start()
+
+ if (
+ self.crawler_process.bootstrap_failed
+ or hasattr(self.crawler_process, "has_exception")
+ and self.crawler_process.has_exception
+ ):
+ self.exitcode = 1
diff --git a/scrapy/commands/edit.py b/scrapy/commands/edit.py
index ce7b67cc7..03a8ed5c7 100644
--- a/scrapy/commands/edit.py
+++ b/scrapy/commands/edit.py
@@ -1,9 +1,40 @@
import os
import sys
+
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
class Command(ScrapyCommand):
requires_project = True
- default_settings = {'LOG_ENABLED': False}
+ default_settings = {"LOG_ENABLED": False}
+
+ def syntax(self):
+ return "<spider>"
+
+ def short_desc(self):
+ return "Edit spider"
+
+ def long_desc(self):
+ return (
+ "Edit a spider using the editor defined in the EDITOR environment"
+ " variable or else the EDITOR setting"
+ )
+
+ def _err(self, msg):
+ sys.stderr.write(msg + os.linesep)
+ self.exitcode = 1
+
+ def run(self, args, opts):
+ if len(args) != 1:
+ raise UsageError()
+
+ editor = self.settings["EDITOR"]
+ try:
+ spidercls = self.crawler_process.spider_loader.load(args[0])
+ except KeyError:
+ return self._err(f"Spider not found: {args[0]}")
+
+ sfile = sys.modules[spidercls.__module__].__file__
+ sfile = sfile.replace(".pyc", ".py")
+ self.exitcode = os.system(f'{editor} "{sfile}"')
diff --git a/scrapy/commands/fetch.py b/scrapy/commands/fetch.py
index 59dcdb771..cdb7ad4ae 100644
--- a/scrapy/commands/fetch.py
+++ b/scrapy/commands/fetch.py
@@ -1,7 +1,9 @@
import sys
from argparse import Namespace
from typing import List, Type
+
from w3lib.url import is_url
+
from scrapy import Spider
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
@@ -12,3 +14,74 @@ from scrapy.utils.spider import DefaultSpider, spidercls_for_request
class Command(ScrapyCommand):
requires_project = False
+
+ def syntax(self):
+ return "[options] <url>"
+
+ def short_desc(self):
+ return "Fetch a URL using the Scrapy downloader"
+
+ def long_desc(self):
+ return (
+ "Fetch a URL using the Scrapy downloader and print its content"
+ " to stdout. You may want to use --nolog to disable logging"
+ )
+
+ def add_options(self, parser):
+ ScrapyCommand.add_options(self, parser)
+ parser.add_argument("--spider", dest="spider", help="use this spider")
+ parser.add_argument(
+ "--headers",
+ dest="headers",
+ action="store_true",
+ help="print response HTTP headers instead of body",
+ )
+ parser.add_argument(
+ "--no-redirect",
+ dest="no_redirect",
+ action="store_true",
+ default=False,
+ help="do not handle HTTP 3xx status codes and print response as-is",
+ )
+
+ def _print_headers(self, headers, prefix):
+ for key, values in headers.items():
+ for value in values:
+ self._print_bytes(prefix + b" " + key + b": " + value)
+
+ def _print_response(self, response, opts):
+ if opts.headers:
+ self._print_headers(response.request.headers, b">")
+ print(">")
+ self._print_headers(response.headers, b"<")
+ else:
+ self._print_bytes(response.body)
+
+ def _print_bytes(self, bytes_):
+ sys.stdout.buffer.write(bytes_ + b"\n")
+
+ def run(self, args: List[str], opts: Namespace) -> None:
+ if len(args) != 1 or not is_url(args[0]):
+ raise UsageError()
+ request = Request(
+ args[0],
+ callback=self._print_response,
+ cb_kwargs={"opts": opts},
+ dont_filter=True,
+ )
+ # by default, let the framework handle redirects,
+ # i.e. command handles all codes expect 3xx
+ if not opts.no_redirect:
+ request.meta["handle_httpstatus_list"] = SequenceExclude(range(300, 400))
+ else:
+ request.meta["handle_httpstatus_all"] = True
+
+ spidercls: Type[Spider] = DefaultSpider
+ assert self.crawler_process
+ spider_loader = self.crawler_process.spider_loader
+ if opts.spider:
+ spidercls = spider_loader.load(opts.spider)
+ else:
+ spidercls = spidercls_for_request(spider_loader, request, spidercls)
+ self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
+ self.crawler_process.start()
diff --git a/scrapy/commands/genspider.py b/scrapy/commands/genspider.py
index 5c5068083..68cbe8ff6 100644
--- a/scrapy/commands/genspider.py
+++ b/scrapy/commands/genspider.py
@@ -5,6 +5,7 @@ from importlib import import_module
from pathlib import Path
from typing import Optional, cast
from urllib.parse import urlparse
+
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
@@ -16,23 +17,186 @@ def sanitize_module_name(module_name):
with underscores and prefixing it with a letter if it doesn't start
with one
"""
- pass
+ module_name = module_name.replace("-", "_").replace(".", "_")
+ if module_name[0] not in string.ascii_letters:
+ module_name = "a" + module_name
+ return module_name
def extract_domain(url):
"""Extract domain name from URL string"""
- pass
+ o = urlparse(url)
+ if o.scheme == "" and o.netloc == "":
+ o = urlparse("//" + url.lstrip("/"))
+ return o.netloc
def verify_url_scheme(url):
"""Check url for scheme and insert https if none found."""
- pass
+ parsed = urlparse(url)
+ if parsed.scheme == "" and parsed.netloc == "":
+ parsed = urlparse("//" + url)._replace(scheme="https")
+ return parsed.geturl()
class Command(ScrapyCommand):
requires_project = False
- default_settings = {'LOG_ENABLED': False}
+ default_settings = {"LOG_ENABLED": False}
+
+ def syntax(self):
+ return "[options] <name> <domain>"
+
+ def short_desc(self):
+ return "Generate new spider using pre-defined templates"
+
+ def add_options(self, parser):
+ ScrapyCommand.add_options(self, parser)
+ parser.add_argument(
+ "-l",
+ "--list",
+ dest="list",
+ action="store_true",
+ help="List available templates",
+ )
+ parser.add_argument(
+ "-e",
+ "--edit",
+ dest="edit",
+ action="store_true",
+ help="Edit spider after creating it",
+ )
+ parser.add_argument(
+ "-d",
+ "--dump",
+ dest="dump",
+ metavar="TEMPLATE",
+ help="Dump template to standard output",
+ )
+ parser.add_argument(
+ "-t",
+ "--template",
+ dest="template",
+ default="basic",
+ help="Uses a custom template.",
+ )
+ parser.add_argument(
+ "--force",
+ dest="force",
+ action="store_true",
+ help="If the spider already exists, overwrite it with the template",
+ )
+
+ def run(self, args, opts):
+ if opts.list:
+ self._list_templates()
+ return
+ if opts.dump:
+ template_file = self._find_template(opts.dump)
+ if template_file:
+ print(template_file.read_text(encoding="utf-8"))
+ return
+ if len(args) != 2:
+ raise UsageError()
+
+ name, url = args[0:2]
+ url = verify_url_scheme(url)
+ module = sanitize_module_name(name)
+
+ if self.settings.get("BOT_NAME") == module:
+ print("Cannot create a spider with the same name as your project")
+ return
+
+ if not opts.force and self._spider_exists(name):
+ return
+
+ template_file = self._find_template(opts.template)
+ if template_file:
+ self._genspider(module, name, url, opts.template, template_file)
+ if opts.edit:
+ self.exitcode = os.system(f'scrapy edit "{name}"')
def _genspider(self, module, name, url, template_name, template_file):
"""Generate the spider module, based on the given template"""
- pass
+ capitalized_module = "".join(s.capitalize() for s in module.split("_"))
+ domain = extract_domain(url)
+ tvars = {
+ "project_name": self.settings.get("BOT_NAME"),
+ "ProjectName": string_camelcase(self.settings.get("BOT_NAME")),
+ "module": module,
+ "name": name,
+ "url": url,
+ "domain": domain,
+ "classname": f"{capitalized_module}Spider",
+ }
+ if self.settings.get("NEWSPIDER_MODULE"):
+ spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
+ spiders_dir = Path(spiders_module.__file__).parent.resolve()
+ else:
+ spiders_module = None
+ spiders_dir = Path(".")
+ spider_file = f"{spiders_dir / module}.py"
+ shutil.copyfile(template_file, spider_file)
+ render_templatefile(spider_file, **tvars)
+ print(
+ f"Created spider {name!r} using template {template_name!r} ",
+ end=("" if spiders_module else "\n"),
+ )
+ if spiders_module:
+ print(f"in module:\n {spiders_module.__name__}.{module}")
+
+ def _find_template(self, template: str) -> Optional[Path]:
+ template_file = Path(self.templates_dir, f"{template}.tmpl")
+ if template_file.exists():
+ return template_file
+ print(f"Unable to find template: {template}\n")
+ print('Use "scrapy genspider --list" to see all available templates.')
+ return None
+
+ def _list_templates(self):
+ print("Available templates:")
+ for file in sorted(Path(self.templates_dir).iterdir()):
+ if file.suffix == ".tmpl":
+ print(f" {file.stem}")
+
+ def _spider_exists(self, name: str) -> bool:
+ if not self.settings.get("NEWSPIDER_MODULE"):
+ # if run as a standalone command and file with same filename already exists
+ path = Path(name + ".py")
+ if path.exists():
+ print(f"{path.resolve()} already exists")
+ return True
+ return False
+
+ assert (
+ self.crawler_process is not None
+ ), "crawler_process must be set before calling run"
+
+ try:
+ spidercls = self.crawler_process.spider_loader.load(name)
+ except KeyError:
+ pass
+ else:
+ # if spider with same name exists
+ print(f"Spider {name!r} already exists in module:")
+ print(f" {spidercls.__module__}")
+ return True
+
+ # a file with the same name exists in the target directory
+ spiders_module = import_module(self.settings["NEWSPIDER_MODULE"])
+ spiders_dir = Path(cast(str, spiders_module.__file__)).parent
+ spiders_dir_abs = spiders_dir.resolve()
+ path = spiders_dir_abs / (name + ".py")
+ if path.exists():
+ print(f"{path} already exists")
+ return True
+
+ return False
+
+ @property
+ def templates_dir(self) -> str:
+ return str(
+ Path(
+ self.settings["TEMPLATES_DIR"] or Path(scrapy.__path__[0], "templates"),
+ "spiders",
+ )
+ )
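
The three module-level helpers above (sanitize_module_name, extract_domain, verify_url_scheme) depend only on the standard library. With this patch applied, a quick check of their expected behaviour looks like this:

    from scrapy.commands.genspider import (
        extract_domain,
        sanitize_module_name,
        verify_url_scheme,
    )

    # scheme-less URLs get https:// prepended, existing schemes are preserved
    assert verify_url_scheme("example.com/path") == "https://example.com/path"
    assert verify_url_scheme("http://example.com") == "http://example.com"

    # the domain comes from the netloc, with or without a scheme
    assert extract_domain("https://sub.example.com/page") == "sub.example.com"
    assert extract_domain("sub.example.com/page") == "sub.example.com"

    # spider names become valid Python module names
    assert sanitize_module_name("my-spider.v2") == "my_spider_v2"
    assert sanitize_module_name("2fast") == "a2fast"
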
diff --git a/scrapy/commands/list.py b/scrapy/commands/list.py
index 59078bd88..2f5032360 100644
--- a/scrapy/commands/list.py
+++ b/scrapy/commands/list.py
@@ -3,4 +3,11 @@ from scrapy.commands import ScrapyCommand
class Command(ScrapyCommand):
requires_project = True
- default_settings = {'LOG_ENABLED': False}
+ default_settings = {"LOG_ENABLED": False}
+
+ def short_desc(self):
+ return "List available spiders"
+
+ def run(self, args, opts):
+ for s in sorted(self.crawler_process.spider_loader.list()):
+ print(s)
diff --git a/scrapy/commands/parse.py b/scrapy/commands/parse.py
index 63e47a92b..c9f8586d3 100644
--- a/scrapy/commands/parse.py
+++ b/scrapy/commands/parse.py
@@ -3,9 +3,11 @@ import inspect
import json
import logging
from typing import Dict
+
from itemadapter import ItemAdapter, is_item
from twisted.internet.defer import maybeDeferred
from w3lib.url import is_url
+
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError
from scrapy.http import Request
@@ -15,12 +17,338 @@ from scrapy.utils.defer import aiter_errback, deferred_from_coro
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.spider import spidercls_for_request
+
logger = logging.getLogger(__name__)
class Command(BaseRunSpiderCommand):
requires_project = True
+
spider = None
items: Dict[int, list] = {}
requests: Dict[int, list] = {}
+
first_response = None
+
+ def syntax(self):
+ return "[options] <url>"
+
+ def short_desc(self):
+ return "Parse URL (using its spider) and print the results"
+
+ def add_options(self, parser):
+ BaseRunSpiderCommand.add_options(self, parser)
+ parser.add_argument(
+ "--spider",
+ dest="spider",
+ default=None,
+ help="use this spider without looking for one",
+ )
+ parser.add_argument(
+ "--pipelines", action="store_true", help="process items through pipelines"
+ )
+ parser.add_argument(
+ "--nolinks",
+ dest="nolinks",
+ action="store_true",
+ help="don't show links to follow (extracted requests)",
+ )
+ parser.add_argument(
+ "--noitems",
+ dest="noitems",
+ action="store_true",
+ help="don't show scraped items",
+ )
+ parser.add_argument(
+ "--nocolour",
+ dest="nocolour",
+ action="store_true",
+ help="avoid using pygments to colorize the output",
+ )
+ parser.add_argument(
+ "-r",
+ "--rules",
+ dest="rules",
+ action="store_true",
+ help="use CrawlSpider rules to discover the callback",
+ )
+ parser.add_argument(
+ "-c",
+ "--callback",
+ dest="callback",
+            help="use this callback for parsing, instead of looking for a callback",
+ )
+ parser.add_argument(
+ "-m",
+ "--meta",
+ dest="meta",
+ help="inject extra meta into the Request, it must be a valid raw json string",
+ )
+ parser.add_argument(
+ "--cbkwargs",
+ dest="cbkwargs",
+ help="inject extra callback kwargs into the Request, it must be a valid raw json string",
+ )
+ parser.add_argument(
+ "-d",
+ "--depth",
+ dest="depth",
+ type=int,
+ default=1,
+ help="maximum depth for parsing requests [default: %(default)s]",
+ )
+ parser.add_argument(
+ "-v",
+ "--verbose",
+ dest="verbose",
+ action="store_true",
+ help="print each depth level one by one",
+ )
+
+ @property
+ def max_level(self):
+ max_items, max_requests = 0, 0
+ if self.items:
+ max_items = max(self.items)
+ if self.requests:
+ max_requests = max(self.requests)
+ return max(max_items, max_requests)
+
+ def handle_exception(self, _failure):
+ logger.error(
+ "An error is caught while iterating the async iterable",
+ exc_info=failure_to_exc_info(_failure),
+ )
+
+ def iterate_spider_output(self, result):
+ if inspect.isasyncgen(result):
+ d = deferred_from_coro(
+ collect_asyncgen(aiter_errback(result, self.handle_exception))
+ )
+ d.addCallback(self.iterate_spider_output)
+ return d
+ if inspect.iscoroutine(result):
+ d = deferred_from_coro(result)
+ d.addCallback(self.iterate_spider_output)
+ return d
+ return arg_to_iter(deferred_from_coro(result))
+
+ def add_items(self, lvl, new_items):
+ old_items = self.items.get(lvl, [])
+ self.items[lvl] = old_items + new_items
+
+ def add_requests(self, lvl, new_reqs):
+ old_reqs = self.requests.get(lvl, [])
+ self.requests[lvl] = old_reqs + new_reqs
+
+ def print_items(self, lvl=None, colour=True):
+ if lvl is None:
+ items = [item for lst in self.items.values() for item in lst]
+ else:
+ items = self.items.get(lvl, [])
+
+ print("# Scraped Items ", "-" * 60)
+ display.pprint([ItemAdapter(x).asdict() for x in items], colorize=colour)
+
+ def print_requests(self, lvl=None, colour=True):
+ if lvl is None:
+ if self.requests:
+ requests = self.requests[max(self.requests)]
+ else:
+ requests = []
+ else:
+ requests = self.requests.get(lvl, [])
+
+ print("# Requests ", "-" * 65)
+ display.pprint(requests, colorize=colour)
+
+ def print_results(self, opts):
+ colour = not opts.nocolour
+
+ if opts.verbose:
+ for level in range(1, self.max_level + 1):
+ print(f"\n>>> DEPTH LEVEL: {level} <<<")
+ if not opts.noitems:
+ self.print_items(level, colour)
+ if not opts.nolinks:
+ self.print_requests(level, colour)
+ else:
+ print(f"\n>>> STATUS DEPTH LEVEL {self.max_level} <<<")
+ if not opts.noitems:
+ self.print_items(colour=colour)
+ if not opts.nolinks:
+ self.print_requests(colour=colour)
+
+ def _get_items_and_requests(self, spider_output, opts, depth, spider, callback):
+ items, requests = [], []
+ for x in spider_output:
+ if is_item(x):
+ items.append(x)
+ elif isinstance(x, Request):
+ requests.append(x)
+ return items, requests, opts, depth, spider, callback
+
+ def run_callback(self, response, callback, cb_kwargs=None):
+ cb_kwargs = cb_kwargs or {}
+ d = maybeDeferred(self.iterate_spider_output, callback(response, **cb_kwargs))
+ return d
+
+ def get_callback_from_rules(self, spider, response):
+ if getattr(spider, "rules", None):
+ for rule in spider.rules:
+ if rule.link_extractor.matches(response.url):
+ return rule.callback or "parse"
+ else:
+ logger.error(
+ "No CrawlSpider rules found in spider %(spider)r, "
+ "please specify a callback to use for parsing",
+ {"spider": spider.name},
+ )
+
+ def set_spidercls(self, url, opts):
+ spider_loader = self.crawler_process.spider_loader
+ if opts.spider:
+ try:
+ self.spidercls = spider_loader.load(opts.spider)
+ except KeyError:
+ logger.error(
+ "Unable to find spider: %(spider)s", {"spider": opts.spider}
+ )
+ else:
+ self.spidercls = spidercls_for_request(spider_loader, Request(url))
+ if not self.spidercls:
+ logger.error("Unable to find spider for: %(url)s", {"url": url})
+
+ def _start_requests(spider):
+ yield self.prepare_request(spider, Request(url), opts)
+
+ if self.spidercls:
+ self.spidercls.start_requests = _start_requests
+
+ def start_parsing(self, url, opts):
+ self.crawler_process.crawl(self.spidercls, **opts.spargs)
+ self.pcrawler = list(self.crawler_process.crawlers)[0]
+ self.crawler_process.start()
+
+ if not self.first_response:
+ logger.error("No response downloaded for: %(url)s", {"url": url})
+
+ def scraped_data(self, args):
+ items, requests, opts, depth, spider, callback = args
+ if opts.pipelines:
+ itemproc = self.pcrawler.engine.scraper.itemproc
+ for item in items:
+ itemproc.process_item(item, spider)
+ self.add_items(depth, items)
+ self.add_requests(depth, requests)
+
+ scraped_data = items if opts.output else []
+ if depth < opts.depth:
+ for req in requests:
+ req.meta["_depth"] = depth + 1
+ req.meta["_callback"] = req.callback
+ req.callback = callback
+ scraped_data += requests
+
+ return scraped_data
+
+ def _get_callback(self, *, spider, opts, response=None):
+ cb = None
+ if response:
+ cb = response.meta["_callback"]
+ if not cb:
+ if opts.callback:
+ cb = opts.callback
+ elif response and opts.rules and self.first_response == response:
+ cb = self.get_callback_from_rules(spider, response)
+ if not cb:
+ raise ValueError(
+ f"Cannot find a rule that matches {response.url!r} in spider: "
+ f"{spider.name}"
+ )
+ else:
+ cb = "parse"
+
+ if not callable(cb):
+ cb_method = getattr(spider, cb, None)
+ if callable(cb_method):
+ cb = cb_method
+ else:
+ raise ValueError(
+ f"Cannot find callback {cb!r} in spider: {spider.name}"
+ )
+ return cb
+
+ def prepare_request(self, spider, request, opts):
+ def callback(response, **cb_kwargs):
+            # memorize the first response
+ if not self.first_response:
+ self.first_response = response
+
+ cb = self._get_callback(spider=spider, opts=opts, response=response)
+
+ # parse items and requests
+ depth = response.meta["_depth"]
+
+ d = self.run_callback(response, cb, cb_kwargs)
+ d.addCallback(self._get_items_and_requests, opts, depth, spider, callback)
+ d.addCallback(self.scraped_data)
+ return d
+
+ # update request meta if any extra meta was passed through the --meta/-m opts.
+ if opts.meta:
+ request.meta.update(opts.meta)
+
+        # update cb_kwargs if any extra values were passed through the --cbkwargs option.
+ if opts.cbkwargs:
+ request.cb_kwargs.update(opts.cbkwargs)
+
+ request.meta["_depth"] = 1
+ request.meta["_callback"] = request.callback
+ if not request.callback and not opts.rules:
+ cb = self._get_callback(spider=spider, opts=opts)
+ functools.update_wrapper(callback, cb)
+ request.callback = callback
+ return request
+
+ def process_options(self, args, opts):
+ BaseRunSpiderCommand.process_options(self, args, opts)
+
+ self.process_request_meta(opts)
+ self.process_request_cb_kwargs(opts)
+
+ def process_request_meta(self, opts):
+ if opts.meta:
+ try:
+ opts.meta = json.loads(opts.meta)
+ except ValueError:
+ raise UsageError(
+ "Invalid -m/--meta value, pass a valid json string to -m or --meta. "
+ 'Example: --meta=\'{"foo" : "bar"}\'',
+ print_help=False,
+ )
+
+ def process_request_cb_kwargs(self, opts):
+ if opts.cbkwargs:
+ try:
+ opts.cbkwargs = json.loads(opts.cbkwargs)
+ except ValueError:
+ raise UsageError(
+ "Invalid --cbkwargs value, pass a valid json string to --cbkwargs. "
+ 'Example: --cbkwargs=\'{"foo" : "bar"}\'',
+ print_help=False,
+ )
+
+ def run(self, args, opts):
+ # parse arguments
+ if not len(args) == 1 or not is_url(args[0]):
+ raise UsageError()
+ else:
+ url = args[0]
+
+ # prepare spidercls
+ self.set_spidercls(url, opts)
+
+ if self.spidercls and opts.depth > 0:
+ self.start_parsing(url, opts)
+ self.print_results(opts)
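
The -m/--meta and --cbkwargs values handled by process_request_meta and process_request_cb_kwargs above must be raw JSON strings; anything else becomes a UsageError. A stdlib-only illustration of the same json.loads round-trip, reusing the example value from the error messages:

    import json

    meta = json.loads('{"foo": "bar"}')   # what --meta='{"foo" : "bar"}' produces
    assert meta == {"foo": "bar"}

    try:
        json.loads("{'foo': 'bar'}")      # single quotes are not valid JSON
    except ValueError as exc:
        print(f"rejected, as the -m/--meta UsageError path would be: {exc}")
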
diff --git a/scrapy/commands/runspider.py b/scrapy/commands/runspider.py
index d6b20ae5f..58ed89a81 100644
--- a/scrapy/commands/runspider.py
+++ b/scrapy/commands/runspider.py
@@ -4,11 +4,55 @@ from os import PathLike
from pathlib import Path
from types import ModuleType
from typing import Union
+
from scrapy.commands import BaseRunSpiderCommand
from scrapy.exceptions import UsageError
from scrapy.utils.spider import iter_spider_classes
+def _import_file(filepath: Union[str, PathLike]) -> ModuleType:
+ abspath = Path(filepath).resolve()
+ if abspath.suffix not in (".py", ".pyw"):
+ raise ValueError(f"Not a Python source file: {abspath}")
+ dirname = str(abspath.parent)
+ sys.path = [dirname] + sys.path
+ try:
+ module = import_module(abspath.stem)
+ finally:
+ sys.path.pop(0)
+ return module
+
+
class Command(BaseRunSpiderCommand):
requires_project = False
- default_settings = {'SPIDER_LOADER_WARN_ONLY': True}
+ default_settings = {"SPIDER_LOADER_WARN_ONLY": True}
+
+ def syntax(self):
+ return "[options] <spider_file>"
+
+ def short_desc(self):
+ return "Run a self-contained spider (without creating a project)"
+
+ def long_desc(self):
+ return "Run the spider defined in the given file"
+
+ def run(self, args, opts):
+ if len(args) != 1:
+ raise UsageError()
+ filename = Path(args[0])
+ if not filename.exists():
+ raise UsageError(f"File not found: {filename}\n")
+ try:
+ module = _import_file(filename)
+ except (ImportError, ValueError) as e:
+ raise UsageError(f"Unable to load {str(filename)!r}: {e}\n")
+ spclasses = list(iter_spider_classes(module))
+ if not spclasses:
+ raise UsageError(f"No spider found in file: {filename}\n")
+ spidercls = spclasses.pop()
+
+ self.crawler_process.crawl(spidercls, **opts.spargs)
+ self.crawler_process.start()
+
+ if self.crawler_process.bootstrap_failed:
+ self.exitcode = 1
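
_import_file above prepends the spider file's directory to sys.path and imports the module by its stem. The same technique can be demonstrated with a throwaway module written to a temporary directory (stdlib only, no Scrapy spider involved):

    import sys
    import tempfile
    from importlib import import_module
    from pathlib import Path

    with tempfile.TemporaryDirectory() as tmp:
        module_path = Path(tmp, "standalone_mod.py")
        module_path.write_text("GREETING = 'hello from a file-based module'\n")

        sys.path.insert(0, str(module_path.parent))  # same trick as _import_file
        try:
            mod = import_module(module_path.stem)
        finally:
            sys.path.pop(0)

    print(mod.GREETING)
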
diff --git a/scrapy/commands/settings.py b/scrapy/commands/settings.py
index 017f56138..318187204 100644
--- a/scrapy/commands/settings.py
+++ b/scrapy/commands/settings.py
@@ -1,8 +1,62 @@
import json
+
from scrapy.commands import ScrapyCommand
from scrapy.settings import BaseSettings
class Command(ScrapyCommand):
requires_project = False
- default_settings = {'LOG_ENABLED': False, 'SPIDER_LOADER_WARN_ONLY': True}
+ default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}
+
+ def syntax(self):
+ return "[options]"
+
+ def short_desc(self):
+ return "Get settings values"
+
+ def add_options(self, parser):
+ ScrapyCommand.add_options(self, parser)
+ parser.add_argument(
+ "--get", dest="get", metavar="SETTING", help="print raw setting value"
+ )
+ parser.add_argument(
+ "--getbool",
+ dest="getbool",
+ metavar="SETTING",
+ help="print setting value, interpreted as a boolean",
+ )
+ parser.add_argument(
+ "--getint",
+ dest="getint",
+ metavar="SETTING",
+ help="print setting value, interpreted as an integer",
+ )
+ parser.add_argument(
+ "--getfloat",
+ dest="getfloat",
+ metavar="SETTING",
+ help="print setting value, interpreted as a float",
+ )
+ parser.add_argument(
+ "--getlist",
+ dest="getlist",
+ metavar="SETTING",
+ help="print setting value, interpreted as a list",
+ )
+
+ def run(self, args, opts):
+ settings = self.crawler_process.settings
+ if opts.get:
+ s = settings.get(opts.get)
+ if isinstance(s, BaseSettings):
+ print(json.dumps(s.copy_to_dict()))
+ else:
+ print(s)
+ elif opts.getbool:
+ print(settings.getbool(opts.getbool))
+ elif opts.getint:
+ print(settings.getint(opts.getint))
+ elif opts.getfloat:
+ print(settings.getfloat(opts.getfloat))
+ elif opts.getlist:
+ print(settings.getlist(opts.getlist))
diff --git a/scrapy/commands/shell.py b/scrapy/commands/shell.py
index fadb2c519..12e37babc 100644
--- a/scrapy/commands/shell.py
+++ b/scrapy/commands/shell.py
@@ -6,6 +6,7 @@ See documentation in docs/topics/shell.rst
from argparse import Namespace
from threading import Thread
from typing import List, Type
+
from scrapy import Spider
from scrapy.commands import ScrapyCommand
from scrapy.http import Request
@@ -16,11 +17,80 @@ from scrapy.utils.url import guess_scheme
class Command(ScrapyCommand):
requires_project = False
- default_settings = {'KEEP_ALIVE': True, 'LOGSTATS_INTERVAL': 0,
- 'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter'}
+ default_settings = {
+ "KEEP_ALIVE": True,
+ "LOGSTATS_INTERVAL": 0,
+ "DUPEFILTER_CLASS": "scrapy.dupefilters.BaseDupeFilter",
+ }
+
+ def syntax(self):
+ return "[url|file]"
+
+ def short_desc(self):
+ return "Interactive scraping console"
+
+ def long_desc(self):
+ return (
+ "Interactive console for scraping the given url or file. "
+ "Use ./file.html syntax or full path for local file."
+ )
+
+ def add_options(self, parser):
+ ScrapyCommand.add_options(self, parser)
+ parser.add_argument(
+ "-c",
+ dest="code",
+ help="evaluate the code in the shell, print the result and exit",
+ )
+ parser.add_argument("--spider", dest="spider", help="use this spider")
+ parser.add_argument(
+ "--no-redirect",
+ dest="no_redirect",
+ action="store_true",
+ default=False,
+ help="do not handle HTTP 3xx status codes and print response as-is",
+ )
def update_vars(self, vars):
"""You can use this function to update the Scrapy objects that will be
available in the shell
"""
pass
+
+ def run(self, args: List[str], opts: Namespace) -> None:
+ url = args[0] if args else None
+ if url:
+ # first argument may be a local file
+ url = guess_scheme(url)
+
+ assert self.crawler_process
+ spider_loader = self.crawler_process.spider_loader
+
+ spidercls: Type[Spider] = DefaultSpider
+ if opts.spider:
+ spidercls = spider_loader.load(opts.spider)
+ elif url:
+ spidercls = spidercls_for_request(
+ spider_loader, Request(url), spidercls, log_multiple=True
+ )
+
+ # The crawler is created this way since the Shell manually handles the
+        # crawling engine, so the setup in the crawl method won't work
+ crawler = self.crawler_process._create_crawler(spidercls)
+ crawler._apply_settings()
+ # The Shell class needs a persistent engine in the crawler
+ crawler.engine = crawler._create_engine()
+ crawler.engine.start()
+
+ self._start_crawler_thread()
+
+ shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
+ shell.start(url=url, redirect=not opts.no_redirect)
+
+ def _start_crawler_thread(self):
+ t = Thread(
+ target=self.crawler_process.start,
+ kwargs={"stop_after_crawl": False, "install_signal_handlers": False},
+ )
+ t.daemon = True
+ t.start()
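
_start_crawler_thread above keeps the interactive shell on the main thread by running the crawler engine in a background daemon thread. The threading pattern in isolation, with a hypothetical worker function standing in for crawler_process.start:

    import time
    from threading import Thread

    def background_loop():
        # stands in for crawler_process.start(stop_after_crawl=False, ...)
        while True:
            time.sleep(1)

    t = Thread(target=background_loop)
    t.daemon = True   # dies with the main program, as in the shell command
    t.start()

    print("the main thread stays free for the interactive console")
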
diff --git a/scrapy/commands/startproject.py b/scrapy/commands/startproject.py
index c9783c28e..fde609c6f 100644
--- a/scrapy/commands/startproject.py
+++ b/scrapy/commands/startproject.py
@@ -5,19 +5,53 @@ from importlib.util import find_spec
from pathlib import Path
from shutil import copy2, copystat, ignore_patterns, move
from stat import S_IWUSR as OWNER_WRITE_PERMISSION
+
import scrapy
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError
from scrapy.utils.template import render_templatefile, string_camelcase
-TEMPLATES_TO_RENDER = ('scrapy.cfg',), ('${project_name}', 'settings.py.tmpl'
- ), ('${project_name}', 'items.py.tmpl'), ('${project_name}',
- 'pipelines.py.tmpl'), ('${project_name}', 'middlewares.py.tmpl')
-IGNORE = ignore_patterns('*.pyc', '__pycache__', '.svn')
+
+TEMPLATES_TO_RENDER = (
+ ("scrapy.cfg",),
+ ("${project_name}", "settings.py.tmpl"),
+ ("${project_name}", "items.py.tmpl"),
+ ("${project_name}", "pipelines.py.tmpl"),
+ ("${project_name}", "middlewares.py.tmpl"),
+)
+
+IGNORE = ignore_patterns("*.pyc", "__pycache__", ".svn")
+
+
+def _make_writable(path):
+ current_permissions = os.stat(path).st_mode
+ os.chmod(path, current_permissions | OWNER_WRITE_PERMISSION)
class Command(ScrapyCommand):
requires_project = False
- default_settings = {'LOG_ENABLED': False, 'SPIDER_LOADER_WARN_ONLY': True}
+ default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}
+
+ def syntax(self):
+ return "<project_name> [project_dir]"
+
+ def short_desc(self):
+ return "Create new project"
+
+ def _is_valid_name(self, project_name):
+ def _module_exists(module_name):
+ spec = find_spec(module_name)
+ return spec is not None and spec.loader is not None
+
+ if not re.search(r"^[_a-zA-Z]\w*$", project_name):
+ print(
+ "Error: Project names must begin with a letter and contain"
+ " only\nletters, numbers and underscores"
+ )
+ elif _module_exists(project_name):
+ print(f"Error: Module {project_name!r} already exists")
+ else:
+ return True
+ return False
def _copytree(self, src: Path, dst: Path):
"""
@@ -28,4 +62,77 @@ class Command(ScrapyCommand):
More info at:
https://github.com/scrapy/scrapy/pull/2005
"""
- pass
+ ignore = IGNORE
+ names = [x.name for x in src.iterdir()]
+ ignored_names = ignore(src, names)
+
+ if not dst.exists():
+ dst.mkdir(parents=True)
+
+ for name in names:
+ if name in ignored_names:
+ continue
+
+ srcname = src / name
+ dstname = dst / name
+ if srcname.is_dir():
+ self._copytree(srcname, dstname)
+ else:
+ copy2(srcname, dstname)
+ _make_writable(dstname)
+
+ copystat(src, dst)
+ _make_writable(dst)
+
+ def run(self, args, opts):
+ if len(args) not in (1, 2):
+ raise UsageError()
+
+ project_name = args[0]
+
+ if len(args) == 2:
+ project_dir = Path(args[1])
+ else:
+ project_dir = Path(args[0])
+
+ if (project_dir / "scrapy.cfg").exists():
+ self.exitcode = 1
+ print(f"Error: scrapy.cfg already exists in {project_dir.resolve()}")
+ return
+
+ if not self._is_valid_name(project_name):
+ self.exitcode = 1
+ return
+
+ self._copytree(Path(self.templates_dir), project_dir.resolve())
+ move(project_dir / "module", project_dir / project_name)
+ for paths in TEMPLATES_TO_RENDER:
+ tplfile = Path(
+ project_dir,
+ *(
+ string.Template(s).substitute(project_name=project_name)
+ for s in paths
+ ),
+ )
+ render_templatefile(
+ tplfile,
+ project_name=project_name,
+ ProjectName=string_camelcase(project_name),
+ )
+ print(
+ f"New Scrapy project '{project_name}', using template directory "
+ f"'{self.templates_dir}', created in:"
+ )
+ print(f" {project_dir.resolve()}\n")
+ print("You can start your first spider with:")
+ print(f" cd {project_dir}")
+ print(" scrapy genspider example example.com")
+
+ @property
+ def templates_dir(self) -> str:
+ return str(
+ Path(
+ self.settings["TEMPLATES_DIR"] or Path(scrapy.__path__[0], "templates"),
+ "project",
+ )
+ )
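
TEMPLATES_TO_RENDER above stores path components containing ${project_name} placeholders, which run expands per project through string.Template before rendering. The substitution step on its own, with a made-up project name:

    import string
    from pathlib import Path

    paths = ("${project_name}", "settings.py.tmpl")
    project_name = "quotesbot"  # illustrative name

    tplfile = Path(*(string.Template(s).substitute(project_name=project_name) for s in paths))
    assert tplfile == Path("quotesbot/settings.py.tmpl")
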
diff --git a/scrapy/commands/version.py b/scrapy/commands/version.py
index 409f6ebb2..47582866b 100644
--- a/scrapy/commands/version.py
+++ b/scrapy/commands/version.py
@@ -4,4 +4,29 @@ from scrapy.utils.versions import scrapy_components_versions
class Command(ScrapyCommand):
- default_settings = {'LOG_ENABLED': False, 'SPIDER_LOADER_WARN_ONLY': True}
+ default_settings = {"LOG_ENABLED": False, "SPIDER_LOADER_WARN_ONLY": True}
+
+ def syntax(self):
+ return "[-v]"
+
+ def short_desc(self):
+ return "Print Scrapy version"
+
+ def add_options(self, parser):
+ ScrapyCommand.add_options(self, parser)
+ parser.add_argument(
+ "--verbose",
+ "-v",
+ dest="verbose",
+ action="store_true",
+ help="also display twisted/python/platform info (useful for bug reports)",
+ )
+
+ def run(self, args, opts):
+ if opts.verbose:
+ versions = scrapy_components_versions()
+ width = max(len(n) for (n, _) in versions)
+ for name, version in versions:
+ print(f"{name:<{width}} : {version}")
+ else:
+ print(f"Scrapy {scrapy.__version__}")
diff --git a/scrapy/commands/view.py b/scrapy/commands/view.py
index 4b95dcfbd..ebdfa10a8 100644
--- a/scrapy/commands/view.py
+++ b/scrapy/commands/view.py
@@ -1,7 +1,21 @@
import argparse
+
from scrapy.commands import fetch
from scrapy.utils.response import open_in_browser
class Command(fetch.Command):
- pass
+ def short_desc(self):
+ return "Open URL in browser, as seen by Scrapy"
+
+ def long_desc(self):
+ return (
+ "Fetch a URL using the Scrapy downloader and show its contents in a browser"
+ )
+
+ def add_options(self, parser):
+ super().add_options(parser)
+ parser.add_argument("--headers", help=argparse.SUPPRESS)
+
+ def _print_response(self, response, opts):
+ open_in_browser(response)
diff --git a/scrapy/contracts/default.py b/scrapy/contracts/default.py
index 63c140a96..eac702cef 100644
--- a/scrapy/contracts/default.py
+++ b/scrapy/contracts/default.py
@@ -1,15 +1,23 @@
import json
+
from itemadapter import ItemAdapter, is_item
+
from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
from scrapy.http import Request
+# contracts
class UrlContract(Contract):
"""Contract to set the url of the request (mandatory)
@url http://scrapy.org
"""
- name = 'url'
+
+ name = "url"
+
+ def adjust_request_args(self, args):
+ args["url"] = self.args[0]
+ return args
class CallbackKeywordArgumentsContract(Contract):
@@ -18,7 +26,12 @@ class CallbackKeywordArgumentsContract(Contract):
@cb_kwargs {"arg1": "some value"}
"""
- name = 'cb_kwargs'
+
+ name = "cb_kwargs"
+
+ def adjust_request_args(self, args):
+ args["cb_kwargs"] = json.loads(" ".join(self.args))
+ return args
class ReturnsContract(Contract):
@@ -33,31 +46,65 @@ class ReturnsContract(Contract):
@returns request 2 10
@returns request 0 10
"""
- name = 'returns'
- object_type_verifiers = {'request': lambda x: isinstance(x, Request),
- 'requests': lambda x: isinstance(x, Request), 'item': is_item,
- 'items': is_item}
+
+ name = "returns"
+ object_type_verifiers = {
+ "request": lambda x: isinstance(x, Request),
+ "requests": lambda x: isinstance(x, Request),
+ "item": is_item,
+ "items": is_item,
+ }
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+
if len(self.args) not in [1, 2, 3]:
raise ValueError(
- f'Incorrect argument quantity: expected 1, 2 or 3, got {len(self.args)}'
- )
+ f"Incorrect argument quantity: expected 1, 2 or 3, got {len(self.args)}"
+ )
self.obj_name = self.args[0] or None
self.obj_type_verifier = self.object_type_verifiers[self.obj_name]
+
try:
self.min_bound = int(self.args[1])
except IndexError:
self.min_bound = 1
+
try:
self.max_bound = int(self.args[2])
except IndexError:
- self.max_bound = float('inf')
+ self.max_bound = float("inf")
+
+ def post_process(self, output):
+ occurrences = 0
+ for x in output:
+ if self.obj_type_verifier(x):
+ occurrences += 1
+
+ assertion = self.min_bound <= occurrences <= self.max_bound
+
+ if not assertion:
+ if self.min_bound == self.max_bound:
+ expected = self.min_bound
+ else:
+ expected = f"{self.min_bound}..{self.max_bound}"
+
+ raise ContractFail(
+ f"Returned {occurrences} {self.obj_name}, expected {expected}"
+ )
class ScrapesContract(Contract):
"""Contract to check presence of fields in scraped items
@scrapes page_name page_body
"""
- name = 'scrapes'
+
+ name = "scrapes"
+
+ def post_process(self, output):
+ for x in output:
+ if is_item(x):
+ missing = [arg for arg in self.args if arg not in ItemAdapter(x)]
+ if missing:
+ missing_fields = ", ".join(missing)
+ raise ContractFail(f"Missing fields: {missing_fields}")
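
ReturnsContract.post_process above counts the objects that match the requested type and checks the count against a [min, max] range whose upper bound defaults to infinity. The bound check reduced to plain Python, outside the Contract machinery:

    def within_bounds(occurrences, min_bound=1, max_bound=float("inf")):
        return min_bound <= occurrences <= max_bound

    # "@returns items 1 16"-style bounds
    assert within_bounds(3, 1, 16)
    assert not within_bounds(0, 1, 16)
    # "@returns request 0 10" allows zero matches
    assert within_bounds(0, 0, 10)
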
diff --git a/scrapy/core/downloader/contextfactory.py b/scrapy/core/downloader/contextfactory.py
index fe70014d1..909cc273f 100644
--- a/scrapy/core/downloader/contextfactory.py
+++ b/scrapy/core/downloader/contextfactory.py
@@ -1,15 +1,27 @@
import warnings
from typing import TYPE_CHECKING, Any, List, Optional
+
from OpenSSL import SSL
from twisted.internet._sslverify import _setAcceptableProtocols
-from twisted.internet.ssl import AcceptableCiphers, CertificateOptions, optionsForClientTLS, platformTrust
+from twisted.internet.ssl import (
+ AcceptableCiphers,
+ CertificateOptions,
+ optionsForClientTLS,
+ platformTrust,
+)
from twisted.web.client import BrowserLikePolicyForHTTPS
from twisted.web.iweb import IPolicyForHTTPS
from zope.interface.declarations import implementer
from zope.interface.verify import verifyObject
-from scrapy.core.downloader.tls import DEFAULT_CIPHERS, ScrapyClientTLSOptions, openssl_methods
+
+from scrapy.core.downloader.tls import (
+ DEFAULT_CIPHERS,
+ ScrapyClientTLSOptions,
+ openssl_methods,
+)
from scrapy.settings import BaseSettings
from scrapy.utils.misc import create_instance, load_object
+
if TYPE_CHECKING:
from twisted.internet._sslverify import ClientTLSOptions
@@ -26,19 +38,76 @@ class ScrapyClientContextFactory(BrowserLikePolicyForHTTPS):
understand the TLSv1, TLSv1.1 and TLSv1.2 protocols.'
"""
- def __init__(self, method: int=SSL.SSLv23_METHOD, tls_verbose_logging:
- bool=False, tls_ciphers: Optional[str]=None, *args: Any, **kwargs: Any
- ):
+ def __init__(
+ self,
+ method: int = SSL.SSLv23_METHOD,
+ tls_verbose_logging: bool = False,
+ tls_ciphers: Optional[str] = None,
+ *args: Any,
+ **kwargs: Any,
+ ):
super().__init__(*args, **kwargs)
self._ssl_method: int = method
self.tls_verbose_logging: bool = tls_verbose_logging
self.tls_ciphers: AcceptableCiphers
if tls_ciphers:
- self.tls_ciphers = AcceptableCiphers.fromOpenSSLCipherString(
- tls_ciphers)
+ self.tls_ciphers = AcceptableCiphers.fromOpenSSLCipherString(tls_ciphers)
else:
self.tls_ciphers = DEFAULT_CIPHERS
+ @classmethod
+ def from_settings(
+ cls,
+ settings: BaseSettings,
+ method: int = SSL.SSLv23_METHOD,
+ *args: Any,
+ **kwargs: Any,
+ ):
+ tls_verbose_logging: bool = settings.getbool(
+ "DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING"
+ )
+ tls_ciphers: Optional[str] = settings["DOWNLOADER_CLIENT_TLS_CIPHERS"]
+ return cls( # type: ignore[misc]
+ method=method,
+ tls_verbose_logging=tls_verbose_logging,
+ tls_ciphers=tls_ciphers,
+ *args,
+ **kwargs,
+ )
+
+ def getCertificateOptions(self) -> CertificateOptions:
+ # setting verify=True will require you to provide CAs
+ # to verify against; in other words: it's not that simple
+
+ # backward-compatible SSL/TLS method:
+ #
+ # * this will respect `method` attribute in often recommended
+ # `ScrapyClientContextFactory` subclass
+ # (https://github.com/scrapy/scrapy/issues/1429#issuecomment-131782133)
+ #
+ # * getattr() for `_ssl_method` attribute for context factories
+ # not calling super().__init__
+ return CertificateOptions(
+ verify=False,
+ method=getattr(self, "method", getattr(self, "_ssl_method", None)),
+ fixBrokenPeers=True,
+ acceptableCiphers=self.tls_ciphers,
+ )
+
+ # kept for old-style HTTP/1.0 downloader context twisted calls,
+ # e.g. connectSSL()
+ def getContext(self, hostname: Any = None, port: Any = None) -> SSL.Context:
+ ctx = self.getCertificateOptions().getContext()
+ ctx.set_options(0x4) # OP_LEGACY_SERVER_CONNECT
+ return ctx
+
+ def creatorForNetloc(self, hostname: bytes, port: int) -> "ClientTLSOptions":
+ return ScrapyClientTLSOptions(
+ hostname.decode("ascii"),
+ self.getContext(),
+ verbose_logging=self.tls_verbose_logging,
+ )
+
@implementer(IPolicyForHTTPS)
class BrowserLikeContextFactory(ScrapyClientContextFactory):
@@ -59,6 +128,17 @@ class BrowserLikeContextFactory(ScrapyClientContextFactory):
``SSLv23_METHOD``) which allows TLS protocol negotiation.
"""
+ def creatorForNetloc(self, hostname: bytes, port: int) -> "ClientTLSOptions":
+ # trustRoot set to platformTrust() will use the platform's root CAs.
+ #
+ # This means that a website like https://www.cacert.org will be rejected
+ # by default, since CAcert.org CA certificate is seldom shipped.
+ return optionsForClientTLS(
+ hostname=hostname.decode("ascii"),
+ trustRoot=platformTrust(),
+ extraCertificateOptions={"method": self._ssl_method},
+ )
+
@implementer(IPolicyForHTTPS)
class AcceptableProtocolsContextFactory:
@@ -67,8 +147,44 @@ class AcceptableProtocolsContextFactory:
negotiation.
"""
- def __init__(self, context_factory: Any, acceptable_protocols: List[bytes]
- ):
+ def __init__(self, context_factory: Any, acceptable_protocols: List[bytes]):
verifyObject(IPolicyForHTTPS, context_factory)
self._wrapped_context_factory: Any = context_factory
self._acceptable_protocols: List[bytes] = acceptable_protocols
+
+ def creatorForNetloc(self, hostname: bytes, port: int) -> "ClientTLSOptions":
+ options: "ClientTLSOptions" = self._wrapped_context_factory.creatorForNetloc(
+ hostname, port
+ )
+ _setAcceptableProtocols(options._ctx, self._acceptable_protocols)
+ return options
+
+
+def load_context_factory_from_settings(settings, crawler):
+ ssl_method = openssl_methods[settings.get("DOWNLOADER_CLIENT_TLS_METHOD")]
+ context_factory_cls = load_object(settings["DOWNLOADER_CLIENTCONTEXTFACTORY"])
+ # try method-aware context factory
+ try:
+ context_factory = create_instance(
+ objcls=context_factory_cls,
+ settings=settings,
+ crawler=crawler,
+ method=ssl_method,
+ )
+ except TypeError:
+ # use context factory defaults
+ context_factory = create_instance(
+ objcls=context_factory_cls,
+ settings=settings,
+ crawler=crawler,
+ )
+ msg = (
+ f"{settings['DOWNLOADER_CLIENTCONTEXTFACTORY']} does not accept "
+ "a `method` argument (type OpenSSL.SSL method, e.g. "
+ "OpenSSL.SSL.SSLv23_METHOD) and/or a `tls_verbose_logging` "
+ "argument and/or a `tls_ciphers` argument. Please, upgrade your "
+ "context factory class to handle them or ignore them."
+ )
+ warnings.warn(msg)
+
+ return context_factory
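
load_context_factory_from_settings above first tries to build the context factory with the method (and related) keyword arguments and, if the class signature rejects them, falls back to a plain instantiation plus a warning. The try/except TypeError fallback reduced to a toy class (the names here are illustrative, not Scrapy's):

    import warnings

    class LegacyFactory:
        """Stand-in for an old context factory whose __init__ takes no keywords."""

    def build_factory(cls, **kwargs):
        try:
            return cls(**kwargs)   # method-aware path
        except TypeError:
            warnings.warn(f"{cls.__name__} does not accept {sorted(kwargs)}; using defaults")
            return cls()           # fallback path, mirroring the warning above

    factory = build_factory(LegacyFactory, method="SSLv23_METHOD")
    print(type(factory).__name__)
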
diff --git a/scrapy/core/downloader/handlers/datauri.py b/scrapy/core/downloader/handlers/datauri.py
index 25a176778..8b78c53c1 100644
--- a/scrapy/core/downloader/handlers/datauri.py
+++ b/scrapy/core/downloader/handlers/datauri.py
@@ -1,4 +1,5 @@
from w3lib.url import parse_data_uri
+
from scrapy.http import TextResponse
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers
@@ -6,3 +7,15 @@ from scrapy.utils.decorators import defers
class DataURIDownloadHandler:
lazy = False
+
+ @defers
+ def download_request(self, request, spider):
+ uri = parse_data_uri(request.url)
+ respcls = responsetypes.from_mimetype(uri.media_type)
+
+ resp_kwargs = {}
+ if issubclass(respcls, TextResponse) and uri.media_type.split("/")[0] == "text":
+ charset = uri.media_type_parameters.get("charset")
+ resp_kwargs["encoding"] = charset
+
+ return respcls(url=request.url, body=uri.data, **resp_kwargs)
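
download_request above relies on w3lib's parse_data_uri, whose result exposes the media_type, media_type_parameters and data attributes used here. A quick check against a small data: URI (assuming w3lib is installed, as it is for Scrapy):

    from w3lib.url import parse_data_uri

    uri = parse_data_uri("data:text/plain;charset=utf-8,Hello")
    print(uri.media_type)                            # text/plain
    print(uri.media_type_parameters.get("charset"))  # utf-8
    print(uri.data)                                  # b'Hello'
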
diff --git a/scrapy/core/downloader/handlers/file.py b/scrapy/core/downloader/handlers/file.py
index 8fa3d2938..4824167da 100644
--- a/scrapy/core/downloader/handlers/file.py
+++ b/scrapy/core/downloader/handlers/file.py
@@ -1,8 +1,17 @@
from pathlib import Path
+
from w3lib.url import file_uri_to_path
+
from scrapy.responsetypes import responsetypes
from scrapy.utils.decorators import defers
class FileDownloadHandler:
lazy = False
+
+ @defers
+ def download_request(self, request, spider):
+ filepath = file_uri_to_path(request.url)
+ body = Path(filepath).read_bytes()
+ respcls = responsetypes.from_args(filename=filepath, body=body)
+ return respcls(url=request.url, body=body)
diff --git a/scrapy/core/downloader/handlers/ftp.py b/scrapy/core/downloader/handlers/ftp.py
index 78ad52f12..4081545ce 100644
--- a/scrapy/core/downloader/handlers/ftp.py
+++ b/scrapy/core/downloader/handlers/ftp.py
@@ -27,11 +27,14 @@ In case of status 200 request, response.headers will come with two keys:
'Local Filename' - with the value of the local filename if given
'Size' - with size of the downloaded data
"""
+
import re
from io import BytesIO
from urllib.parse import unquote
+
from twisted.internet.protocol import ClientCreator, Protocol
from twisted.protocols.ftp import CommandFailed, FTPClient
+
from scrapy.http import Response
from scrapy.responsetypes import responsetypes
from scrapy.utils.httpobj import urlparse_cached
@@ -39,21 +42,84 @@ from scrapy.utils.python import to_bytes
class ReceivedDataProtocol(Protocol):
-
def __init__(self, filename=None):
self.__filename = filename
- self.body = open(filename, 'wb') if filename else BytesIO()
+ self.body = open(filename, "wb") if filename else BytesIO()
self.size = 0
+ def dataReceived(self, data):
+ self.body.write(data)
+ self.size += len(data)
+
+ @property
+ def filename(self):
+ return self.__filename
+
+ def close(self):
+ self.body.close() if self.filename else self.body.seek(0)
+
-_CODE_RE = re.compile('\\d+')
+_CODE_RE = re.compile(r"\d+")
class FTPDownloadHandler:
lazy = False
- CODE_MAPPING = {'550': 404, 'default': 503}
+
+ CODE_MAPPING = {
+ "550": 404,
+ "default": 503,
+ }
def __init__(self, settings):
- self.default_user = settings['FTP_USER']
- self.default_password = settings['FTP_PASSWORD']
- self.passive_mode = settings['FTP_PASSIVE_MODE']
+ self.default_user = settings["FTP_USER"]
+ self.default_password = settings["FTP_PASSWORD"]
+ self.passive_mode = settings["FTP_PASSIVE_MODE"]
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler.settings)
+
+ def download_request(self, request, spider):
+ from twisted.internet import reactor
+
+ parsed_url = urlparse_cached(request)
+ user = request.meta.get("ftp_user", self.default_user)
+ password = request.meta.get("ftp_password", self.default_password)
+ passive_mode = (
+ 1 if bool(request.meta.get("ftp_passive", self.passive_mode)) else 0
+ )
+ creator = ClientCreator(
+ reactor, FTPClient, user, password, passive=passive_mode
+ )
+ dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
+ return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
+
+ def gotClient(self, client, request, filepath):
+ self.client = client
+ protocol = ReceivedDataProtocol(request.meta.get("ftp_local_filename"))
+ return client.retrieveFile(filepath, protocol).addCallbacks(
+ callback=self._build_response,
+ callbackArgs=(request, protocol),
+ errback=self._failed,
+ errbackArgs=(request,),
+ )
+
+ def _build_response(self, result, request, protocol):
+ self.result = result
+ protocol.close()
+ headers = {"local filename": protocol.filename or "", "size": protocol.size}
+ body = to_bytes(protocol.filename or protocol.body.read())
+ respcls = responsetypes.from_args(url=request.url, body=body)
+ return respcls(url=request.url, status=200, body=body, headers=headers)
+
+ def _failed(self, result, request):
+ message = result.getErrorMessage()
+ if result.type == CommandFailed:
+ m = _CODE_RE.search(message)
+ if m:
+ ftpcode = m.group()
+ httpcode = self.CODE_MAPPING.get(ftpcode, self.CODE_MAPPING["default"])
+ return Response(
+ url=request.url, status=httpcode, body=to_bytes(message)
+ )
+ raise result.type(result.value)
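
_failed above extracts the numeric FTP code from the error message with _CODE_RE and maps it onto an HTTP status, defaulting to 503. The same lookup in isolation (the error message text is a made-up example):

    import re

    _CODE_RE = re.compile(r"\d+")
    CODE_MAPPING = {"550": 404, "default": 503}

    message = "550 /missing/file.txt: No such file or directory."  # illustrative FTP error
    m = _CODE_RE.search(message)
    httpcode = CODE_MAPPING.get(m.group(), CODE_MAPPING["default"]) if m else CODE_MAPPING["default"]
    assert httpcode == 404
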
diff --git a/scrapy/core/downloader/handlers/http.py b/scrapy/core/downloader/handlers/http.py
index a62ecadc7..52535bd8b 100644
--- a/scrapy/core/downloader/handlers/http.py
+++ b/scrapy/core/downloader/handlers/http.py
@@ -1,2 +1,4 @@
from scrapy.core.downloader.handlers.http10 import HTTP10DownloadHandler
-from scrapy.core.downloader.handlers.http11 import HTTP11DownloadHandler as HTTPDownloadHandler
+from scrapy.core.downloader.handlers.http11 import (
+ HTTP11DownloadHandler as HTTPDownloadHandler,
+)
diff --git a/scrapy/core/downloader/handlers/http10.py b/scrapy/core/downloader/handlers/http10.py
index 6f9e8f618..6c1dac4a5 100644
--- a/scrapy/core/downloader/handlers/http10.py
+++ b/scrapy/core/downloader/handlers/http10.py
@@ -8,13 +8,32 @@ class HTTP10DownloadHandler:
lazy = False
def __init__(self, settings, crawler=None):
- self.HTTPClientFactory = load_object(settings[
- 'DOWNLOADER_HTTPCLIENTFACTORY'])
- self.ClientContextFactory = load_object(settings[
- 'DOWNLOADER_CLIENTCONTEXTFACTORY'])
+ self.HTTPClientFactory = load_object(settings["DOWNLOADER_HTTPCLIENTFACTORY"])
+ self.ClientContextFactory = load_object(
+ settings["DOWNLOADER_CLIENTCONTEXTFACTORY"]
+ )
self._settings = settings
self._crawler = crawler
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler.settings, crawler)
+
def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
- pass
+ factory = self.HTTPClientFactory(request)
+ self._connect(factory)
+ return factory.deferred
+
+ def _connect(self, factory):
+ from twisted.internet import reactor
+
+ host, port = to_unicode(factory.host), factory.port
+ if factory.scheme == b"https":
+ client_context_factory = create_instance(
+ objcls=self.ClientContextFactory,
+ settings=self._settings,
+ crawler=self._crawler,
+ )
+ return reactor.connectSSL(host, port, factory, client_context_factory)
+ return reactor.connectTCP(host, port, factory)
diff --git a/scrapy/core/downloader/handlers/http11.py b/scrapy/core/downloader/handlers/http11.py
index 2a58c6f22..c3704de3d 100644
--- a/scrapy/core/downloader/handlers/http11.py
+++ b/scrapy/core/downloader/handlers/http11.py
@@ -1,4 +1,5 @@
"""Download handlers for http and https schemes"""
+
import ipaddress
import logging
import re
@@ -6,15 +7,23 @@ from contextlib import suppress
from io import BytesIO
from time import time
from urllib.parse import urldefrag, urlunparse
+
from twisted.internet import defer, protocol, ssl
from twisted.internet.endpoints import TCP4ClientEndpoint
from twisted.internet.error import TimeoutError
from twisted.python.failure import Failure
-from twisted.web.client import URI, Agent, HTTPConnectionPool, ResponseDone, ResponseFailed
+from twisted.web.client import (
+ URI,
+ Agent,
+ HTTPConnectionPool,
+ ResponseDone,
+ ResponseFailed,
+)
from twisted.web.http import PotentialDataLoss, _DataLoss
from twisted.web.http_headers import Headers as TxHeaders
from twisted.web.iweb import UNKNOWN_LENGTH, IBodyProducer
from zope.interface import implementer
+
from scrapy import signals
from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
@@ -22,6 +31,7 @@ from scrapy.exceptions import StopDownload
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
from scrapy.utils.python import to_bytes, to_unicode
+
logger = logging.getLogger(__name__)
@@ -30,21 +40,58 @@ class HTTP11DownloadHandler:
def __init__(self, settings, crawler=None):
self._crawler = crawler
+
from twisted.internet import reactor
+
self._pool = HTTPConnectionPool(reactor, persistent=True)
self._pool.maxPersistentPerHost = settings.getint(
- 'CONCURRENT_REQUESTS_PER_DOMAIN')
+ "CONCURRENT_REQUESTS_PER_DOMAIN"
+ )
self._pool._factory.noisy = False
- self._contextFactory = load_context_factory_from_settings(settings,
- crawler)
- self._default_maxsize = settings.getint('DOWNLOAD_MAXSIZE')
- self._default_warnsize = settings.getint('DOWNLOAD_WARNSIZE')
- self._fail_on_dataloss = settings.getbool('DOWNLOAD_FAIL_ON_DATALOSS')
+
+ self._contextFactory = load_context_factory_from_settings(settings, crawler)
+ self._default_maxsize = settings.getint("DOWNLOAD_MAXSIZE")
+ self._default_warnsize = settings.getint("DOWNLOAD_WARNSIZE")
+ self._fail_on_dataloss = settings.getbool("DOWNLOAD_FAIL_ON_DATALOSS")
self._disconnect_timeout = 1
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler.settings, crawler)
+
def download_request(self, request, spider):
"""Return a deferred for the HTTP download"""
- pass
+ agent = ScrapyAgent(
+ contextFactory=self._contextFactory,
+ pool=self._pool,
+ maxsize=getattr(spider, "download_maxsize", self._default_maxsize),
+ warnsize=getattr(spider, "download_warnsize", self._default_warnsize),
+ fail_on_dataloss=self._fail_on_dataloss,
+ crawler=self._crawler,
+ )
+ return agent.download_request(request)
+
+ def close(self):
+ from twisted.internet import reactor
+
+ d = self._pool.closeCachedConnections()
+ # closeCachedConnections will hang on network or server issues, so
+ # we'll manually timeout the deferred.
+ #
+ # Twisted issue addressing this problem can be found here:
+ # https://twistedmatrix.com/trac/ticket/7738.
+ #
+ # closeCachedConnections doesn't handle external errbacks, so we'll
+ # issue a callback after `_disconnect_timeout` seconds.
+ delayed_call = reactor.callLater(self._disconnect_timeout, d.callback, [])
+
+ def cancel_delayed_call(result):
+ if delayed_call.active():
+ delayed_call.cancel()
+ return result
+
+ d.addBoth(cancel_delayed_call)
+ return d
class TunnelError(Exception):
@@ -59,13 +106,23 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
with this endpoint comes from the pool and a CONNECT has already been issued
for it.
"""
+
_truncatedLength = 1000
- _responseAnswer = 'HTTP/1\\.. (?P<status>\\d{3})(?P<reason>.{,' + str(
- _truncatedLength) + '})'
+ _responseAnswer = (
+ r"HTTP/1\.. (?P<status>\d{3})(?P<reason>.{," + str(_truncatedLength) + r"})"
+ )
_responseMatcher = re.compile(_responseAnswer.encode())
- def __init__(self, reactor, host, port, proxyConf, contextFactory,
- timeout=30, bindAddress=None):
+ def __init__(
+ self,
+ reactor,
+ host,
+ port,
+ proxyConf,
+ contextFactory,
+ timeout=30,
+ bindAddress=None,
+ ):
proxyHost, proxyPort, self._proxyAuthHeader = proxyConf
super().__init__(reactor, proxyHost, proxyPort, timeout, bindAddress)
self._tunnelReadyDeferred = defer.Deferred()
@@ -76,33 +133,83 @@ class TunnelingTCP4ClientEndpoint(TCP4ClientEndpoint):
def requestTunnel(self, protocol):
"""Asks the proxy to open a tunnel."""
- pass
+ tunnelReq = tunnel_request_data(
+ self._tunneledHost, self._tunneledPort, self._proxyAuthHeader
+ )
+ protocol.transport.write(tunnelReq)
+ self._protocolDataReceived = protocol.dataReceived
+ protocol.dataReceived = self.processProxyResponse
+ self._protocol = protocol
+ return protocol
def processProxyResponse(self, rcvd_bytes):
"""Processes the response from the proxy. If the tunnel is successfully
created, notifies the client that we are ready to send requests. If not
raises a TunnelError.
"""
- pass
+ self._connectBuffer += rcvd_bytes
+ # make sure that enough (all) bytes are consumed
+ # and that we've got all HTTP headers (ending with a blank line)
+ # from the proxy so that we don't send those bytes to the TLS layer
+ #
+ # see https://github.com/scrapy/scrapy/issues/2491
+ if b"\r\n\r\n" not in self._connectBuffer:
+ return
+ self._protocol.dataReceived = self._protocolDataReceived
+ respm = TunnelingTCP4ClientEndpoint._responseMatcher.match(self._connectBuffer)
+ if respm and int(respm.group("status")) == 200:
+ # set proper Server Name Indication extension
+ sslOptions = self._contextFactory.creatorForNetloc(
+ self._tunneledHost, self._tunneledPort
+ )
+ self._protocol.transport.startTLS(sslOptions, self._protocolFactory)
+ self._tunnelReadyDeferred.callback(self._protocol)
+ else:
+ if respm:
+ extra = {
+ "status": int(respm.group("status")),
+ "reason": respm.group("reason").strip(),
+ }
+ else:
+ extra = rcvd_bytes[: self._truncatedLength]
+ self._tunnelReadyDeferred.errback(
+ TunnelError(
+ "Could not open CONNECT tunnel with proxy "
+ f"{self._host}:{self._port} [{extra!r}]"
+ )
+ )
def connectFailed(self, reason):
"""Propagates the errback to the appropriate deferred."""
- pass
+ self._tunnelReadyDeferred.errback(reason)
+
+ def connect(self, protocolFactory):
+ self._protocolFactory = protocolFactory
+ connectDeferred = super().connect(protocolFactory)
+ connectDeferred.addCallback(self.requestTunnel)
+ connectDeferred.addErrback(self.connectFailed)
+ return self._tunnelReadyDeferred
def tunnel_request_data(host, port, proxy_auth_header=None):
- """
+ r"""
Return binary content of a CONNECT request.
>>> from scrapy.utils.python import to_unicode as s
>>> s(tunnel_request_data("example.com", 8080))
- 'CONNECT example.com:8080 HTTP/1.1\\r\\nHost: example.com:8080\\r\\n\\r\\n'
+ 'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\n\r\n'
>>> s(tunnel_request_data("example.com", 8080, b"123"))
- 'CONNECT example.com:8080 HTTP/1.1\\r\\nHost: example.com:8080\\r\\nProxy-Authorization: 123\\r\\n\\r\\n'
+ 'CONNECT example.com:8080 HTTP/1.1\r\nHost: example.com:8080\r\nProxy-Authorization: 123\r\n\r\n'
>>> s(tunnel_request_data(b"example.com", "8090"))
- 'CONNECT example.com:8090 HTTP/1.1\\r\\nHost: example.com:8090\\r\\n\\r\\n'
+ 'CONNECT example.com:8090 HTTP/1.1\r\nHost: example.com:8090\r\n\r\n'
"""
- pass
+ host_value = to_bytes(host, encoding="ascii") + b":" + to_bytes(str(port))
+ tunnel_req = b"CONNECT " + host_value + b" HTTP/1.1\r\n"
+ tunnel_req += b"Host: " + host_value + b"\r\n"
+ if proxy_auth_header:
+ tunnel_req += b"Proxy-Authorization: " + proxy_auth_header + b"\r\n"
+ tunnel_req += b"\r\n"
+ return tunnel_req
class TunnelingAgent(Agent):
@@ -113,27 +220,75 @@ class TunnelingAgent(Agent):
proxy involved.
"""
- def __init__(self, reactor, proxyConf, contextFactory=None,
- connectTimeout=None, bindAddress=None, pool=None):
- super().__init__(reactor, contextFactory, connectTimeout,
- bindAddress, pool)
+ def __init__(
+ self,
+ reactor,
+ proxyConf,
+ contextFactory=None,
+ connectTimeout=None,
+ bindAddress=None,
+ pool=None,
+ ):
+ super().__init__(reactor, contextFactory, connectTimeout, bindAddress, pool)
self._proxyConf = proxyConf
self._contextFactory = contextFactory
+ def _getEndpoint(self, uri):
+ return TunnelingTCP4ClientEndpoint(
+ reactor=self._reactor,
+ host=uri.host,
+ port=uri.port,
+ proxyConf=self._proxyConf,
+ contextFactory=self._contextFactory,
+ timeout=self._endpointFactory._connectTimeout,
+ bindAddress=self._endpointFactory._bindAddress,
+ )
-class ScrapyProxyAgent(Agent):
+ def _requestWithEndpoint(
+ self, key, endpoint, method, parsedURI, headers, bodyProducer, requestPath
+ ):
+ # proxy host and port are required for HTTP pool `key`
+ # otherwise, same remote host connection request could reuse
+ # a cached tunneled connection to a different proxy
+ key += self._proxyConf
+ return super()._requestWithEndpoint(
+ key=key,
+ endpoint=endpoint,
+ method=method,
+ parsedURI=parsedURI,
+ headers=headers,
+ bodyProducer=bodyProducer,
+ requestPath=requestPath,
+ )
- def __init__(self, reactor, proxyURI, connectTimeout=None, bindAddress=
- None, pool=None):
- super().__init__(reactor=reactor, connectTimeout=connectTimeout,
- bindAddress=bindAddress, pool=pool)
+
+class ScrapyProxyAgent(Agent):
+ def __init__(
+ self, reactor, proxyURI, connectTimeout=None, bindAddress=None, pool=None
+ ):
+ super().__init__(
+ reactor=reactor,
+ connectTimeout=connectTimeout,
+ bindAddress=bindAddress,
+ pool=pool,
+ )
self._proxyURI = URI.fromBytes(proxyURI)
def request(self, method, uri, headers=None, bodyProducer=None):
"""
Issue a new request via the configured proxy.
"""
- pass
+ # Cache *all* connections under the same key, since we are only
+ # connecting to a single destination, the proxy:
+ return self._requestWithEndpoint(
+ key=("http-proxy", self._proxyURI.host, self._proxyURI.port),
+ endpoint=self._getEndpoint(self._proxyURI),
+ method=method,
+ parsedURI=URI.fromBytes(uri),
+ headers=headers,
+ bodyProducer=bodyProducer,
+ requestPath=uri,
+ )
class ScrapyAgent:
@@ -141,9 +296,17 @@ class ScrapyAgent:
_ProxyAgent = ScrapyProxyAgent
_TunnelingAgent = TunnelingAgent
- def __init__(self, contextFactory=None, connectTimeout=10, bindAddress=
- None, pool=None, maxsize=0, warnsize=0, fail_on_dataloss=True,
- crawler=None):
+ def __init__(
+ self,
+ contextFactory=None,
+ connectTimeout=10,
+ bindAddress=None,
+ pool=None,
+ maxsize=0,
+ warnsize=0,
+ fail_on_dataloss=True,
+ crawler=None,
+ ):
self._contextFactory = contextFactory
self._connectTimeout = connectTimeout
self._bindAddress = bindAddress
@@ -154,19 +317,236 @@ class ScrapyAgent:
self._txresponse = None
self._crawler = crawler
+ def _get_agent(self, request, timeout):
+ from twisted.internet import reactor
+
+ bindaddress = request.meta.get("bindaddress") or self._bindAddress
+ proxy = request.meta.get("proxy")
+ if proxy:
+ proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(proxy)
+ scheme = _parse(request.url)[0]
+ proxyHost = to_unicode(proxyHost)
+ if scheme == b"https":
+ proxyAuth = request.headers.get(b"Proxy-Authorization", None)
+ proxyConf = (proxyHost, proxyPort, proxyAuth)
+ return self._TunnelingAgent(
+ reactor=reactor,
+ proxyConf=proxyConf,
+ contextFactory=self._contextFactory,
+ connectTimeout=timeout,
+ bindAddress=bindaddress,
+ pool=self._pool,
+ )
+ proxyScheme = proxyScheme or b"http"
+ proxyURI = urlunparse((proxyScheme, proxyNetloc, proxyParams, "", "", ""))
+ return self._ProxyAgent(
+ reactor=reactor,
+ proxyURI=to_bytes(proxyURI, encoding="ascii"),
+ connectTimeout=timeout,
+ bindAddress=bindaddress,
+ pool=self._pool,
+ )
+
+ return self._Agent(
+ reactor=reactor,
+ contextFactory=self._contextFactory,
+ connectTimeout=timeout,
+ bindAddress=bindaddress,
+ pool=self._pool,
+ )
+
+ def download_request(self, request):
+ from twisted.internet import reactor
+
+ timeout = request.meta.get("download_timeout") or self._connectTimeout
+ agent = self._get_agent(request, timeout)
+
+ # request details
+ url = urldefrag(request.url)[0]
+ method = to_bytes(request.method)
+ headers = TxHeaders(request.headers)
+ if isinstance(agent, self._TunnelingAgent):
+ headers.removeHeader(b"Proxy-Authorization")
+ if request.body:
+ bodyproducer = _RequestBodyProducer(request.body)
+ else:
+ bodyproducer = None
+ start_time = time()
+ d = agent.request(
+ method, to_bytes(url, encoding="ascii"), headers, bodyproducer
+ )
+ # set download latency
+ d.addCallback(self._cb_latency, request, start_time)
+ # response body is ready to be consumed
+ d.addCallback(self._cb_bodyready, request)
+ d.addCallback(self._cb_bodydone, request, url)
+ # check download timeout
+ self._timeout_cl = reactor.callLater(timeout, d.cancel)
+ d.addBoth(self._cb_timeout, request, url, timeout)
+ return d
+
+ def _cb_timeout(self, result, request, url, timeout):
+ if self._timeout_cl.active():
+ self._timeout_cl.cancel()
+ return result
+ # needed for HTTPS requests, otherwise _ResponseReader doesn't
+ # receive connectionLost()
+ if self._txresponse:
+ self._txresponse._transport.stopProducing()
+
+ raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
+
+ def _cb_latency(self, result, request, start_time):
+ request.meta["download_latency"] = time() - start_time
+ return result
+
+ @staticmethod
+ def _headers_from_twisted_response(response):
+ headers = Headers()
+ if response.length != UNKNOWN_LENGTH:
+ headers[b"Content-Length"] = str(response.length).encode()
+ headers.update(response.headers.getAllRawHeaders())
+ return headers
+
+ def _cb_bodyready(self, txresponse, request):
+ headers_received_result = self._crawler.signals.send_catch_log(
+ signal=signals.headers_received,
+ headers=self._headers_from_twisted_response(txresponse),
+ body_length=txresponse.length,
+ request=request,
+ spider=self._crawler.spider,
+ )
+ for handler, result in headers_received_result:
+ if isinstance(result, Failure) and isinstance(result.value, StopDownload):
+ logger.debug(
+ "Download stopped for %(request)s from signal handler %(handler)s",
+ {"request": request, "handler": handler.__qualname__},
+ )
+ txresponse._transport.stopProducing()
+ txresponse._transport.loseConnection()
+ return {
+ "txresponse": txresponse,
+ "body": b"",
+ "flags": ["download_stopped"],
+ "certificate": None,
+ "ip_address": None,
+ "failure": result if result.value.fail else None,
+ }
+
+ # deliverBody hangs for responses without body
+ if txresponse.length == 0:
+ return {
+ "txresponse": txresponse,
+ "body": b"",
+ "flags": None,
+ "certificate": None,
+ "ip_address": None,
+ }
+
+ maxsize = request.meta.get("download_maxsize", self._maxsize)
+ warnsize = request.meta.get("download_warnsize", self._warnsize)
+ expected_size = txresponse.length if txresponse.length != UNKNOWN_LENGTH else -1
+ fail_on_dataloss = request.meta.get(
+ "download_fail_on_dataloss", self._fail_on_dataloss
+ )
+
+ if maxsize and expected_size > maxsize:
+ warning_msg = (
+ "Cancelling download of %(url)s: expected response "
+ "size (%(size)s) larger than download max size (%(maxsize)s)."
+ )
+ warning_args = {
+ "url": request.url,
+ "size": expected_size,
+ "maxsize": maxsize,
+ }
+
+ logger.warning(warning_msg, warning_args)
+
+ txresponse._transport.loseConnection()
+ raise defer.CancelledError(warning_msg % warning_args)
+
+ if warnsize and expected_size > warnsize:
+ logger.warning(
+ "Expected response size (%(size)s) larger than "
+ "download warn size (%(warnsize)s) in request %(request)s.",
+ {"size": expected_size, "warnsize": warnsize, "request": request},
+ )
+
+ def _cancel(_):
+ # Abort connection immediately.
+ txresponse._transport._producer.abortConnection()
+
+ d = defer.Deferred(_cancel)
+ txresponse.deliverBody(
+ _ResponseReader(
+ finished=d,
+ txresponse=txresponse,
+ request=request,
+ maxsize=maxsize,
+ warnsize=warnsize,
+ fail_on_dataloss=fail_on_dataloss,
+ crawler=self._crawler,
+ )
+ )
+
+ # save response for timeouts
+ self._txresponse = txresponse
+
+ return d
+
+ def _cb_bodydone(self, result, request, url):
+ headers = self._headers_from_twisted_response(result["txresponse"])
+ respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"])
+ try:
+ version = result["txresponse"].version
+ protocol = f"{to_unicode(version[0])}/{version[1]}.{version[2]}"
+ except (AttributeError, TypeError, IndexError):
+ protocol = None
+ response = respcls(
+ url=url,
+ status=int(result["txresponse"].code),
+ headers=headers,
+ body=result["body"],
+ flags=result["flags"],
+ certificate=result["certificate"],
+ ip_address=result["ip_address"],
+ protocol=protocol,
+ )
+ if result.get("failure"):
+ result["failure"].value.response = response
+ return result["failure"]
+ return response
+
@implementer(IBodyProducer)
class _RequestBodyProducer:
-
def __init__(self, body):
self.body = body
self.length = len(body)
+ def startProducing(self, consumer):
+ consumer.write(self.body)
+ return defer.succeed(None)
-class _ResponseReader(protocol.Protocol):
+ def pauseProducing(self):
+ pass
- def __init__(self, finished, txresponse, request, maxsize, warnsize,
- fail_on_dataloss, crawler):
+ def stopProducing(self):
+ pass
+
+
+class _ResponseReader(protocol.Protocol):
+ def __init__(
+ self,
+ finished,
+ txresponse,
+ request,
+ maxsize,
+ warnsize,
+ fail_on_dataloss,
+ crawler,
+ ):
self._finished = finished
self._txresponse = txresponse
self._request = request
@@ -180,3 +560,108 @@ class _ResponseReader(protocol.Protocol):
self._certificate = None
self._ip_address = None
self._crawler = crawler
+
+ def _finish_response(self, flags=None, failure=None):
+ self._finished.callback(
+ {
+ "txresponse": self._txresponse,
+ "body": self._bodybuf.getvalue(),
+ "flags": flags,
+ "certificate": self._certificate,
+ "ip_address": self._ip_address,
+ "failure": failure,
+ }
+ )
+
+ def connectionMade(self):
+ if self._certificate is None:
+ with suppress(AttributeError):
+ self._certificate = ssl.Certificate(
+ self.transport._producer.getPeerCertificate()
+ )
+
+ if self._ip_address is None:
+ self._ip_address = ipaddress.ip_address(
+ self.transport._producer.getPeer().host
+ )
+
+ def dataReceived(self, bodyBytes):
+        # This may be called several times with buffered data after cancel was called.
+ if self._finished.called:
+ return
+
+ self._bodybuf.write(bodyBytes)
+ self._bytes_received += len(bodyBytes)
+
+ bytes_received_result = self._crawler.signals.send_catch_log(
+ signal=signals.bytes_received,
+ data=bodyBytes,
+ request=self._request,
+ spider=self._crawler.spider,
+ )
+ for handler, result in bytes_received_result:
+ if isinstance(result, Failure) and isinstance(result.value, StopDownload):
+ logger.debug(
+ "Download stopped for %(request)s from signal handler %(handler)s",
+ {"request": self._request, "handler": handler.__qualname__},
+ )
+ self.transport.stopProducing()
+ self.transport.loseConnection()
+ failure = result if result.value.fail else None
+ self._finish_response(flags=["download_stopped"], failure=failure)
+
+ if self._maxsize and self._bytes_received > self._maxsize:
+ logger.warning(
+ "Received (%(bytes)s) bytes larger than download "
+ "max size (%(maxsize)s) in request %(request)s.",
+ {
+ "bytes": self._bytes_received,
+ "maxsize": self._maxsize,
+ "request": self._request,
+ },
+ )
+ # Clear buffer earlier to avoid keeping data in memory for a long time.
+ self._bodybuf.truncate(0)
+ self._finished.cancel()
+
+ if (
+ self._warnsize
+ and self._bytes_received > self._warnsize
+ and not self._reached_warnsize
+ ):
+ self._reached_warnsize = True
+ logger.warning(
+ "Received more bytes than download "
+ "warn size (%(warnsize)s) in request %(request)s.",
+ {"warnsize": self._warnsize, "request": self._request},
+ )
+
+ def connectionLost(self, reason):
+ if self._finished.called:
+ return
+
+ if reason.check(ResponseDone):
+ self._finish_response()
+ return
+
+ if reason.check(PotentialDataLoss):
+ self._finish_response(flags=["partial"])
+ return
+
+ if reason.check(ResponseFailed) and any(
+ r.check(_DataLoss) for r in reason.value.reasons
+ ):
+ if not self._fail_on_dataloss:
+ self._finish_response(flags=["dataloss"])
+ return
+
+ if not self._fail_on_dataloss_warned:
+ logger.warning(
+ "Got data loss in %s. If you want to process broken "
+ "responses set the setting DOWNLOAD_FAIL_ON_DATALOSS = False"
+ " -- This message won't be shown in further requests",
+ self._txresponse.request.absoluteURI.decode(),
+ )
+ self._fail_on_dataloss_warned = True
+
+ self._finished.errback(reason)
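
The HTTP/1.1 handler above enforces `download_timeout` by scheduling `d.cancel()` with `reactor.callLater` and disarming that delayed call from a callback once a result arrives. The sketch below reproduces just that pattern in isolation; `download_with_timeout` is a helper invented for this example, and `twisted.internet.task.Clock` is used so it runs without starting a real reactor.

```python
from twisted.internet.defer import Deferred
from twisted.internet.error import TimeoutError
from twisted.internet.task import Clock

def download_with_timeout(clock, timeout):
    # Fire d.cancel() after `timeout` seconds unless a result arrives first.
    d = Deferred()
    timeout_cl = clock.callLater(timeout, d.cancel)

    def _cb_timeout(result):
        if timeout_cl.active():
            timeout_cl.cancel()  # result arrived in time: drop the pending cancel
            return result
        raise TimeoutError(f"took longer than {timeout} seconds")

    d.addBoth(_cb_timeout)
    return d

clock = Clock()  # deterministic stand-in for the real reactor
d = download_with_timeout(clock, timeout=5)
failures = []
d.addErrback(failures.append)
clock.advance(5)  # simulated time passes, the delayed d.cancel() fires
assert failures and failures[0].check(TimeoutError)
```
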
diff --git a/scrapy/core/downloader/handlers/http2.py b/scrapy/core/downloader/handlers/http2.py
index 37c42a70f..b2579362c 100644
--- a/scrapy/core/downloader/handlers/http2.py
+++ b/scrapy/core/downloader/handlers/http2.py
@@ -1,10 +1,12 @@
from time import time
from typing import Optional, Type, TypeVar
from urllib.parse import urldefrag
+
from twisted.internet.base import DelayedCall
from twisted.internet.defer import Deferred
from twisted.internet.error import TimeoutError
from twisted.web.client import URI
+
from scrapy.core.downloader.contextfactory import load_context_factory_from_settings
from scrapy.core.downloader.webclient import _parse
from scrapy.core.http2.agent import H2Agent, H2ConnectionPool, ScrapyProxyH2Agent
@@ -13,29 +15,116 @@ from scrapy.http import Request, Response
from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.utils.python import to_bytes
-H2DownloadHandlerOrSubclass = TypeVar('H2DownloadHandlerOrSubclass', bound=
- 'H2DownloadHandler')
+H2DownloadHandlerOrSubclass = TypeVar(
+ "H2DownloadHandlerOrSubclass", bound="H2DownloadHandler"
+)
-class H2DownloadHandler:
- def __init__(self, settings: Settings, crawler: Optional[Crawler]=None):
+class H2DownloadHandler:
+ def __init__(self, settings: Settings, crawler: Optional[Crawler] = None):
self._crawler = crawler
+
from twisted.internet import reactor
+
self._pool = H2ConnectionPool(reactor, settings)
- self._context_factory = load_context_factory_from_settings(settings,
- crawler)
+ self._context_factory = load_context_factory_from_settings(settings, crawler)
+
+ @classmethod
+ def from_crawler(
+ cls: Type[H2DownloadHandlerOrSubclass], crawler: Crawler
+ ) -> H2DownloadHandlerOrSubclass:
+ return cls(crawler.settings, crawler)
+
+ def download_request(self, request: Request, spider: Spider) -> Deferred:
+ agent = ScrapyH2Agent(
+ context_factory=self._context_factory,
+ pool=self._pool,
+ crawler=self._crawler,
+ )
+ return agent.download_request(request, spider)
+
+ def close(self) -> None:
+ self._pool.close_connections()
class ScrapyH2Agent:
_Agent = H2Agent
_ProxyAgent = ScrapyProxyH2Agent
- def __init__(self, context_factory, pool: H2ConnectionPool,
- connect_timeout: int=10, bind_address: Optional[bytes]=None,
- crawler: Optional[Crawler]=None) ->None:
+ def __init__(
+ self,
+ context_factory,
+ pool: H2ConnectionPool,
+ connect_timeout: int = 10,
+ bind_address: Optional[bytes] = None,
+ crawler: Optional[Crawler] = None,
+ ) -> None:
self._context_factory = context_factory
self._connect_timeout = connect_timeout
self._bind_address = bind_address
self._pool = pool
self._crawler = crawler
+
+ def _get_agent(self, request: Request, timeout: Optional[float]) -> H2Agent:
+ from twisted.internet import reactor
+
+ bind_address = request.meta.get("bindaddress") or self._bind_address
+ proxy = request.meta.get("proxy")
+ if proxy:
+ _, _, proxy_host, proxy_port, proxy_params = _parse(proxy)
+ scheme = _parse(request.url)[0]
+
+ if scheme == b"https":
+ # ToDo
+ raise NotImplementedError(
+ "Tunneling via CONNECT method using HTTP/2.0 is not yet supported"
+ )
+ return self._ProxyAgent(
+ reactor=reactor,
+ context_factory=self._context_factory,
+ proxy_uri=URI.fromBytes(to_bytes(proxy, encoding="ascii")),
+ connect_timeout=timeout,
+ bind_address=bind_address,
+ pool=self._pool,
+ )
+
+ return self._Agent(
+ reactor=reactor,
+ context_factory=self._context_factory,
+ connect_timeout=timeout,
+ bind_address=bind_address,
+ pool=self._pool,
+ )
+
+ def download_request(self, request: Request, spider: Spider) -> Deferred:
+ from twisted.internet import reactor
+
+ timeout = request.meta.get("download_timeout") or self._connect_timeout
+ agent = self._get_agent(request, timeout)
+
+ start_time = time()
+ d = agent.request(request, spider)
+ d.addCallback(self._cb_latency, request, start_time)
+
+ timeout_cl = reactor.callLater(timeout, d.cancel)
+ d.addBoth(self._cb_timeout, request, timeout, timeout_cl)
+ return d
+
+ @staticmethod
+ def _cb_latency(
+ response: Response, request: Request, start_time: float
+ ) -> Response:
+ request.meta["download_latency"] = time() - start_time
+ return response
+
+ @staticmethod
+ def _cb_timeout(
+ response: Response, request: Request, timeout: float, timeout_cl: DelayedCall
+ ) -> Response:
+ if timeout_cl.active():
+ timeout_cl.cancel()
+ return response
+
+ url = urldefrag(request.url)[0]
+ raise TimeoutError(f"Getting {url} took longer than {timeout} seconds.")
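
Both HTTP download handlers expose a `from_crawler` classmethod that simply forwards `crawler.settings` and the crawler itself to `__init__`, with a `TypeVar` bound so subclasses get the right return type. Below is a minimal sketch of that construction pattern; `FakeSettings`, `FakeCrawler` and the demo handler classes are stand-ins invented for the example, not Scrapy classes.

```python
from typing import Optional, Type, TypeVar

class FakeSettings(dict):
    """Stand-in for scrapy.settings.Settings (assumption for this demo)."""

class FakeCrawler:
    """Stand-in for scrapy.crawler.Crawler (assumption for this demo)."""
    def __init__(self, settings: FakeSettings) -> None:
        self.settings = settings

HandlerOrSubclass = TypeVar("HandlerOrSubclass", bound="DemoDownloadHandler")

class DemoDownloadHandler:
    def __init__(self, settings: FakeSettings, crawler: Optional[FakeCrawler] = None):
        self.settings = settings
        self.crawler = crawler

    @classmethod
    def from_crawler(
        cls: Type[HandlerOrSubclass], crawler: FakeCrawler
    ) -> HandlerOrSubclass:
        # Same shape as H2DownloadHandler.from_crawler above.
        return cls(crawler.settings, crawler)

class CustomHandler(DemoDownloadHandler):
    pass

handler = CustomHandler.from_crawler(FakeCrawler(FakeSettings(DOWNLOAD_MAXSIZE=0)))
assert isinstance(handler, CustomHandler) and handler.crawler is not None
```
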
diff --git a/scrapy/core/downloader/handlers/s3.py b/scrapy/core/downloader/handlers/s3.py
index 6f341f7e2..81d8e8115 100644
--- a/scrapy/core/downloader/handlers/s3.py
+++ b/scrapy/core/downloader/handlers/s3.py
@@ -6,33 +6,78 @@ from scrapy.utils.misc import create_instance
class S3DownloadHandler:
-
- def __init__(self, settings, *, crawler=None, aws_access_key_id=None,
- aws_secret_access_key=None, aws_session_token=None,
- httpdownloadhandler=HTTPDownloadHandler, **kw):
+ def __init__(
+ self,
+ settings,
+ *,
+ crawler=None,
+ aws_access_key_id=None,
+ aws_secret_access_key=None,
+ aws_session_token=None,
+ httpdownloadhandler=HTTPDownloadHandler,
+ **kw,
+ ):
if not is_botocore_available():
- raise NotConfigured('missing botocore library')
+ raise NotConfigured("missing botocore library")
+
if not aws_access_key_id:
- aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
+ aws_access_key_id = settings["AWS_ACCESS_KEY_ID"]
if not aws_secret_access_key:
- aws_secret_access_key = settings['AWS_SECRET_ACCESS_KEY']
+ aws_secret_access_key = settings["AWS_SECRET_ACCESS_KEY"]
if not aws_session_token:
- aws_session_token = settings['AWS_SESSION_TOKEN']
- anon = kw.get('anon')
- if (anon is None and not aws_access_key_id and not
- aws_secret_access_key):
- kw['anon'] = True
- self.anon = kw.get('anon')
+ aws_session_token = settings["AWS_SESSION_TOKEN"]
+
+ # If no credentials could be found anywhere,
+ # consider this an anonymous connection request by default;
+ # unless 'anon' was set explicitly (True/False).
+ anon = kw.get("anon")
+ if anon is None and not aws_access_key_id and not aws_secret_access_key:
+ kw["anon"] = True
+ self.anon = kw.get("anon")
+
self._signer = None
import botocore.auth
import botocore.credentials
- kw.pop('anon', None)
+
+ kw.pop("anon", None)
if kw:
- raise TypeError(f'Unexpected keyword arguments: {kw}')
+ raise TypeError(f"Unexpected keyword arguments: {kw}")
if not self.anon:
- SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
- self._signer = SignerCls(botocore.credentials.Credentials(
- aws_access_key_id, aws_secret_access_key, aws_session_token))
- _http_handler = create_instance(objcls=httpdownloadhandler,
- settings=settings, crawler=crawler)
+ SignerCls = botocore.auth.AUTH_TYPE_MAPS["s3"]
+ self._signer = SignerCls(
+ botocore.credentials.Credentials(
+ aws_access_key_id, aws_secret_access_key, aws_session_token
+ )
+ )
+
+ _http_handler = create_instance(
+ objcls=httpdownloadhandler,
+ settings=settings,
+ crawler=crawler,
+ )
self._download_http = _http_handler.download_request
+
+ @classmethod
+ def from_crawler(cls, crawler, **kwargs):
+ return cls(crawler.settings, crawler=crawler, **kwargs)
+
+ def download_request(self, request, spider):
+ p = urlparse_cached(request)
+ scheme = "https" if request.meta.get("is_secure") else "http"
+ bucket = p.hostname
+ path = p.path + "?" + p.query if p.query else p.path
+ url = f"{scheme}://{bucket}.s3.amazonaws.com{path}"
+ if self.anon:
+ request = request.replace(url=url)
+ else:
+ import botocore.awsrequest
+
+ awsrequest = botocore.awsrequest.AWSRequest(
+ method=request.method,
+ url=f"{scheme}://s3.amazonaws.com/{bucket}{path}",
+ headers=request.headers.to_unicode_dict(),
+ data=request.body,
+ )
+ self._signer.add_auth(awsrequest)
+ request = request.replace(url=url, headers=awsrequest.headers.items())
+ return self._download_http(request, spider)
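
The S3 handler rewrites `s3://bucket/key` URLs into virtual-hosted HTTP(S) URLs before delegating to the HTTP handler (and, when credentials are available, first signs a path-style request with botocore). The following standard-library-only approximation shows just the URL rewriting step; `s3_to_http_url` is a hypothetical helper, not part of Scrapy.

```python
from urllib.parse import urlparse

def s3_to_http_url(s3_url: str, secure: bool = True) -> str:
    p = urlparse(s3_url)                  # s3://bucket/path?query
    scheme = "https" if secure else "http"
    bucket = p.hostname
    path = p.path + "?" + p.query if p.query else p.path
    return f"{scheme}://{bucket}.s3.amazonaws.com{path}"

assert (
    s3_to_http_url("s3://my-bucket/some/key.json?versionId=abc")
    == "https://my-bucket.s3.amazonaws.com/some/key.json?versionId=abc"
)
```
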
diff --git a/scrapy/core/downloader/middleware.py b/scrapy/core/downloader/middleware.py
index 377040b14..dca13c01e 100644
--- a/scrapy/core/downloader/middleware.py
+++ b/scrapy/core/downloader/middleware.py
@@ -4,8 +4,10 @@ Downloader Middleware manager
See documentation in docs/topics/downloader-middleware.rst
"""
from typing import Any, Callable, Generator, List, Union, cast
+
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python.failure import Failure
+
from scrapy import Spider
from scrapy.exceptions import _InvalidOutput
from scrapy.http import Request, Response
@@ -16,4 +18,86 @@ from scrapy.utils.defer import deferred_from_coro, mustbe_deferred
class DownloaderMiddlewareManager(MiddlewareManager):
- component_name = 'downloader middleware'
+ component_name = "downloader middleware"
+
+ @classmethod
+ def _get_mwlist_from_settings(cls, settings: BaseSettings) -> List[Any]:
+ return build_component_list(settings.getwithbase("DOWNLOADER_MIDDLEWARES"))
+
+ def _add_middleware(self, mw: Any) -> None:
+ if hasattr(mw, "process_request"):
+ self.methods["process_request"].append(mw.process_request)
+ if hasattr(mw, "process_response"):
+ self.methods["process_response"].appendleft(mw.process_response)
+ if hasattr(mw, "process_exception"):
+ self.methods["process_exception"].appendleft(mw.process_exception)
+
+ def download(
+ self, download_func: Callable, request: Request, spider: Spider
+ ) -> Deferred:
+ @inlineCallbacks
+ def process_request(request: Request) -> Generator[Deferred, Any, Any]:
+ for method in self.methods["process_request"]:
+ method = cast(Callable, method)
+ response = yield deferred_from_coro(
+ method(request=request, spider=spider)
+ )
+ if response is not None and not isinstance(
+ response, (Response, Request)
+ ):
+ raise _InvalidOutput(
+ f"Middleware {method.__qualname__} must return None, Response or "
+ f"Request, got {response.__class__.__name__}"
+ )
+ if response:
+ return response
+ return (yield download_func(request=request, spider=spider))
+
+ @inlineCallbacks
+ def process_response(
+ response: Union[Response, Request]
+ ) -> Generator[Deferred, Any, Union[Response, Request]]:
+ if response is None:
+ raise TypeError("Received None in process_response")
+ elif isinstance(response, Request):
+ return response
+
+ for method in self.methods["process_response"]:
+ method = cast(Callable, method)
+ response = yield deferred_from_coro(
+ method(request=request, response=response, spider=spider)
+ )
+ if not isinstance(response, (Response, Request)):
+ raise _InvalidOutput(
+ f"Middleware {method.__qualname__} must return Response or Request, "
+ f"got {type(response)}"
+ )
+ if isinstance(response, Request):
+ return response
+ return response
+
+ @inlineCallbacks
+ def process_exception(
+ failure: Failure,
+ ) -> Generator[Deferred, Any, Union[Failure, Response, Request]]:
+ exception = failure.value
+ for method in self.methods["process_exception"]:
+ method = cast(Callable, method)
+ response = yield deferred_from_coro(
+ method(request=request, exception=exception, spider=spider)
+ )
+ if response is not None and not isinstance(
+ response, (Response, Request)
+ ):
+ raise _InvalidOutput(
+ f"Middleware {method.__qualname__} must return None, Response or "
+ f"Request, got {type(response)}"
+ )
+ if response:
+ return response
+ return failure
+
+ deferred = mustbe_deferred(process_request, request)
+ deferred.addErrback(process_exception)
+ deferred.addCallback(process_response)
+ return deferred
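
`_add_middleware` appends `process_request` hooks but `appendleft`s `process_response` and `process_exception`, which is what makes request hooks run in registration order and response/exception hooks run in reverse. A tiny stand-alone illustration of that ordering rule (not the real manager):

```python
from collections import deque

methods = {"process_request": deque(), "process_response": deque()}

for name in ("A", "B", "C"):
    methods["process_request"].append(name)       # request hooks: first-in, first-run
    methods["process_response"].appendleft(name)  # response hooks: last-in, first-run

assert list(methods["process_request"]) == ["A", "B", "C"]
assert list(methods["process_response"]) == ["C", "B", "A"]
```
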
diff --git a/scrapy/core/downloader/tls.py b/scrapy/core/downloader/tls.py
index 1295d26c4..33cea7263 100644
--- a/scrapy/core/downloader/tls.py
+++ b/scrapy/core/downloader/tls.py
@@ -1,18 +1,32 @@
import logging
from typing import Any, Dict
+
from OpenSSL import SSL
from service_identity.exceptions import CertificateError
-from twisted.internet._sslverify import ClientTLSOptions, VerificationError, verifyHostname
+from twisted.internet._sslverify import (
+ ClientTLSOptions,
+ VerificationError,
+ verifyHostname,
+)
from twisted.internet.ssl import AcceptableCiphers
+
from scrapy.utils.ssl import get_temp_key_info, x509name_to_string
+
logger = logging.getLogger(__name__)
-METHOD_TLS = 'TLS'
-METHOD_TLSv10 = 'TLSv1.0'
-METHOD_TLSv11 = 'TLSv1.1'
-METHOD_TLSv12 = 'TLSv1.2'
-openssl_methods: Dict[str, int] = {METHOD_TLS: SSL.SSLv23_METHOD,
- METHOD_TLSv10: SSL.TLSv1_METHOD, METHOD_TLSv11: SSL.TLSv1_1_METHOD,
- METHOD_TLSv12: SSL.TLSv1_2_METHOD}
+
+
+METHOD_TLS = "TLS"
+METHOD_TLSv10 = "TLSv1.0"
+METHOD_TLSv11 = "TLSv1.1"
+METHOD_TLSv12 = "TLSv1.2"
+
+
+openssl_methods: Dict[str, int] = {
+ METHOD_TLS: SSL.SSLv23_METHOD, # protocol negotiation (recommended)
+ METHOD_TLSv10: SSL.TLSv1_METHOD, # TLS 1.0 only
+ METHOD_TLSv11: SSL.TLSv1_1_METHOD, # TLS 1.1 only
+ METHOD_TLSv12: SSL.TLSv1_2_METHOD, # TLS 1.2 only
+}
class ScrapyClientTLSOptions(ClientTLSOptions):
@@ -26,11 +40,52 @@ class ScrapyClientTLSOptions(ClientTLSOptions):
logging warnings. Also, HTTPS connection parameters logging is added.
"""
- def __init__(self, hostname: str, ctx: SSL.Context, verbose_logging:
- bool=False):
+ def __init__(self, hostname: str, ctx: SSL.Context, verbose_logging: bool = False):
super().__init__(hostname, ctx)
self.verbose_logging: bool = verbose_logging
+ def _identityVerifyingInfoCallback(
+ self, connection: SSL.Connection, where: int, ret: Any
+ ) -> None:
+ if where & SSL.SSL_CB_HANDSHAKE_START:
+ connection.set_tlsext_host_name(self._hostnameBytes)
+ elif where & SSL.SSL_CB_HANDSHAKE_DONE:
+ if self.verbose_logging:
+ logger.debug(
+ "SSL connection to %s using protocol %s, cipher %s",
+ self._hostnameASCII,
+ connection.get_protocol_version_name(),
+ connection.get_cipher_name(),
+ )
+ server_cert = connection.get_peer_certificate()
+ if server_cert:
+ logger.debug(
+ 'SSL connection certificate: issuer "%s", subject "%s"',
+ x509name_to_string(server_cert.get_issuer()),
+ x509name_to_string(server_cert.get_subject()),
+ )
+ key_info = get_temp_key_info(connection._ssl)
+ if key_info:
+ logger.debug("SSL temp key: %s", key_info)
+
+ try:
+ verifyHostname(connection, self._hostnameASCII)
+ except (CertificateError, VerificationError) as e:
+ logger.warning(
+ 'Remote certificate is not valid for hostname "%s"; %s',
+ self._hostnameASCII,
+ e,
+ )
+
+ except ValueError as e:
+ logger.warning(
+ "Ignoring error while verifying certificate "
+ 'from host "%s" (exception: %r)',
+ self._hostnameASCII,
+ e,
+ )
+
DEFAULT_CIPHERS: AcceptableCiphers = AcceptableCiphers.fromOpenSSLCipherString(
- 'DEFAULT')
+ "DEFAULT"
+)
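
`openssl_methods` maps the TLS method names accepted in settings to pyOpenSSL constants. A short usage sketch, assuming pyOpenSSL is installed; `tls_method_from_setting` is an illustrative helper rather than Scrapy API.

```python
from OpenSSL import SSL

openssl_methods = {
    "TLS": SSL.SSLv23_METHOD,        # negotiate the best shared version
    "TLSv1.0": SSL.TLSv1_METHOD,
    "TLSv1.1": SSL.TLSv1_1_METHOD,
    "TLSv1.2": SSL.TLSv1_2_METHOD,
}

def tls_method_from_setting(value: str) -> int:
    # Turn a settings string such as "TLSv1.2" into a pyOpenSSL method constant.
    try:
        return openssl_methods[value]
    except KeyError:
        raise ValueError(f"Unknown TLS method: {value!r}") from None

ctx = SSL.Context(tls_method_from_setting("TLS"))
```
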
diff --git a/scrapy/core/downloader/webclient.py b/scrapy/core/downloader/webclient.py
index 96778332a..bb1f73805 100644
--- a/scrapy/core/downloader/webclient.py
+++ b/scrapy/core/downloader/webclient.py
@@ -2,9 +2,11 @@ import re
from time import time
from typing import Optional, Tuple
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
+
from twisted.internet import defer
from twisted.internet.protocol import ClientFactory
from twisted.web.http import HTTPClient
+
from scrapy import Request
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
@@ -12,48 +14,185 @@ from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes, to_unicode
-def _parse(url: str) ->Tuple[bytes, bytes, bytes, int, bytes]:
+def _parsed_url_args(parsed: ParseResult) -> Tuple[bytes, bytes, bytes, int, bytes]:
+ # Assume parsed is urlparse-d from Request.url,
+ # which was passed via safe_url_string and is ascii-only.
+ path_str = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
+ path = to_bytes(path_str, encoding="ascii")
+ assert parsed.hostname is not None
+ host = to_bytes(parsed.hostname, encoding="ascii")
+ port = parsed.port
+ scheme = to_bytes(parsed.scheme, encoding="ascii")
+ netloc = to_bytes(parsed.netloc, encoding="ascii")
+ if port is None:
+ port = 443 if scheme == b"https" else 80
+ return scheme, netloc, host, port, path
+
+
+def _parse(url: str) -> Tuple[bytes, bytes, bytes, int, bytes]:
"""Return tuple of (scheme, netloc, host, port, path),
all in bytes except for port which is int.
Assume url is from Request.url, which was passed via safe_url_string
and is ascii-only.
"""
- pass
+ url = url.strip()
+ if not re.match(r"^\w+://", url):
+ url = "//" + url
+ parsed = urlparse(url)
+ return _parsed_url_args(parsed)
class ScrapyHTTPPageGetter(HTTPClient):
- delimiter = b'\n'
+ delimiter = b"\n"
+
+ def connectionMade(self):
+ self.headers = Headers() # bucket for response headers
+
+ # Method command
+ self.sendCommand(self.factory.method, self.factory.path)
+ # Headers
+ for key, values in self.factory.headers.items():
+ for value in values:
+ self.sendHeader(key, value)
+ self.endHeaders()
+ # Body
+ if self.factory.body is not None:
+ self.transport.write(self.factory.body)
+
+ def lineReceived(self, line):
+ return HTTPClient.lineReceived(self, line.rstrip())
+
+ def handleHeader(self, key, value):
+ self.headers.appendlist(key, value)
+
+ def handleStatus(self, version, status, message):
+ self.factory.gotStatus(version, status, message)
+
+ def handleEndHeaders(self):
+ self.factory.gotHeaders(self.headers)
+
+ def connectionLost(self, reason):
+ self._connection_lost_reason = reason
+ HTTPClient.connectionLost(self, reason)
+ self.factory.noPage(reason)
+
+ def handleResponse(self, response):
+ if self.factory.method.upper() == b"HEAD":
+ self.factory.page(b"")
+ elif self.length is not None and self.length > 0:
+ self.factory.noPage(self._connection_lost_reason)
+ else:
+ self.factory.page(response)
+ self.transport.loseConnection()
+
+ def timeout(self):
+ self.transport.loseConnection()
+
+ # transport cleanup needed for HTTPS connections
+ if self.factory.url.startswith(b"https"):
+ self.transport.stopProducing()
+ self.factory.noPage(
+ defer.TimeoutError(
+ f"Getting {self.factory.url} took longer "
+ f"than {self.factory.timeout} seconds."
+ )
+ )
+
+# This class used to inherit from Twisted's
+# twisted.web.client.HTTPClientFactory. When that class was deprecated in
+# Twisted (https://github.com/twisted/twisted/pull/643), we merged its
+# non-overridden code into this class.
class ScrapyHTTPClientFactory(ClientFactory):
protocol = ScrapyHTTPPageGetter
+
waiting = 1
noisy = False
followRedirect = False
afterFoundGet = False
- def __init__(self, request: Request, timeout: float=180):
+ def _build_response(self, body, request):
+ request.meta["download_latency"] = self.headers_time - self.start_time
+ status = int(self.status)
+ headers = Headers(self.response_headers)
+ respcls = responsetypes.from_args(headers=headers, url=self._url, body=body)
+ return respcls(
+ url=self._url,
+ status=status,
+ headers=headers,
+ body=body,
+ protocol=to_unicode(self.version),
+ )
+
+ def _set_connection_attributes(self, request):
+ parsed = urlparse_cached(request)
+ self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(
+ parsed
+ )
+ proxy = request.meta.get("proxy")
+ if proxy:
+ self.scheme, _, self.host, self.port, _ = _parse(proxy)
+ self.path = self.url
+
+ def __init__(self, request: Request, timeout: float = 180):
self._url: str = urldefrag(request.url)[0]
- self.url: bytes = to_bytes(self._url, encoding='ascii')
- self.method: bytes = to_bytes(request.method, encoding='ascii')
+ # converting to bytes to comply to Twisted interface
+ self.url: bytes = to_bytes(self._url, encoding="ascii")
+ self.method: bytes = to_bytes(request.method, encoding="ascii")
self.body: Optional[bytes] = request.body or None
self.headers: Headers = Headers(request.headers)
self.response_headers: Optional[Headers] = None
- self.timeout: float = request.meta.get('download_timeout') or timeout
+ self.timeout: float = request.meta.get("download_timeout") or timeout
self.start_time: float = time()
- self.deferred: defer.Deferred = defer.Deferred().addCallback(self.
- _build_response, request)
+ self.deferred: defer.Deferred = defer.Deferred().addCallback(
+ self._build_response, request
+ )
+
+        # Fixes Twisted 11.1.0+ support as HTTPClientFactory is expected
+        # to have _disconnectedDeferred. See Twisted r32329.
+        # As Scrapy implements its own redirect handling, there is no need
+        # to add the _waitForDisconnect callback.
+        # Specifically, this avoids the AttributeError exception when the
+        # clientConnectionFailed method is called.
self._disconnectedDeferred: defer.Deferred = defer.Deferred()
+
self._set_connection_attributes(request)
- self.headers.setdefault('Host', self.netloc)
+
+        # set Host header based on url
+ self.headers.setdefault("Host", self.netloc)
+
+        # set Content-Length based on the length of the body
if self.body is not None:
- self.headers['Content-Length'] = len(self.body)
- self.headers.setdefault('Connection', 'close')
- elif self.method == b'POST':
- self.headers['Content-Length'] = 0
+ self.headers["Content-Length"] = len(self.body)
+ # just in case a broken http/1.1 decides to keep connection alive
+ self.headers.setdefault("Connection", "close")
+ # Content-Length must be specified in POST method even with no body
+ elif self.method == b"POST":
+ self.headers["Content-Length"] = 0
+
+ def __repr__(self) -> str:
+ return f"<{self.__class__.__name__}: {self._url}>"
+
+ def _cancelTimeout(self, result, timeoutCall):
+ if timeoutCall.active():
+ timeoutCall.cancel()
+ return result
+
+ def buildProtocol(self, addr):
+ p = ClientFactory.buildProtocol(self, addr)
+ p.followRedirect = self.followRedirect
+ p.afterFoundGet = self.afterFoundGet
+ if self.timeout:
+ from twisted.internet import reactor
- def __repr__(self) ->str:
- return f'<{self.__class__.__name__}: {self._url}>'
+ timeoutCall = reactor.callLater(self.timeout, p.timeout)
+ self.deferred.addBoth(self._cancelTimeout, timeoutCall)
+ return p
+
+ def gotHeaders(self, headers):
+ self.headers_time = time()
+ self.response_headers = headers
def gotStatus(self, version, status, message):
"""
@@ -66,7 +205,17 @@ class ScrapyHTTPClientFactory(ClientFactory):
@param message: The HTTP status message.
@type message: L{bytes}
"""
- pass
+ self.version, self.status, self.message = version, status, message
+
+ def page(self, page):
+ if self.waiting:
+ self.waiting = 0
+ self.deferred.callback(page)
+
+ def noPage(self, reason):
+ if self.waiting:
+ self.waiting = 0
+ self.deferred.errback(reason)
def clientConnectionFailed(self, _, reason):
"""
@@ -74,4 +223,9 @@ class ScrapyHTTPClientFactory(ClientFactory):
result has yet been provided to the result Deferred, provide the
connection failure reason as an error result.
"""
- pass
+ if self.waiting:
+ self.waiting = 0
+ # If the connection attempt failed, there is nothing more to
+ # disconnect, so just fire that Deferred now.
+ self._disconnectedDeferred.callback(None)
+ self.deferred.errback(reason)
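
`_parse` normalizes a request URL into the `(scheme, netloc, host, port, path)` tuple the legacy web client needs, defaulting the port to 80/443 and the path to `/`. The stand-alone restatement below mirrors that behaviour using only the standard library; `parse_for_twisted` is a made-up name for the example, not the Scrapy function.

```python
import re
from urllib.parse import urlparse, urlunparse

def parse_for_twisted(url: str):
    url = url.strip()
    if not re.match(r"^\w+://", url):
        url = "//" + url
    p = urlparse(url)
    path = urlunparse(("", "", p.path or "/", p.params, p.query, "")).encode("ascii")
    scheme = p.scheme.encode("ascii")
    port = p.port or (443 if scheme == b"https" else 80)
    return scheme, p.netloc.encode("ascii"), p.hostname.encode("ascii"), port, path

assert parse_for_twisted("https://example.com") == (
    b"https", b"example.com", b"example.com", 443, b"/"
)
assert parse_for_twisted("http://example.com:8080/a?b=1") == (
    b"http", b"example.com:8080", b"example.com", 8080, b"/a?b=1"
)
```
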
diff --git a/scrapy/core/engine.py b/scrapy/core/engine.py
index e7403fb61..92967ffc8 100644
--- a/scrapy/core/engine.py
+++ b/scrapy/core/engine.py
@@ -6,10 +6,24 @@ For more information see docs/topics/architecture.rst
"""
import logging
from time import time
-from typing import TYPE_CHECKING, Any, Callable, Generator, Iterable, Iterator, Optional, Set, Type, Union, cast
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Generator,
+ Iterable,
+ Iterator,
+ Optional,
+ Set,
+ Type,
+ Union,
+ cast,
+)
+
from twisted.internet.defer import Deferred, inlineCallbacks, succeed
from twisted.internet.task import LoopingCall
from twisted.python.failure import Failure
+
from scrapy import signals
from scrapy.core.downloader import Downloader
from scrapy.core.scraper import Scraper
@@ -23,30 +37,54 @@ from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import global_object_name
from scrapy.utils.reactor import CallLaterOnce
+
if TYPE_CHECKING:
from scrapy.core.scheduler import BaseScheduler
from scrapy.crawler import Crawler
+
logger = logging.getLogger(__name__)
class Slot:
-
- def __init__(self, start_requests: Iterable[Request], close_if_idle:
- bool, nextcall: CallLaterOnce, scheduler: 'BaseScheduler') ->None:
+ def __init__(
+ self,
+ start_requests: Iterable[Request],
+ close_if_idle: bool,
+ nextcall: CallLaterOnce,
+ scheduler: "BaseScheduler",
+ ) -> None:
self.closing: Optional[Deferred] = None
self.inprogress: Set[Request] = set()
self.start_requests: Optional[Iterator[Request]] = iter(start_requests)
self.close_if_idle: bool = close_if_idle
self.nextcall: CallLaterOnce = nextcall
- self.scheduler: 'BaseScheduler' = scheduler
+ self.scheduler: "BaseScheduler" = scheduler
self.heartbeat: LoopingCall = LoopingCall(nextcall.schedule)
+ def add_request(self, request: Request) -> None:
+ self.inprogress.add(request)
-class ExecutionEngine:
+ def remove_request(self, request: Request) -> None:
+ self.inprogress.remove(request)
+ self._maybe_fire_closing()
+
+ def close(self) -> Deferred:
+ self.closing = Deferred()
+ self._maybe_fire_closing()
+ return self.closing
+
+ def _maybe_fire_closing(self) -> None:
+ if self.closing is not None and not self.inprogress:
+ if self.nextcall:
+ self.nextcall.cancel()
+ if self.heartbeat.running:
+ self.heartbeat.stop()
+ self.closing.callback(None)
- def __init__(self, crawler: 'Crawler', spider_closed_callback: Callable
- ) ->None:
- self.crawler: 'Crawler' = crawler
+
+class ExecutionEngine:
+ def __init__(self, crawler: "Crawler", spider_closed_callback: Callable) -> None:
+ self.crawler: "Crawler" = crawler
self.settings: Settings = crawler.settings
self.signals: SignalManager = crawler.signals
assert crawler.logformatter
@@ -55,43 +93,386 @@ class ExecutionEngine:
self.spider: Optional[Spider] = None
self.running: bool = False
self.paused: bool = False
- self.scheduler_cls: Type['BaseScheduler'] = self._get_scheduler_class(
- crawler.settings)
- downloader_cls: Type[Downloader] = load_object(self.settings[
- 'DOWNLOADER'])
+ self.scheduler_cls: Type["BaseScheduler"] = self._get_scheduler_class(
+ crawler.settings
+ )
+ downloader_cls: Type[Downloader] = load_object(self.settings["DOWNLOADER"])
self.downloader: Downloader = downloader_cls(crawler)
self.scraper = Scraper(crawler)
self._spider_closed_callback: Callable = spider_closed_callback
self.start_time: Optional[float] = None
- def stop(self) ->Deferred:
+ def _get_scheduler_class(self, settings: BaseSettings) -> Type["BaseScheduler"]:
+ from scrapy.core.scheduler import BaseScheduler
+
+ scheduler_cls: Type = load_object(settings["SCHEDULER"])
+ if not issubclass(scheduler_cls, BaseScheduler):
+ raise TypeError(
+ f"The provided scheduler class ({settings['SCHEDULER']})"
+ " does not fully implement the scheduler interface"
+ )
+ return scheduler_cls
+
+ @inlineCallbacks
+ def start(self) -> Generator[Deferred, Any, None]:
+ if self.running:
+ raise RuntimeError("Engine already running")
+ self.start_time = time()
+ yield self.signals.send_catch_log_deferred(signal=signals.engine_started)
+ self.running = True
+ self._closewait: Deferred = Deferred()
+ yield self._closewait
+
+ def stop(self) -> Deferred:
"""Gracefully stop the execution engine"""
- pass
- def close(self) ->Deferred:
+ @inlineCallbacks
+ def _finish_stopping_engine(_: Any) -> Generator[Deferred, Any, None]:
+ yield self.signals.send_catch_log_deferred(signal=signals.engine_stopped)
+ self._closewait.callback(None)
+
+ if not self.running:
+ raise RuntimeError("Engine not running")
+
+ self.running = False
+ dfd = (
+ self.close_spider(self.spider, reason="shutdown")
+ if self.spider is not None
+ else succeed(None)
+ )
+ return dfd.addBoth(_finish_stopping_engine)
+
+ def close(self) -> Deferred:
"""
Gracefully close the execution engine.
If it has already been started, stop it. In all cases, close the spider and the downloader.
"""
- pass
+ if self.running:
+ return self.stop() # will also close spider and downloader
+ if self.spider is not None:
+ return self.close_spider(
+ self.spider, reason="shutdown"
+ ) # will also close downloader
+ self.downloader.close()
+ return succeed(None)
+
+ def pause(self) -> None:
+ self.paused = True
+
+ def unpause(self) -> None:
+ self.paused = False
+
+ def _next_request(self) -> None:
+ if self.slot is None:
+ return
+
+ assert self.spider is not None # typing
+
+ if self.paused:
+ return None
+
+ while (
+ not self._needs_backout()
+ and self._next_request_from_scheduler() is not None
+ ):
+ pass
+
+ if self.slot.start_requests is not None and not self._needs_backout():
+ try:
+ request = next(self.slot.start_requests)
+ except StopIteration:
+ self.slot.start_requests = None
+ except Exception:
+ self.slot.start_requests = None
+ logger.error(
+ "Error while obtaining start requests",
+ exc_info=True,
+ extra={"spider": self.spider},
+ )
+ else:
+ self.crawl(request)
+
+ if self.spider_is_idle() and self.slot.close_if_idle:
+ self._spider_idle()
+
+ def _needs_backout(self) -> bool:
+ assert self.slot is not None # typing
+ assert self.scraper.slot is not None # typing
+ return (
+ not self.running
+ or bool(self.slot.closing)
+ or self.downloader.needs_backout()
+ or self.scraper.slot.needs_backout()
+ )
+
+ def _next_request_from_scheduler(self) -> Optional[Deferred]:
+ assert self.slot is not None # typing
+ assert self.spider is not None # typing
+
+ request = self.slot.scheduler.next_request()
+ if request is None:
+ return None
+
+ d = self._download(request)
+ d.addBoth(self._handle_downloader_output, request)
+ d.addErrback(
+ lambda f: logger.info(
+ "Error while handling downloader output",
+ exc_info=failure_to_exc_info(f),
+ extra={"spider": self.spider},
+ )
+ )
+
+ def _remove_request(_: Any) -> None:
+ assert self.slot
+ self.slot.remove_request(request)
+
+ d.addBoth(_remove_request)
+ d.addErrback(
+ lambda f: logger.info(
+ "Error while removing request from slot",
+ exc_info=failure_to_exc_info(f),
+ extra={"spider": self.spider},
+ )
+ )
+ slot = self.slot
+ d.addBoth(lambda _: slot.nextcall.schedule())
+ d.addErrback(
+ lambda f: logger.info(
+ "Error while scheduling new request",
+ exc_info=failure_to_exc_info(f),
+ extra={"spider": self.spider},
+ )
+ )
+ return d
- def crawl(self, request: Request) ->None:
+ def _handle_downloader_output(
+ self, result: Union[Request, Response, Failure], request: Request
+ ) -> Optional[Deferred]:
+ assert self.spider is not None # typing
+
+ if not isinstance(result, (Request, Response, Failure)):
+ raise TypeError(
+ f"Incorrect type: expected Request, Response or Failure, got {type(result)}: {result!r}"
+ )
+
+ # downloader middleware can return requests (for example, redirects)
+ if isinstance(result, Request):
+ self.crawl(result)
+ return None
+
+ d = self.scraper.enqueue_scrape(result, request, self.spider)
+ d.addErrback(
+ lambda f: logger.error(
+ "Error while enqueuing downloader output",
+ exc_info=failure_to_exc_info(f),
+ extra={"spider": self.spider},
+ )
+ )
+ return d
+
+ def spider_is_idle(self) -> bool:
+ if self.slot is None:
+ raise RuntimeError("Engine slot not assigned")
+ if not self.scraper.slot.is_idle(): # type: ignore[union-attr]
+ return False
+ if self.downloader.active: # downloader has pending requests
+ return False
+ if self.slot.start_requests is not None: # not all start requests are handled
+ return False
+ if self.slot.scheduler.has_pending_requests():
+ return False
+ return True
+
+ def crawl(self, request: Request) -> None:
"""Inject the request into the spider <-> downloader pipeline"""
- pass
+ if self.spider is None:
+ raise RuntimeError(f"No open spider to crawl: {request}")
+ self._schedule_request(request, self.spider)
+ self.slot.nextcall.schedule() # type: ignore[union-attr]
+
+ def _schedule_request(self, request: Request, spider: Spider) -> None:
+ request_scheduled_result = self.signals.send_catch_log(
+ signals.request_scheduled,
+ request=request,
+ spider=spider,
+ dont_log=IgnoreRequest,
+ )
+ for handler, result in request_scheduled_result:
+ if isinstance(result, Failure) and isinstance(result.value, IgnoreRequest):
+ logger.debug(
+ f"Signal handler {global_object_name(handler)} dropped "
+ f"request {request} before it reached the scheduler."
+ )
+ return
+ if not self.slot.scheduler.enqueue_request(request): # type: ignore[union-attr]
+ self.signals.send_catch_log(
+ signals.request_dropped, request=request, spider=spider
+ )
- def download(self, request: Request) ->Deferred:
+ def download(self, request: Request) -> Deferred:
"""Return a Deferred which fires with a Response as result, only downloader middlewares are applied"""
- pass
+ if self.spider is None:
+ raise RuntimeError(f"No open spider to crawl: {request}")
+ return self._download(request).addBoth(self._downloaded, request)
+
+ def _downloaded(
+ self, result: Union[Response, Request, Failure], request: Request
+ ) -> Union[Deferred, Response, Failure]:
+ assert self.slot is not None # typing
+ self.slot.remove_request(request)
+ return self.download(result) if isinstance(result, Request) else result
+
+ def _download(self, request: Request) -> Deferred:
+ assert self.slot is not None # typing
- def _spider_idle(self) ->None:
+ self.slot.add_request(request)
+
+ def _on_success(result: Union[Response, Request]) -> Union[Response, Request]:
+ if not isinstance(result, (Response, Request)):
+ raise TypeError(
+ f"Incorrect type: expected Response or Request, got {type(result)}: {result!r}"
+ )
+ if isinstance(result, Response):
+ if result.request is None:
+ result.request = request
+ assert self.spider is not None
+ logkws = self.logformatter.crawled(result.request, result, self.spider)
+ if logkws is not None:
+ logger.log(
+ *logformatter_adapter(logkws), extra={"spider": self.spider}
+ )
+ self.signals.send_catch_log(
+ signal=signals.response_received,
+ response=result,
+ request=result.request,
+ spider=self.spider,
+ )
+ return result
+
+ def _on_complete(_: Any) -> Any:
+ assert self.slot is not None
+ self.slot.nextcall.schedule()
+ return _
+
+ assert self.spider is not None
+ dwld = self.downloader.fetch(request, self.spider)
+ dwld.addCallbacks(_on_success)
+ dwld.addBoth(_on_complete)
+ return dwld
+
+ @inlineCallbacks
+ def open_spider(
+ self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True
+ ) -> Generator[Deferred, Any, None]:
+ if self.slot is not None:
+ raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
+ logger.info("Spider opened", extra={"spider": spider})
+ nextcall = CallLaterOnce(self._next_request)
+ scheduler = create_instance(
+ self.scheduler_cls, settings=None, crawler=self.crawler
+ )
+ start_requests = yield self.scraper.spidermw.process_start_requests(
+ start_requests, spider
+ )
+ self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
+ self.spider = spider
+ if hasattr(scheduler, "open"):
+ yield scheduler.open(spider)
+ yield self.scraper.open_spider(spider)
+ assert self.crawler.stats
+ self.crawler.stats.open_spider(spider)
+ yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
+ self.slot.nextcall.schedule()
+ self.slot.heartbeat.start(5)
+
+ def _spider_idle(self) -> None:
"""
Called when a spider gets idle, i.e. when there are no remaining requests to download or schedule.
It can be called multiple times. If a handler for the spider_idle signal raises a DontCloseSpider
exception, the spider is not closed until the next loop and this function is guaranteed to be called
(at least) once again. A handler can raise CloseSpider to provide a custom closing reason.
"""
- pass
+ assert self.spider is not None # typing
+ expected_ex = (DontCloseSpider, CloseSpider)
+ res = self.signals.send_catch_log(
+ signals.spider_idle, spider=self.spider, dont_log=expected_ex
+ )
+ detected_ex = {
+ ex: x.value
+ for _, x in res
+ for ex in expected_ex
+ if isinstance(x, Failure) and isinstance(x.value, ex)
+ }
+ if DontCloseSpider in detected_ex:
+ return None
+ if self.spider_is_idle():
+ ex = detected_ex.get(CloseSpider, CloseSpider(reason="finished"))
+ assert isinstance(ex, CloseSpider) # typing
+ self.close_spider(self.spider, reason=ex.reason)
- def close_spider(self, spider: Spider, reason: str='cancelled') ->Deferred:
+ def close_spider(self, spider: Spider, reason: str = "cancelled") -> Deferred:
"""Close (cancel) spider and clear all its outstanding requests"""
- pass
+ if self.slot is None:
+ raise RuntimeError("Engine slot not assigned")
+
+ if self.slot.closing is not None:
+ return self.slot.closing
+
+ logger.info(
+ "Closing spider (%(reason)s)", {"reason": reason}, extra={"spider": spider}
+ )
+
+ dfd = self.slot.close()
+
+ def log_failure(msg: str) -> Callable:
+ def errback(failure: Failure) -> None:
+ logger.error(
+ msg, exc_info=failure_to_exc_info(failure), extra={"spider": spider}
+ )
+
+ return errback
+
+ dfd.addBoth(lambda _: self.downloader.close())
+ dfd.addErrback(log_failure("Downloader close failure"))
+
+ dfd.addBoth(lambda _: self.scraper.close_spider(spider))
+ dfd.addErrback(log_failure("Scraper close failure"))
+
+ if hasattr(self.slot.scheduler, "close"):
+ dfd.addBoth(lambda _: cast(Slot, self.slot).scheduler.close(reason))
+ dfd.addErrback(log_failure("Scheduler close failure"))
+
+ dfd.addBoth(
+ lambda _: self.signals.send_catch_log_deferred(
+ signal=signals.spider_closed,
+ spider=spider,
+ reason=reason,
+ )
+ )
+ dfd.addErrback(log_failure("Error while sending spider_close signal"))
+
+ def close_stats(_: Any) -> None:
+ assert self.crawler.stats
+ self.crawler.stats.close_spider(spider, reason=reason)
+
+ dfd.addBoth(close_stats)
+ dfd.addErrback(log_failure("Stats close failure"))
+
+ dfd.addBoth(
+ lambda _: logger.info(
+ "Spider closed (%(reason)s)",
+ {"reason": reason},
+ extra={"spider": spider},
+ )
+ )
+
+ dfd.addBoth(lambda _: setattr(self, "slot", None))
+ dfd.addErrback(log_failure("Error while unassigning slot"))
+
+ dfd.addBoth(lambda _: setattr(self, "spider", None))
+ dfd.addErrback(log_failure("Error while unassigning spider"))
+
+ dfd.addBoth(lambda _: self._spider_closed_callback(spider))
+
+ return dfd
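
`close_spider` chains every cleanup step with `addBoth` and gives each step its own logging errback, so a failing step is reported without short-circuiting the rest of the shutdown. A condensed, self-contained sketch of that chaining style; the step names and helpers below are invented for the example.

```python
from twisted.internet.defer import succeed

log = []

def step(name, fail=False):
    def _step(_):
        if fail:
            raise RuntimeError(f"{name} failed")
        log.append(name)
    return _step

def log_failure(msg):
    def errback(failure):
        log.append(f"error: {msg}: {failure.value}")
    return errback

dfd = succeed(None)  # already-fired Deferred, so the chain runs synchronously
dfd.addBoth(step("close downloader"))
dfd.addErrback(log_failure("Downloader close failure"))
dfd.addBoth(step("close scraper", fail=True))
dfd.addErrback(log_failure("Scraper close failure"))
dfd.addBoth(step("close stats"))
dfd.addErrback(log_failure("Stats close failure"))

assert log == [
    "close downloader",
    "error: Scraper close failure: close scraper failed",
    "close stats",
]
```
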
diff --git a/scrapy/core/http2/agent.py b/scrapy/core/http2/agent.py
index caf067cd1..215ea9716 100644
--- a/scrapy/core/http2/agent.py
+++ b/scrapy/core/http2/agent.py
@@ -1,12 +1,19 @@
from collections import deque
from typing import Deque, Dict, List, Optional, Tuple
+
from twisted.internet import defer
from twisted.internet.base import ReactorBase
from twisted.internet.defer import Deferred
from twisted.internet.endpoints import HostnameEndpoint
from twisted.python.failure import Failure
-from twisted.web.client import URI, BrowserLikePolicyForHTTPS, ResponseFailed, _StandardEndpointFactory
+from twisted.web.client import (
+ URI,
+ BrowserLikePolicyForHTTPS,
+ ResponseFailed,
+ _StandardEndpointFactory,
+)
from twisted.web.error import SchemeNotSupported
+
from scrapy.core.downloader.contextfactory import AcceptableProtocolsContextFactory
from scrapy.core.http2.protocol import H2ClientFactory, H2ClientProtocol
from scrapy.http.request import Request
@@ -15,54 +22,148 @@ from scrapy.spiders import Spider
class H2ConnectionPool:
-
- def __init__(self, reactor: ReactorBase, settings: Settings) ->None:
+ def __init__(self, reactor: ReactorBase, settings: Settings) -> None:
self._reactor = reactor
self.settings = settings
+
+ # Store a dictionary which is used to get the respective
+ # H2ClientProtocolInstance using the key as Tuple(scheme, hostname, port)
self._connections: Dict[Tuple, H2ClientProtocol] = {}
+
+ # Save all requests that arrive before the connection is established
self._pending_requests: Dict[Tuple, Deque[Deferred]] = {}
- def close_connections(self) ->None:
+ def get_connection(
+ self, key: Tuple, uri: URI, endpoint: HostnameEndpoint
+ ) -> Deferred:
+ if key in self._pending_requests:
+ # Received a request while connecting to remote
+ # Create a deferred which will fire with the H2ClientProtocol
+ # instance
+ d: Deferred = Deferred()
+ self._pending_requests[key].append(d)
+ return d
+
+ # Check if we already have a connection to the remote
+ conn = self._connections.get(key, None)
+ if conn:
+ # Return this connection instance wrapped inside a deferred
+ return defer.succeed(conn)
+
+ # No connection is established for the given URI
+ return self._new_connection(key, uri, endpoint)
+
+ def _new_connection(
+ self, key: Tuple, uri: URI, endpoint: HostnameEndpoint
+ ) -> Deferred:
+ self._pending_requests[key] = deque()
+
+ conn_lost_deferred: Deferred = Deferred()
+ conn_lost_deferred.addCallback(self._remove_connection, key)
+
+ factory = H2ClientFactory(uri, self.settings, conn_lost_deferred)
+ conn_d = endpoint.connect(factory)
+ conn_d.addCallback(self.put_connection, key)
+
+ d: Deferred = Deferred()
+ self._pending_requests[key].append(d)
+ return d
+
+ def put_connection(self, conn: H2ClientProtocol, key: Tuple) -> H2ClientProtocol:
+ self._connections[key] = conn
+
+        # Now that we have established a proper HTTP/2 connection
+        # we fire all the deferreds with the connection instance
+ pending_requests = self._pending_requests.pop(key, None)
+ while pending_requests:
+ d = pending_requests.popleft()
+ d.callback(conn)
+
+ return conn
+
+ def _remove_connection(self, errors: List[BaseException], key: Tuple) -> None:
+ self._connections.pop(key)
+
+ # Call the errback of all the pending requests for this connection
+ pending_requests = self._pending_requests.pop(key, None)
+ while pending_requests:
+ d = pending_requests.popleft()
+ d.errback(ResponseFailed(errors))
+
+ def close_connections(self) -> None:
"""Close all the HTTP/2 connections and remove them from pool
Returns:
Deferred that fires when all connections have been closed
"""
- pass
+ for conn in self._connections.values():
+ assert conn.transport is not None # typing
+ conn.transport.abortConnection()
class H2Agent:
-
- def __init__(self, reactor: ReactorBase, pool: H2ConnectionPool,
- context_factory: BrowserLikePolicyForHTTPS=
- BrowserLikePolicyForHTTPS(), connect_timeout: Optional[float]=None,
- bind_address: Optional[bytes]=None) ->None:
+ def __init__(
+ self,
+ reactor: ReactorBase,
+ pool: H2ConnectionPool,
+ context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
+ connect_timeout: Optional[float] = None,
+ bind_address: Optional[bytes] = None,
+ ) -> None:
self._reactor = reactor
self._pool = pool
self._context_factory = AcceptableProtocolsContextFactory(
- context_factory, acceptable_protocols=[b'h2'])
- self.endpoint_factory = _StandardEndpointFactory(self._reactor,
- self._context_factory, connect_timeout, bind_address)
+ context_factory, acceptable_protocols=[b"h2"]
+ )
+ self.endpoint_factory = _StandardEndpointFactory(
+ self._reactor, self._context_factory, connect_timeout, bind_address
+ )
- def get_key(self, uri: URI) ->Tuple:
+ def get_endpoint(self, uri: URI):
+ return self.endpoint_factory.endpointForURI(uri)
+
+ def get_key(self, uri: URI) -> Tuple:
"""
Arguments:
uri - URI obtained directly from request URL
"""
- pass
+ return uri.scheme, uri.host, uri.port
+ def request(self, request: Request, spider: Spider) -> Deferred:
+ uri = URI.fromBytes(bytes(request.url, encoding="utf-8"))
+ try:
+ endpoint = self.get_endpoint(uri)
+ except SchemeNotSupported:
+ return defer.fail(Failure())
+
+ key = self.get_key(uri)
+ d = self._pool.get_connection(key, uri, endpoint)
+ d.addCallback(lambda conn: conn.request(request, spider))
+ return d
-class ScrapyProxyH2Agent(H2Agent):
- def __init__(self, reactor: ReactorBase, proxy_uri: URI, pool:
- H2ConnectionPool, context_factory: BrowserLikePolicyForHTTPS=
- BrowserLikePolicyForHTTPS(), connect_timeout: Optional[float]=None,
- bind_address: Optional[bytes]=None) ->None:
- super().__init__(reactor=reactor, pool=pool, context_factory=
- context_factory, connect_timeout=connect_timeout, bind_address=
- bind_address)
+class ScrapyProxyH2Agent(H2Agent):
+ def __init__(
+ self,
+ reactor: ReactorBase,
+ proxy_uri: URI,
+ pool: H2ConnectionPool,
+ context_factory: BrowserLikePolicyForHTTPS = BrowserLikePolicyForHTTPS(),
+ connect_timeout: Optional[float] = None,
+ bind_address: Optional[bytes] = None,
+ ) -> None:
+ super().__init__(
+ reactor=reactor,
+ pool=pool,
+ context_factory=context_factory,
+ connect_timeout=connect_timeout,
+ bind_address=bind_address,
+ )
self._proxy_uri = proxy_uri
- def get_key(self, uri: URI) ->Tuple:
+ def get_endpoint(self, uri: URI):
+ return self.endpoint_factory.endpointForURI(self._proxy_uri)
+
+ def get_key(self, uri: URI) -> Tuple:
"""We use the proxy uri instead of uri obtained from request url"""
- pass
+ return "http-proxy", self._proxy_uri.host, self._proxy_uri.port
diff --git a/scrapy/core/http2/protocol.py b/scrapy/core/http2/protocol.py
index 7f0ed1e12..bc8da50d7 100644
--- a/scrapy/core/http2/protocol.py
+++ b/scrapy/core/http2/protocol.py
@@ -4,10 +4,21 @@ import logging
from collections import deque
from ipaddress import IPv4Address, IPv6Address
from typing import Dict, List, Optional, Union
+
from h2.config import H2Configuration
from h2.connection import H2Connection
from h2.errors import ErrorCodes
-from h2.events import ConnectionTerminated, DataReceived, Event, ResponseReceived, SettingsAcknowledged, StreamEnded, StreamReset, UnknownFrameReceived, WindowUpdated
+from h2.events import (
+ ConnectionTerminated,
+ DataReceived,
+ Event,
+ ResponseReceived,
+ SettingsAcknowledged,
+ StreamEnded,
+ StreamReset,
+ UnknownFrameReceived,
+ WindowUpdated,
+)
from h2.exceptions import FrameTooLargeError, H2Error
from twisted.internet.defer import Deferred
from twisted.internet.error import TimeoutError
@@ -18,54 +29,56 @@ from twisted.protocols.policies import TimeoutMixin
from twisted.python.failure import Failure
from twisted.web.client import URI
from zope.interface import implementer
+
from scrapy.core.http2.stream import Stream, StreamCloseReason
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy.spiders import Spider
+
logger = logging.getLogger(__name__)
-PROTOCOL_NAME = b'h2'
-class InvalidNegotiatedProtocol(H2Error):
+PROTOCOL_NAME = b"h2"
+
- def __init__(self, negotiated_protocol: bytes) ->None:
+class InvalidNegotiatedProtocol(H2Error):
+ def __init__(self, negotiated_protocol: bytes) -> None:
self.negotiated_protocol = negotiated_protocol
- def __str__(self) ->str:
- return (
- f'Expected {PROTOCOL_NAME!r}, received {self.negotiated_protocol!r}'
- )
+ def __str__(self) -> str:
+ return f"Expected {PROTOCOL_NAME!r}, received {self.negotiated_protocol!r}"
class RemoteTerminatedConnection(H2Error):
-
- def __init__(self, remote_ip_address: Optional[Union[IPv4Address,
- IPv6Address]], event: ConnectionTerminated) ->None:
+ def __init__(
+ self,
+ remote_ip_address: Optional[Union[IPv4Address, IPv6Address]],
+ event: ConnectionTerminated,
+ ) -> None:
self.remote_ip_address = remote_ip_address
self.terminate_event = event
- def __str__(self) ->str:
- return f'Received GOAWAY frame from {self.remote_ip_address!r}'
+ def __str__(self) -> str:
+ return f"Received GOAWAY frame from {self.remote_ip_address!r}"
class MethodNotAllowed405(H2Error):
-
- def __init__(self, remote_ip_address: Optional[Union[IPv4Address,
- IPv6Address]]) ->None:
+ def __init__(
+ self, remote_ip_address: Optional[Union[IPv4Address, IPv6Address]]
+ ) -> None:
self.remote_ip_address = remote_ip_address
- def __str__(self) ->str:
- return (
- f"Received 'HTTP/2.0 405 Method Not Allowed' from {self.remote_ip_address!r}"
- )
+ def __str__(self) -> str:
+ return f"Received 'HTTP/2.0 405 Method Not Allowed' from {self.remote_ip_address!r}"
@implementer(IHandshakeListener)
class H2ClientProtocol(Protocol, TimeoutMixin):
IDLE_TIMEOUT = 240
- def __init__(self, uri: URI, settings: Settings, conn_lost_deferred:
- Deferred) ->None:
+ def __init__(
+ self, uri: URI, settings: Settings, conn_lost_deferred: Deferred
+ ) -> None:
"""
Arguments:
uri -- URI of the base url to which HTTP/2 Connection will be made.
@@ -76,108 +89,350 @@ class H2ClientProtocol(Protocol, TimeoutMixin):
that connection was lost
"""
self._conn_lost_deferred = conn_lost_deferred
- config = H2Configuration(client_side=True, header_encoding='utf-8')
+
+ config = H2Configuration(client_side=True, header_encoding="utf-8")
self.conn = H2Connection(config=config)
+
+ # ID of the next request stream
+ # Following the convention - 'Streams initiated by a client MUST
+ # use odd-numbered stream identifiers' (RFC 7540 - Section 5.1.1)
self._stream_id_generator = itertools.count(start=1, step=2)
+
+ # Streams are stored in a dictionary keyed off their stream IDs
self.streams: Dict[int, Stream] = {}
+
+ # If requests are received before connection is made we keep
+ # all requests in a pool and send them as the connection is made
self._pending_request_stream_pool: deque = deque()
+
+ # Save an instance of errors raised which lead to losing the connection
+ # We pass these instances to the streams ResponseFailed() failure
self._conn_lost_errors: List[BaseException] = []
- self.metadata: Dict = {'certificate': None, 'ip_address': None,
- 'uri': uri, 'default_download_maxsize': settings.getint(
- 'DOWNLOAD_MAXSIZE'), 'default_download_warnsize': settings.
- getint('DOWNLOAD_WARNSIZE'), 'active_streams': 0,
- 'settings_acknowledged': False}
+
+ # Some meta data of this connection
+ # initialized when connection is successfully made
+ self.metadata: Dict = {
+ # Peer certificate instance
+ "certificate": None,
+ # Address of the server we are connected to which
+ # is updated when HTTP/2 connection is made successfully
+ "ip_address": None,
+            # URI of the peer the HTTP/2 connection is made to
+ "uri": uri,
+            # Both ip_address and uri are used by the Stream before
+            # initiating the request, to verify that the request is being
+            # made to the same base address as this connection
+ # Variables taken from Project Settings
+ "default_download_maxsize": settings.getint("DOWNLOAD_MAXSIZE"),
+ "default_download_warnsize": settings.getint("DOWNLOAD_WARNSIZE"),
+ # Counter to keep track of opened streams. This counter
+ # is used to make sure that not more than MAX_CONCURRENT_STREAMS
+ # streams are opened which leads to ProtocolError
+ # We use simple FIFO policy to handle pending requests
+ "active_streams": 0,
+ # Flag to keep track if settings were acknowledged by the remote
+ # This ensures that we have established a HTTP/2 connection
+ "settings_acknowledged": False,
+ }
@property
- def h2_connected(self) ->bool:
+ def h2_connected(self) -> bool:
"""Boolean to keep track of the connection status.
This is used while initiating pending streams to make sure
that we initiate stream only during active HTTP/2 Connection
"""
- pass
+ assert self.transport is not None # typing
+ return bool(self.transport.connected) and self.metadata["settings_acknowledged"]
@property
- def allowed_max_concurrent_streams(self) ->int:
+ def allowed_max_concurrent_streams(self) -> int:
"""We keep total two streams for client (sending data) and
server side (receiving data) for a single request. To be safe
we choose the minimum. Since this value can change in event
RemoteSettingsChanged we make variable a property.
"""
- pass
+ return min(
+ self.conn.local_settings.max_concurrent_streams,
+ self.conn.remote_settings.max_concurrent_streams,
+ )
- def _send_pending_requests(self) ->None:
+ def _send_pending_requests(self) -> None:
"""Initiate all pending requests from the deque following FIFO
We make sure that at any time {allowed_max_concurrent_streams}
streams are active.
"""
- pass
-
- def pop_stream(self, stream_id: int) ->Stream:
+ while (
+ self._pending_request_stream_pool
+ and self.metadata["active_streams"] < self.allowed_max_concurrent_streams
+ and self.h2_connected
+ ):
+ self.metadata["active_streams"] += 1
+ stream = self._pending_request_stream_pool.popleft()
+ stream.initiate_request()
+ self._write_to_transport()
+
+ def pop_stream(self, stream_id: int) -> Stream:
"""Perform cleanup when a stream is closed"""
- pass
+ stream = self.streams.pop(stream_id)
+ self.metadata["active_streams"] -= 1
+ self._send_pending_requests()
+ return stream
- def _new_stream(self, request: Request, spider: Spider) ->Stream:
+ def _new_stream(self, request: Request, spider: Spider) -> Stream:
"""Instantiates a new Stream object"""
- pass
-
- def _write_to_transport(self) ->None:
+ stream = Stream(
+ stream_id=next(self._stream_id_generator),
+ request=request,
+ protocol=self,
+ download_maxsize=getattr(
+ spider, "download_maxsize", self.metadata["default_download_maxsize"]
+ ),
+ download_warnsize=getattr(
+ spider, "download_warnsize", self.metadata["default_download_warnsize"]
+ ),
+ )
+ self.streams[stream.stream_id] = stream
+ return stream
+
+ def _write_to_transport(self) -> None:
"""Write data to the underlying transport connection
from the HTTP2 connection instance if any
"""
- pass
+ assert self.transport is not None # typing
+ # Reset the idle timeout as connection is still actively sending data
+ self.resetTimeout()
- def connectionMade(self) ->None:
+ data = self.conn.data_to_send()
+ self.transport.write(data)
+
+ def request(self, request: Request, spider: Spider) -> Deferred:
+ if not isinstance(request, Request):
+ raise TypeError(
+ f"Expected scrapy.http.Request, received {request.__class__.__qualname__}"
+ )
+
+ stream = self._new_stream(request, spider)
+ d = stream.get_response()
+
+ # Add the stream to the request pool
+ self._pending_request_stream_pool.append(stream)
+
+ # If we receive a request when connection is idle
+ # We need to initiate pending requests
+ self._send_pending_requests()
+ return d
+
+ def connectionMade(self) -> None:
"""Called by Twisted when the connection is established. We can start
sending some data now: we should open with the connection preamble.
"""
- pass
+ # Initialize the timeout
+ self.setTimeout(self.IDLE_TIMEOUT)
+
+ assert self.transport is not None # typing
+ destination = self.transport.getPeer()
+ self.metadata["ip_address"] = ipaddress.ip_address(destination.host)
- def _lose_connection_with_error(self, errors: List[BaseException]) ->None:
+ # Initiate H2 Connection
+ self.conn.initiate_connection()
+ self._write_to_transport()
+
+ def _lose_connection_with_error(self, errors: List[BaseException]) -> None:
"""Helper function to lose the connection with the error sent as a
reason"""
- pass
+ self._conn_lost_errors += errors
+ assert self.transport is not None # typing
+ self.transport.loseConnection()
- def handshakeCompleted(self) ->None:
+ def handshakeCompleted(self) -> None:
"""
Close the connection if it's not made via the expected protocol
"""
- pass
+ assert self.transport is not None # typing
+ if (
+ self.transport.negotiatedProtocol is not None
+ and self.transport.negotiatedProtocol != PROTOCOL_NAME
+ ):
+ # we have not initiated the connection yet, no need to send a GOAWAY frame to the remote peer
+ self._lose_connection_with_error(
+ [InvalidNegotiatedProtocol(self.transport.negotiatedProtocol)]
+ )
- def _check_received_data(self, data: bytes) ->None:
+ def _check_received_data(self, data: bytes) -> None:
"""Checks for edge cases where the connection to remote fails
without raising an appropriate H2Error
Arguments:
data -- Data received from the remote
"""
- pass
-
- def timeoutConnection(self) ->None:
+ if data.startswith(b"HTTP/2.0 405 Method Not Allowed"):
+ raise MethodNotAllowed405(self.metadata["ip_address"])
+
+ def dataReceived(self, data: bytes) -> None:
+ # Reset the idle timeout as connection is still actively receiving data
+ self.resetTimeout()
+
+ try:
+ self._check_received_data(data)
+ events = self.conn.receive_data(data)
+ self._handle_events(events)
+ except H2Error as e:
+ if isinstance(e, FrameTooLargeError):
+ # hyper-h2 does not drop the connection in this scenario, we
+ # need to abort the connection manually.
+ self._conn_lost_errors += [e]
+ assert self.transport is not None # typing
+ self.transport.abortConnection()
+ return
+
+ # Save this error as ultimately the connection will be dropped
+ # internally by hyper-h2. Saved error will be passed to all the streams
+ # closed with the connection.
+ self._lose_connection_with_error([e])
+ finally:
+ self._write_to_transport()
+
+ def timeoutConnection(self) -> None:
"""Called when the connection times out.
We lose the connection with TimeoutError"""
- pass
- def connectionLost(self, reason: Failure=connectionDone) ->None:
+ # Check whether there are open streams. If there are, we're going to
+ # want to use the error code PROTOCOL_ERROR. If there aren't, use
+ # NO_ERROR.
+ if (
+ self.conn.open_outbound_streams > 0
+ or self.conn.open_inbound_streams > 0
+ or self.metadata["active_streams"] > 0
+ ):
+ error_code = ErrorCodes.PROTOCOL_ERROR
+ else:
+ error_code = ErrorCodes.NO_ERROR
+ self.conn.close_connection(error_code=error_code)
+ self._write_to_transport()
+
+ self._lose_connection_with_error(
+ [TimeoutError(f"Connection was IDLE for more than {self.IDLE_TIMEOUT}s")]
+ )
+
+ def connectionLost(self, reason: Failure = connectionDone) -> None:
"""Called by Twisted when the transport connection is lost.
No need to write anything to transport here.
"""
- pass
+ # Cancel the timeout if not done yet
+ self.setTimeout(None)
+
+ # Notify the connection pool instance such that no new requests are
+ # sent over current connection
+ if not reason.check(connectionDone):
+ self._conn_lost_errors.append(reason)
+
+ self._conn_lost_deferred.callback(self._conn_lost_errors)
+
+ for stream in self.streams.values():
+ if stream.metadata["request_sent"]:
+ close_reason = StreamCloseReason.CONNECTION_LOST
+ else:
+ close_reason = StreamCloseReason.INACTIVE
+ stream.close(close_reason, self._conn_lost_errors, from_protocol=True)
- def _handle_events(self, events: List[Event]) ->None:
+ self.metadata["active_streams"] -= len(self.streams)
+ self.streams.clear()
+ self._pending_request_stream_pool.clear()
+ self.conn.close_connection()
+
+ def _handle_events(self, events: List[Event]) -> None:
"""Private method which acts as a bridge between the events
received from the HTTP/2 data and IH2EventsHandler
Arguments:
events -- A list of events that the remote peer triggered by sending data
"""
- pass
+ for event in events:
+ if isinstance(event, ConnectionTerminated):
+ self.connection_terminated(event)
+ elif isinstance(event, DataReceived):
+ self.data_received(event)
+ elif isinstance(event, ResponseReceived):
+ self.response_received(event)
+ elif isinstance(event, StreamEnded):
+ self.stream_ended(event)
+ elif isinstance(event, StreamReset):
+ self.stream_reset(event)
+ elif isinstance(event, WindowUpdated):
+ self.window_updated(event)
+ elif isinstance(event, SettingsAcknowledged):
+ self.settings_acknowledged(event)
+ elif isinstance(event, UnknownFrameReceived):
+ logger.warning("Unknown frame received: %s", event.frame)
+
+ # Event handler functions starts here
+ def connection_terminated(self, event: ConnectionTerminated) -> None:
+ self._lose_connection_with_error(
+ [RemoteTerminatedConnection(self.metadata["ip_address"], event)]
+ )
+
+ def data_received(self, event: DataReceived) -> None:
+ try:
+ stream = self.streams[event.stream_id]
+ except KeyError:
+ pass # We ignore server-initiated events
+ else:
+ stream.receive_data(event.data, event.flow_controlled_length)
+
+ def response_received(self, event: ResponseReceived) -> None:
+ try:
+ stream = self.streams[event.stream_id]
+ except KeyError:
+ pass # We ignore server-initiated events
+ else:
+ stream.receive_headers(event.headers)
+
+ def settings_acknowledged(self, event: SettingsAcknowledged) -> None:
+ self.metadata["settings_acknowledged"] = True
+
+ # Send off all the pending requests as now we have
+ # established a proper HTTP/2 connection
+ self._send_pending_requests()
+
+ # Update certificate when our HTTP/2 connection is established
+ assert self.transport is not None # typing
+ self.metadata["certificate"] = Certificate(self.transport.getPeerCertificate())
+
+ def stream_ended(self, event: StreamEnded) -> None:
+ try:
+ stream = self.pop_stream(event.stream_id)
+ except KeyError:
+ pass # We ignore server-initiated events
+ else:
+ stream.close(StreamCloseReason.ENDED, from_protocol=True)
+
+ def stream_reset(self, event: StreamReset) -> None:
+ try:
+ stream = self.pop_stream(event.stream_id)
+ except KeyError:
+ pass # We ignore server-initiated events
+ else:
+ stream.close(StreamCloseReason.RESET, from_protocol=True)
+
+ def window_updated(self, event: WindowUpdated) -> None:
+ if event.stream_id != 0:
+ self.streams[event.stream_id].receive_window_update()
+ else:
+ # Send leftover data for all the streams
+ for stream in self.streams.values():
+ stream.receive_window_update()
@implementer(IProtocolNegotiationFactory)
class H2ClientFactory(Factory):
-
- def __init__(self, uri: URI, settings: Settings, conn_lost_deferred:
- Deferred) ->None:
+ def __init__(
+ self, uri: URI, settings: Settings, conn_lost_deferred: Deferred
+ ) -> None:
self.uri = uri
self.settings = settings
self.conn_lost_deferred = conn_lost_deferred
+
+ def buildProtocol(self, addr) -> H2ClientProtocol:
+ return H2ClientProtocol(self.uri, self.settings, self.conn_lost_deferred)
+
+ def acceptableProtocols(self) -> List[bytes]:
+ return [PROTOCOL_NAME]
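Taken together, the H2ClientProtocol changes implement a FIFO pool of pending streams gated by the negotiated concurrency limit: requests are queued, streams are initiated only while active_streams stays below allowed_max_concurrent_streams, and closing a stream frees a slot and drains the pool again. A minimal, self-contained sketch of that bookkeeping (PoolModel and its names are illustrative, not part of Scrapy's API):

    from collections import deque

    class PoolModel:
        def __init__(self, limit: int) -> None:
            self.limit = limit          # stands in for allowed_max_concurrent_streams
            self.active = 0             # stands in for metadata["active_streams"]
            self.pending = deque()      # stands in for _pending_request_stream_pool

        def request(self, name: str) -> None:
            self.pending.append(name)
            self.send_pending()

        def send_pending(self) -> None:
            # initiate pending streams in FIFO order while slots remain
            while self.pending and self.active < self.limit:
                self.active += 1
                print("initiating", self.pending.popleft())

        def stream_closed(self) -> None:
            # a finished stream frees a slot for the next pending one
            self.active -= 1
            self.send_pending()

    pool = PoolModel(limit=2)
    for name in ("a", "b", "c"):
        pool.request(name)       # only "a" and "b" start immediately
    pool.stream_closed()         # "c" starts once a slot frees up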
diff --git a/scrapy/core/http2/stream.py b/scrapy/core/http2/stream.py
index dcbe8e224..6c6ed6f9b 100644
--- a/scrapy/core/http2/stream.py
+++ b/scrapy/core/http2/stream.py
@@ -3,6 +3,7 @@ from enum import Enum
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
from urllib.parse import urlparse
+
from h2.errors import ErrorCodes
from h2.exceptions import H2Error, ProtocolError, StreamClosedError
from hpack import HeaderTuple
@@ -10,11 +11,15 @@ from twisted.internet.defer import CancelledError, Deferred
from twisted.internet.error import ConnectionClosed
from twisted.python.failure import Failure
from twisted.web.client import ResponseFailed
+
from scrapy.http import Request
from scrapy.http.headers import Headers
from scrapy.responsetypes import responsetypes
+
if TYPE_CHECKING:
from scrapy.core.http2.protocol import H2ClientProtocol
+
+
logger = logging.getLogger(__name__)
@@ -23,36 +28,47 @@ class InactiveStreamClosed(ConnectionClosed):
of the stream. This happens when a stream is waiting for other
streams to close and connection is lost."""
- def __init__(self, request: Request) ->None:
+ def __init__(self, request: Request) -> None:
self.request = request
- def __str__(self) ->str:
- return (
- f'InactiveStreamClosed: Connection was closed without sending the request {self.request!r}'
- )
+ def __str__(self) -> str:
+ return f"InactiveStreamClosed: Connection was closed without sending the request {self.request!r}"
class InvalidHostname(H2Error):
-
- def __init__(self, request: Request, expected_hostname: str,
- expected_netloc: str) ->None:
+ def __init__(
+ self, request: Request, expected_hostname: str, expected_netloc: str
+ ) -> None:
self.request = request
self.expected_hostname = expected_hostname
self.expected_netloc = expected_netloc
- def __str__(self) ->str:
- return (
- f'InvalidHostname: Expected {self.expected_hostname} or {self.expected_netloc} in {self.request}'
- )
+ def __str__(self) -> str:
+ return f"InvalidHostname: Expected {self.expected_hostname} or {self.expected_netloc} in {self.request}"
class StreamCloseReason(Enum):
+ # Received a StreamEnded event from the remote
ENDED = 1
+
+ # Received a StreamReset event -- ended abruptly
RESET = 2
+
+ # Transport connection was lost
CONNECTION_LOST = 3
+
+ # Expected response body size is more than allowed limit
MAXSIZE_EXCEEDED = 4
+
+ # Response deferred is cancelled by the client
+ # (happens when client called response_deferred.cancel())
CANCELLED = 5
+
+ # Connection lost and the stream was not initiated
INACTIVE = 6
+
+    # The hostname of the request is not the same as the connected peer's hostname
+    # As a result, sending this request would end the connection
INVALID_HOSTNAME = 7
@@ -67,9 +83,14 @@ class Stream:
1. Combine all the data frames
"""
- def __init__(self, stream_id: int, request: Request, protocol:
- 'H2ClientProtocol', download_maxsize: int=0, download_warnsize: int=0
- ) ->None:
+ def __init__(
+ self,
+ stream_id: int,
+ request: Request,
+ protocol: "H2ClientProtocol",
+ download_maxsize: int = 0,
+ download_warnsize: int = 0,
+ ) -> None:
"""
Arguments:
stream_id -- Unique identifier for the stream within a single HTTP/2 connection
@@ -78,31 +99,65 @@ class Stream:
"""
self.stream_id: int = stream_id
self._request: Request = request
- self._protocol: 'H2ClientProtocol' = protocol
- self._download_maxsize = self._request.meta.get('download_maxsize',
- download_maxsize)
- self._download_warnsize = self._request.meta.get('download_warnsize',
- download_warnsize)
- self.metadata: Dict = {'request_content_length': 0 if self._request
- .body is None else len(self._request.body), 'request_sent':
- False, 'reached_warnsize': False, 'remaining_content_length': 0 if
- self._request.body is None else len(self._request.body),
- 'stream_closed_local': False, 'stream_closed_server': False}
- self._response: Dict = {'body': BytesIO(), 'flow_controlled_size':
- 0, 'headers': Headers({})}
-
- def _cancel(_) ->None:
- if self.metadata['request_sent']:
+ self._protocol: "H2ClientProtocol" = protocol
+
+ self._download_maxsize = self._request.meta.get(
+ "download_maxsize", download_maxsize
+ )
+ self._download_warnsize = self._request.meta.get(
+ "download_warnsize", download_warnsize
+ )
+
+ # Metadata of an HTTP/2 connection stream
+ # initialized when stream is instantiated
+ self.metadata: Dict = {
+ "request_content_length": 0
+ if self._request.body is None
+ else len(self._request.body),
+ # Flag to keep track whether the stream has initiated the request
+ "request_sent": False,
+ # Flag to track whether we have logged about exceeding download warnsize
+ "reached_warnsize": False,
+            # Each time we send a data frame, we will decrease the value by the amount sent.
+ "remaining_content_length": 0
+ if self._request.body is None
+ else len(self._request.body),
+            # Flag to keep track of whether the client (self) has closed this stream
+            "stream_closed_local": False,
+            # Flag to keep track of whether the server has closed the stream
+ "stream_closed_server": False,
+ }
+
+ # Private variable used to build the response
+ # this response is then converted to appropriate Response class
+ # passed to the response deferred callback
+ self._response: Dict = {
+ # Data received frame by frame from the server is appended
+ # and passed to the response Deferred when completely received.
+ "body": BytesIO(),
+ # The amount of data received that counts against the
+ # flow control window
+ "flow_controlled_size": 0,
+ # Headers received after sending the request
+ "headers": Headers({}),
+ }
+
+ def _cancel(_) -> None:
+ # Close this stream as gracefully as possible
+ # If the associated request is initiated we reset this stream
+ # else we directly call close() method
+ if self.metadata["request_sent"]:
self.reset_stream(StreamCloseReason.CANCELLED)
else:
self.close(StreamCloseReason.CANCELLED)
+
self._deferred_response: Deferred = Deferred(_cancel)
- def __repr__(self) ->str:
- return f'Stream(id={self.stream_id!r})'
+ def __repr__(self) -> str:
+ return f"Stream(id={self.stream_id!r})"
@property
- def _log_warnsize(self) ->bool:
+ def _log_warnsize(self) -> bool:
"""Checks if we have received data which exceeds the download warnsize
and whether we have not already logged about it.
@@ -110,15 +165,97 @@ class Stream:
True if both the above conditions hold true
False if any of the conditions is false
"""
- pass
+ content_length_header = int(
+ self._response["headers"].get(b"Content-Length", -1)
+ )
+ return (
+ self._download_warnsize
+ and (
+ self._response["flow_controlled_size"] > self._download_warnsize
+ or content_length_header > self._download_warnsize
+ )
+ and not self.metadata["reached_warnsize"]
+ )
- def get_response(self) ->Deferred:
+ def get_response(self) -> Deferred:
"""Simply return a Deferred which fires when response
from the asynchronous request is available
"""
- pass
+ return self._deferred_response
+
+ def check_request_url(self) -> bool:
+ # Make sure that we are sending the request to the correct URL
+ url = urlparse(self._request.url)
+ return (
+ url.netloc == str(self._protocol.metadata["uri"].host, "utf-8")
+ or url.netloc == str(self._protocol.metadata["uri"].netloc, "utf-8")
+ or url.netloc
+ == f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}'
+ )
+
+ def _get_request_headers(self) -> List[Tuple[str, str]]:
+ url = urlparse(self._request.url)
+
+ path = url.path
+ if url.query:
+ path += "?" + url.query
+
+ # This pseudo-header field MUST NOT be empty for "http" or "https"
+ # URIs; "http" or "https" URIs that do not contain a path component
+ # MUST include a value of '/'. The exception to this rule is an
+ # OPTIONS request for an "http" or "https" URI that does not include
+ # a path component; these MUST include a ":path" pseudo-header field
+ # with a value of '*' (refer RFC 7540 - Section 8.1.2.3)
+ if not path:
+ path = "*" if self._request.method == "OPTIONS" else "/"
+
+ # Make sure pseudo-headers comes before all the other headers
+ headers = [
+ (":method", self._request.method),
+ (":authority", url.netloc),
+ ]
+
+ # The ":scheme" and ":path" pseudo-header fields MUST
+ # be omitted for CONNECT method (refer RFC 7540 - Section 8.3)
+ if self._request.method != "CONNECT":
+ headers += [
+ (":scheme", self._protocol.metadata["uri"].scheme),
+ (":path", path),
+ ]
- def send_data(self) ->None:
+ content_length = str(len(self._request.body))
+ headers.append(("Content-Length", content_length))
+
+ content_length_name = self._request.headers.normkey(b"Content-Length")
+ for name, values in self._request.headers.items():
+ for value in values:
+ value = str(value, "utf-8")
+ if name == content_length_name:
+ if value != content_length:
+ logger.warning(
+ "Ignoring bad Content-Length header %r of request %r, "
+ "sending %r instead",
+ value,
+ self._request,
+ content_length,
+ )
+ continue
+ headers.append((str(name, "utf-8"), value))
+
+ return headers
+
+ def initiate_request(self) -> None:
+ if self.check_request_url():
+ headers = self._get_request_headers()
+ self._protocol.conn.send_headers(self.stream_id, headers, end_stream=False)
+ self.metadata["request_sent"] = True
+ self.send_data()
+ else:
+ # Close this stream calling the response errback
+ # Note that we have not sent any headers
+ self.close(StreamCloseReason.INVALID_HOSTNAME)
+
+ def send_data(self) -> None:
"""Called immediately after the headers are sent. Here we send all the
data as part of the request.
@@ -129,27 +266,227 @@ class Stream:
    and has already initiated the request by sending the HEADERS frame. If not, the
    stream will raise ProtocolError (raised by the h2 state machine).
"""
- pass
+ if self.metadata["stream_closed_local"]:
+ raise StreamClosedError(self.stream_id)
+
+ # Firstly, check what the flow control window is for current stream.
+ window_size = self._protocol.conn.local_flow_control_window(
+ stream_id=self.stream_id
+ )
+
+ # Next, check what the maximum frame size is.
+ max_frame_size = self._protocol.conn.max_outbound_frame_size
+
+        # We will send no more than the window size or the remaining request
+        # body size in this call, whichever is smaller.
+ bytes_to_send_size = min(window_size, self.metadata["remaining_content_length"])
+
+ # We now need to send a number of data frames.
+ while bytes_to_send_size > 0:
+ chunk_size = min(bytes_to_send_size, max_frame_size)
+
+ data_chunk_start_id = (
+ self.metadata["request_content_length"]
+ - self.metadata["remaining_content_length"]
+ )
+ data_chunk = self._request.body[
+ data_chunk_start_id : data_chunk_start_id + chunk_size
+ ]
+
+ self._protocol.conn.send_data(self.stream_id, data_chunk, end_stream=False)
+
+ bytes_to_send_size -= chunk_size
+ self.metadata["remaining_content_length"] -= chunk_size
+
+ self.metadata["remaining_content_length"] = max(
+ 0, self.metadata["remaining_content_length"]
+ )
+
+        # End the stream if no more data needs to be sent
+ if self.metadata["remaining_content_length"] == 0:
+ self._protocol.conn.end_stream(self.stream_id)
- def receive_window_update(self) ->None:
+ # Q. What about the rest of the data?
+ # Ans: Remaining Data frames will be sent when we get a WindowUpdate frame
+
+ def receive_window_update(self) -> None:
"""Flow control window size was changed.
Send data that earlier could not be sent as we were
blocked behind the flow control.
"""
- pass
+ if (
+ self.metadata["remaining_content_length"]
+ and not self.metadata["stream_closed_server"]
+ and self.metadata["request_sent"]
+ ):
+ self.send_data()
+
+ def receive_data(self, data: bytes, flow_controlled_length: int) -> None:
+ self._response["body"].write(data)
+ self._response["flow_controlled_size"] += flow_controlled_length
+
+ # We check maxsize here in case the Content-Length header was not received
+ if (
+ self._download_maxsize
+ and self._response["flow_controlled_size"] > self._download_maxsize
+ ):
+ self.reset_stream(StreamCloseReason.MAXSIZE_EXCEEDED)
+ return
+
+ if self._log_warnsize:
+ self.metadata["reached_warnsize"] = True
+ warning_msg = (
+ f'Received more ({self._response["flow_controlled_size"]}) bytes than download '
+ f"warn size ({self._download_warnsize}) in request {self._request}"
+ )
+ logger.warning(warning_msg)
- def reset_stream(self, reason: StreamCloseReason=StreamCloseReason.RESET
- ) ->None:
+ # Acknowledge the data received
+ self._protocol.conn.acknowledge_received_data(
+ self._response["flow_controlled_size"], self.stream_id
+ )
+
+ def receive_headers(self, headers: List[HeaderTuple]) -> None:
+ for name, value in headers:
+ self._response["headers"].appendlist(name, value)
+
+ # Check if we exceed the allowed max data size which can be received
+ expected_size = int(self._response["headers"].get(b"Content-Length", -1))
+ if self._download_maxsize and expected_size > self._download_maxsize:
+ self.reset_stream(StreamCloseReason.MAXSIZE_EXCEEDED)
+ return
+
+ if self._log_warnsize:
+ self.metadata["reached_warnsize"] = True
+ warning_msg = (
+ f"Expected response size ({expected_size}) larger than "
+ f"download warn size ({self._download_warnsize}) in request {self._request}"
+ )
+ logger.warning(warning_msg)
+
+ def reset_stream(self, reason: StreamCloseReason = StreamCloseReason.RESET) -> None:
"""Close this stream by sending a RST_FRAME to the remote peer"""
- pass
+ if self.metadata["stream_closed_local"]:
+ raise StreamClosedError(self.stream_id)
- def close(self, reason: StreamCloseReason, errors: Optional[List[
- BaseException]]=None, from_protocol: bool=False) ->None:
+ # Clear buffer earlier to avoid keeping data in memory for a long time
+ self._response["body"].truncate(0)
+
+ self.metadata["stream_closed_local"] = True
+ self._protocol.conn.reset_stream(self.stream_id, ErrorCodes.REFUSED_STREAM)
+ self.close(reason)
+
+ def close(
+ self,
+ reason: StreamCloseReason,
+ errors: Optional[List[BaseException]] = None,
+ from_protocol: bool = False,
+ ) -> None:
"""Based on the reason sent we will handle each case."""
- pass
+ if self.metadata["stream_closed_server"]:
+ raise StreamClosedError(self.stream_id)
+
+ if not isinstance(reason, StreamCloseReason):
+ raise TypeError(
+ f"Expected StreamCloseReason, received {reason.__class__.__qualname__}"
+ )
+
+ # Have default value of errors as an empty list as
+ # some cases can add a list of exceptions
+ errors = errors or []
- def _fire_response_deferred(self) ->None:
+ if not from_protocol:
+ self._protocol.pop_stream(self.stream_id)
+
+ self.metadata["stream_closed_server"] = True
+
+ # We do not check for Content-Length or Transfer-Encoding in response headers
+ # and add `partial` flag as in HTTP/1.1 as 'A request or response that includes
+ # a payload body can include a content-length header field' (RFC 7540 - Section 8.1.2.6)
+
+ # NOTE: Order of handling the events is important here
+ # As we immediately cancel the request when maxsize is exceeded while
+ # receiving DATA_FRAME's when we have received the headers (not
+ # having Content-Length)
+ if reason is StreamCloseReason.MAXSIZE_EXCEEDED:
+ expected_size = int(
+ self._response["headers"].get(
+ b"Content-Length", self._response["flow_controlled_size"]
+ )
+ )
+ error_msg = (
+ f"Cancelling download of {self._request.url}: received response "
+ f"size ({expected_size}) larger than download max size ({self._download_maxsize})"
+ )
+ logger.error(error_msg)
+ self._deferred_response.errback(CancelledError(error_msg))
+
+ elif reason is StreamCloseReason.ENDED:
+ self._fire_response_deferred()
+
+ # Stream was abruptly ended here
+ elif reason is StreamCloseReason.CANCELLED:
+ # Client has cancelled the request. Remove all the data
+ # received and fire the response deferred with no flags set
+
+ # NOTE: The data is already flushed in Stream.reset_stream() called
+ # immediately when the stream needs to be cancelled
+
+            # There may be no :status in the headers, so we set
+            # HTTP Status Code: 499 - Client Closed Request
+ self._response["headers"][":status"] = "499"
+ self._fire_response_deferred()
+
+ elif reason is StreamCloseReason.RESET:
+ self._deferred_response.errback(
+ ResponseFailed(
+ [
+ Failure(
+ f'Remote peer {self._protocol.metadata["ip_address"]} sent RST_STREAM',
+ ProtocolError,
+ )
+ ]
+ )
+ )
+
+ elif reason is StreamCloseReason.CONNECTION_LOST:
+ self._deferred_response.errback(ResponseFailed(errors))
+
+ elif reason is StreamCloseReason.INACTIVE:
+ errors.insert(0, InactiveStreamClosed(self._request))
+ self._deferred_response.errback(ResponseFailed(errors))
+
+ else:
+ assert reason is StreamCloseReason.INVALID_HOSTNAME
+ self._deferred_response.errback(
+ InvalidHostname(
+ self._request,
+ str(self._protocol.metadata["uri"].host, "utf-8"),
+ f'{self._protocol.metadata["ip_address"]}:{self._protocol.metadata["uri"].port}',
+ )
+ )
+
+ def _fire_response_deferred(self) -> None:
"""Builds response from the self._response dict
and fires the response deferred callback with the
generated response instance"""
- pass
+
+ body = self._response["body"].getvalue()
+ response_cls = responsetypes.from_args(
+ headers=self._response["headers"],
+ url=self._request.url,
+ body=body,
+ )
+
+ response = response_cls(
+ url=self._request.url,
+ status=int(self._response["headers"][":status"]),
+ headers=self._response["headers"],
+ body=body,
+ request=self._request,
+ certificate=self._protocol.metadata["certificate"],
+ ip_address=self._protocol.metadata["ip_address"],
+ protocol="h2",
+ )
+
+ self._deferred_response.callback(response)
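The core of Stream.send_data() above is a chunking rule: each call sends at most min(flow-control window, remaining body) bytes, split into frames no larger than max_outbound_frame_size, and anything left over waits for a WINDOW_UPDATE. A small stand-alone sketch of that arithmetic (chunk_plan and the sizes used are illustrative only):

    def chunk_plan(window_size: int, max_frame_size: int, remaining: int) -> list:
        # send no more than the window or the remaining body, whichever is smaller
        to_send = min(window_size, remaining)
        chunks = []
        while to_send > 0:
            chunk = min(to_send, max_frame_size)  # one DATA frame per chunk
            chunks.append(chunk)
            to_send -= chunk
        return chunks

    # e.g. a 40 KiB body behind a 32 KiB window and 16 KiB frames:
    print(chunk_plan(window_size=32 * 1024, max_frame_size=16 * 1024, remaining=40 * 1024))
    # -> [16384, 16384]; the remaining 8 KiB waits for a WINDOW_UPDATE frame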
diff --git a/scrapy/core/scheduler.py b/scrapy/core/scheduler.py
index 39a4bb6a3..17c95f1ea 100644
--- a/scrapy/core/scheduler.py
+++ b/scrapy/core/scheduler.py
@@ -1,10 +1,13 @@
from __future__ import annotations
+
import json
import logging
from abc import abstractmethod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Optional, Type, TypeVar, cast
+
from twisted.internet.defer import Deferred
+
from scrapy.crawler import Crawler
from scrapy.dupefilters import BaseDupeFilter
from scrapy.http.request import Request
@@ -12,8 +15,12 @@ from scrapy.spiders import Spider
from scrapy.statscollectors import StatsCollector
from scrapy.utils.job import job_dir
from scrapy.utils.misc import create_instance, load_object
+
if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
from typing_extensions import Self
+
+
logger = logging.getLogger(__name__)
@@ -22,14 +29,18 @@ class BaseSchedulerMeta(type):
Metaclass to check scheduler classes against the necessary interface
"""
- def __instancecheck__(cls, instance: Any) ->bool:
+ def __instancecheck__(cls, instance: Any) -> bool:
return cls.__subclasscheck__(type(instance))
- def __subclasscheck__(cls, subclass: type) ->bool:
- return hasattr(subclass, 'has_pending_requests') and callable(subclass
- .has_pending_requests) and hasattr(subclass, 'enqueue_request'
- ) and callable(subclass.enqueue_request) and hasattr(subclass,
- 'next_request') and callable(subclass.next_request)
+ def __subclasscheck__(cls, subclass: type) -> bool:
+ return (
+ hasattr(subclass, "has_pending_requests")
+ and callable(subclass.has_pending_requests)
+ and hasattr(subclass, "enqueue_request")
+ and callable(subclass.enqueue_request)
+ and hasattr(subclass, "next_request")
+ and callable(subclass.next_request)
+ )
class BaseScheduler(metaclass=BaseSchedulerMeta):
@@ -50,13 +61,13 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
"""
@classmethod
- def from_crawler(cls, crawler: Crawler) ->Self:
+ def from_crawler(cls, crawler: Crawler) -> Self:
"""
Factory method which receives the current :class:`~scrapy.crawler.Crawler` object as argument.
"""
- pass
+ return cls()
- def open(self, spider: Spider) ->Optional[Deferred]:
+ def open(self, spider: Spider) -> Optional[Deferred]:
"""
Called when the spider is opened by the engine. It receives the spider
instance as argument and it's useful to execute initialization code.
@@ -66,7 +77,7 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
"""
pass
- def close(self, reason: str) ->Optional[Deferred]:
+ def close(self, reason: str) -> Optional[Deferred]:
"""
Called when the spider is closed by the engine. It receives the reason why the crawl
finished as argument and it's useful to execute cleaning code.
@@ -77,14 +88,14 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
pass
@abstractmethod
- def has_pending_requests(self) ->bool:
+ def has_pending_requests(self) -> bool:
"""
``True`` if the scheduler has enqueued requests, ``False`` otherwise
"""
- pass
+ raise NotImplementedError()
@abstractmethod
- def enqueue_request(self, request: Request) ->bool:
+ def enqueue_request(self, request: Request) -> bool:
"""
Process a request received by the engine.
@@ -95,10 +106,10 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
For reference, the default Scrapy scheduler returns ``False`` when the
request is rejected by the dupefilter.
"""
- pass
+ raise NotImplementedError()
@abstractmethod
- def next_request(self) ->Optional[Request]:
+ def next_request(self) -> Optional[Request]:
"""
Return the next :class:`~scrapy.http.Request` to be processed, or ``None``
to indicate that there are no requests to be considered ready at the moment.
@@ -107,10 +118,10 @@ class BaseScheduler(metaclass=BaseSchedulerMeta):
to the downloader in the current reactor cycle. The engine will continue
calling ``next_request`` until ``has_pending_requests`` is ``False``.
"""
- pass
+ raise NotImplementedError()
-SchedulerTV = TypeVar('SchedulerTV', bound='Scheduler')
+SchedulerTV = TypeVar("SchedulerTV", bound="Scheduler")
class Scheduler(BaseScheduler):
@@ -164,10 +175,17 @@ class Scheduler(BaseScheduler):
:type crawler: :class:`scrapy.crawler.Crawler`
"""
- def __init__(self, dupefilter: BaseDupeFilter, jobdir: Optional[str]=
- None, dqclass=None, mqclass=None, logunser: bool=False, stats:
- Optional[StatsCollector]=None, pqclass=None, crawler: Optional[
- Crawler]=None):
+ def __init__(
+ self,
+ dupefilter: BaseDupeFilter,
+ jobdir: Optional[str] = None,
+ dqclass=None,
+ mqclass=None,
+ logunser: bool = False,
+ stats: Optional[StatsCollector] = None,
+ pqclass=None,
+ crawler: Optional[Crawler] = None,
+ ):
self.df: BaseDupeFilter = dupefilter
self.dqdir: Optional[str] = self._dqdir(jobdir)
self.pqclass = pqclass
@@ -178,28 +196,48 @@ class Scheduler(BaseScheduler):
self.crawler: Optional[Crawler] = crawler
@classmethod
- def from_crawler(cls: Type[SchedulerTV], crawler: Crawler) ->SchedulerTV:
+ def from_crawler(cls: Type[SchedulerTV], crawler: Crawler) -> SchedulerTV:
"""
Factory method, initializes the scheduler with arguments taken from the crawl settings
"""
- pass
-
- def open(self, spider: Spider) ->Optional[Deferred]:
+ dupefilter_cls = load_object(crawler.settings["DUPEFILTER_CLASS"])
+ return cls(
+ dupefilter=create_instance(dupefilter_cls, crawler.settings, crawler),
+ jobdir=job_dir(crawler.settings),
+ dqclass=load_object(crawler.settings["SCHEDULER_DISK_QUEUE"]),
+ mqclass=load_object(crawler.settings["SCHEDULER_MEMORY_QUEUE"]),
+ logunser=crawler.settings.getbool("SCHEDULER_DEBUG"),
+ stats=crawler.stats,
+ pqclass=load_object(crawler.settings["SCHEDULER_PRIORITY_QUEUE"]),
+ crawler=crawler,
+ )
+
+ def has_pending_requests(self) -> bool:
+ return len(self) > 0
+
+ def open(self, spider: Spider) -> Optional[Deferred]:
"""
(1) initialize the memory queue
(2) initialize the disk queue if the ``jobdir`` attribute is a valid directory
(3) return the result of the dupefilter's ``open`` method
"""
- pass
+ self.spider = spider
+ self.mqs = self._mq()
+ self.dqs = self._dq() if self.dqdir else None
+ return self.df.open()
- def close(self, reason: str) ->Optional[Deferred]:
+ def close(self, reason: str) -> Optional[Deferred]:
"""
(1) dump pending requests to disk if there is a disk queue
(2) return the result of the dupefilter's ``close`` method
"""
- pass
+ if self.dqs is not None:
+ state = self.dqs.close()
+ assert isinstance(self.dqdir, str)
+ self._write_dqs_state(self.dqdir, state)
+ return self.df.close(reason)
- def enqueue_request(self, request: Request) ->bool:
+ def enqueue_request(self, request: Request) -> bool:
"""
Unless the received request is filtered out by the Dupefilter, attempt to push
it into the disk queue, falling back to pushing it into the memory queue.
@@ -209,9 +247,20 @@ class Scheduler(BaseScheduler):
Return ``True`` if the request was stored successfully, ``False`` otherwise.
"""
- pass
-
- def next_request(self) ->Optional[Request]:
+ if not request.dont_filter and self.df.request_seen(request):
+ self.df.log(request, self.spider)
+ return False
+ dqok = self._dqpush(request)
+ assert self.stats is not None
+ if dqok:
+ self.stats.inc_value("scheduler/enqueued/disk", spider=self.spider)
+ else:
+ self._mqpush(request)
+ self.stats.inc_value("scheduler/enqueued/memory", spider=self.spider)
+ self.stats.inc_value("scheduler/enqueued", spider=self.spider)
+ return True
+
+ def next_request(self) -> Optional[Request]:
"""
Return a :class:`~scrapy.http.Request` object from the memory queue,
falling back to the disk queue if the memory queue is empty.
@@ -220,23 +269,103 @@ class Scheduler(BaseScheduler):
Increment the appropriate stats, such as: ``scheduler/dequeued``,
``scheduler/dequeued/disk``, ``scheduler/dequeued/memory``.
"""
- pass
-
- def __len__(self) ->int:
+ request: Optional[Request] = self.mqs.pop()
+ assert self.stats is not None
+ if request is not None:
+ self.stats.inc_value("scheduler/dequeued/memory", spider=self.spider)
+ else:
+ request = self._dqpop()
+ if request is not None:
+ self.stats.inc_value("scheduler/dequeued/disk", spider=self.spider)
+ if request is not None:
+ self.stats.inc_value("scheduler/dequeued", spider=self.spider)
+ return request
+
+ def __len__(self) -> int:
"""
Return the total amount of enqueued requests
"""
- return len(self.dqs) + len(self.mqs) if self.dqs is not None else len(
- self.mqs)
+ return len(self.dqs) + len(self.mqs) if self.dqs is not None else len(self.mqs)
+
+ def _dqpush(self, request: Request) -> bool:
+ if self.dqs is None:
+ return False
+ try:
+ self.dqs.push(request)
+ except ValueError as e: # non serializable request
+ if self.logunser:
+ msg = (
+ "Unable to serialize request: %(request)s - reason:"
+ " %(reason)s - no more unserializable requests will be"
+ " logged (stats being collected)"
+ )
+ logger.warning(
+ msg,
+ {"request": request, "reason": e},
+ exc_info=True,
+ extra={"spider": self.spider},
+ )
+ self.logunser = False
+ assert self.stats is not None
+ self.stats.inc_value("scheduler/unserializable", spider=self.spider)
+ return False
+ else:
+ return True
+
+ def _mqpush(self, request: Request) -> None:
+ self.mqs.push(request)
+
+ def _dqpop(self) -> Optional[Request]:
+ if self.dqs is not None:
+ return self.dqs.pop()
+ return None
def _mq(self):
"""Create a new priority queue instance, with in-memory storage"""
- pass
+ return create_instance(
+ self.pqclass,
+ settings=None,
+ crawler=self.crawler,
+ downstream_queue_cls=self.mqclass,
+ key="",
+ )
def _dq(self):
"""Create a new priority queue instance, with disk storage"""
- pass
-
- def _dqdir(self, jobdir: Optional[str]) ->Optional[str]:
+ assert self.dqdir
+ state = self._read_dqs_state(self.dqdir)
+ q = create_instance(
+ self.pqclass,
+ settings=None,
+ crawler=self.crawler,
+ downstream_queue_cls=self.dqclass,
+ key=self.dqdir,
+ startprios=state,
+ )
+ if q:
+ logger.info(
+ "Resuming crawl (%(queuesize)d requests scheduled)",
+ {"queuesize": len(q)},
+ extra={"spider": self.spider},
+ )
+ return q
+
+ def _dqdir(self, jobdir: Optional[str]) -> Optional[str]:
"""Return a folder name to keep disk queue state at"""
- pass
+ if jobdir:
+ dqdir = Path(jobdir, "requests.queue")
+ if not dqdir.exists():
+ dqdir.mkdir(parents=True)
+ return str(dqdir)
+ return None
+
+ def _read_dqs_state(self, dqdir: str) -> list:
+ path = Path(dqdir, "active.json")
+ if not path.exists():
+ return []
+ with path.open(encoding="utf-8") as f:
+ return cast(list, json.load(f))
+
+ def _write_dqs_state(self, dqdir: str, state: list) -> None:
+ with Path(dqdir, "active.json").open("w", encoding="utf-8") as f:
+ json.dump(state, f)
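The disk-queue state handling added to Scheduler persists the priority-queue state as JSON in <jobdir>/requests.queue/active.json: _write_dqs_state() dumps it on close() and _read_dqs_state() loads it back on open(). A minimal round-trip sketch of that file handling (the paths and example state below are illustrative):

    import json
    from pathlib import Path
    from tempfile import TemporaryDirectory

    with TemporaryDirectory() as jobdir:
        dqdir = Path(jobdir, "requests.queue")
        dqdir.mkdir(parents=True)
        path = dqdir / "active.json"

        state = [0, -10, 20]  # example startprios, purely illustrative
        with path.open("w", encoding="utf-8") as f:   # mirrors _write_dqs_state()
            json.dump(state, f)

        restored = []                                 # mirrors _read_dqs_state()
        if path.exists():
            with path.open(encoding="utf-8") as f:
                restored = json.load(f)
        assert restored == state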
diff --git a/scrapy/core/scraper.py b/scrapy/core/scraper.py
index d5cedad9e..b2c26507c 100644
--- a/scrapy/core/scraper.py
+++ b/scrapy/core/scraper.py
@@ -1,12 +1,28 @@
"""This module implements the Scraper component which parses responses and
extracts information from them"""
from __future__ import annotations
+
import logging
from collections import deque
-from typing import TYPE_CHECKING, Any, AsyncGenerator, AsyncIterable, Deque, Generator, Iterable, Optional, Set, Tuple, Type, Union
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ AsyncGenerator,
+ AsyncIterable,
+ Deque,
+ Generator,
+ Iterable,
+ Optional,
+ Set,
+ Tuple,
+ Type,
+ Union,
+)
+
from itemadapter import is_item
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python.failure import Failure
+
from scrapy import Spider, signals
from scrapy.core.spidermw import SpiderMiddlewareManager
from scrapy.exceptions import CloseSpider, DropItem, IgnoreRequest
@@ -14,21 +30,34 @@ from scrapy.http import Request, Response
from scrapy.logformatter import LogFormatter
from scrapy.pipelines import ItemPipelineManager
from scrapy.signalmanager import SignalManager
-from scrapy.utils.defer import aiter_errback, defer_fail, defer_succeed, iter_errback, parallel, parallel_async
+from scrapy.utils.defer import (
+ aiter_errback,
+ defer_fail,
+ defer_succeed,
+ iter_errback,
+ parallel,
+ parallel_async,
+)
from scrapy.utils.log import failure_to_exc_info, logformatter_adapter
from scrapy.utils.misc import load_object, warn_on_generator_with_return_value
from scrapy.utils.spider import iterate_spider_output
+
if TYPE_CHECKING:
from scrapy.crawler import Crawler
+
+
QueueTuple = Tuple[Union[Response, Failure], Request, Deferred]
+
+
logger = logging.getLogger(__name__)
class Slot:
"""Scraper slot (one per running spider)"""
+
MIN_RESPONSE_SIZE = 1024
- def __init__(self, max_active_size: int=5000000):
+ def __init__(self, max_active_size: int = 5000000):
self.max_active_size = max_active_size
self.queue: Deque[QueueTuple] = deque()
self.active: Set[Request] = set()
@@ -36,60 +65,260 @@ class Slot:
self.itemproc_size: int = 0
self.closing: Optional[Deferred] = None
+ def add_response_request(
+ self, result: Union[Response, Failure], request: Request
+ ) -> Deferred:
+ deferred: Deferred = Deferred()
+ self.queue.append((result, request, deferred))
+ if isinstance(result, Response):
+ self.active_size += max(len(result.body), self.MIN_RESPONSE_SIZE)
+ else:
+ self.active_size += self.MIN_RESPONSE_SIZE
+ return deferred
+
+ def next_response_request_deferred(self) -> QueueTuple:
+ response, request, deferred = self.queue.popleft()
+ self.active.add(request)
+ return response, request, deferred
+
+ def finish_response(
+ self, result: Union[Response, Failure], request: Request
+ ) -> None:
+ self.active.remove(request)
+ if isinstance(result, Response):
+ self.active_size -= max(len(result.body), self.MIN_RESPONSE_SIZE)
+ else:
+ self.active_size -= self.MIN_RESPONSE_SIZE
+
+ def is_idle(self) -> bool:
+ return not (self.queue or self.active)
+
+ def needs_backout(self) -> bool:
+ return self.active_size > self.max_active_size
-class Scraper:
- def __init__(self, crawler: Crawler) ->None:
+class Scraper:
+ def __init__(self, crawler: Crawler) -> None:
self.slot: Optional[Slot] = None
- self.spidermw: SpiderMiddlewareManager = (SpiderMiddlewareManager.
- from_crawler(crawler))
- itemproc_cls: Type[ItemPipelineManager] = load_object(crawler.
- settings['ITEM_PROCESSOR'])
+ self.spidermw: SpiderMiddlewareManager = SpiderMiddlewareManager.from_crawler(
+ crawler
+ )
+ itemproc_cls: Type[ItemPipelineManager] = load_object(
+ crawler.settings["ITEM_PROCESSOR"]
+ )
self.itemproc: ItemPipelineManager = itemproc_cls.from_crawler(crawler)
- self.concurrent_items: int = crawler.settings.getint('CONCURRENT_ITEMS'
- )
+ self.concurrent_items: int = crawler.settings.getint("CONCURRENT_ITEMS")
self.crawler: Crawler = crawler
self.signals: SignalManager = crawler.signals
assert crawler.logformatter
self.logformatter: LogFormatter = crawler.logformatter
@inlineCallbacks
- def open_spider(self, spider: Spider) ->Generator[Deferred, Any, None]:
+ def open_spider(self, spider: Spider) -> Generator[Deferred, Any, None]:
"""Open the given spider for scraping and allocate resources for it"""
- pass
+ self.slot = Slot(self.crawler.settings.getint("SCRAPER_SLOT_MAX_ACTIVE_SIZE"))
+ yield self.itemproc.open_spider(spider)
- def close_spider(self, spider: Spider) ->Deferred:
+ def close_spider(self, spider: Spider) -> Deferred:
"""Close a spider being scraped and release its resources"""
- pass
+ if self.slot is None:
+ raise RuntimeError("Scraper slot not assigned")
+ self.slot.closing = Deferred()
+ self.slot.closing.addCallback(self.itemproc.close_spider)
+ self._check_if_closing(spider)
+ return self.slot.closing
- def is_idle(self) ->bool:
+ def is_idle(self) -> bool:
"""Return True if there isn't any more spiders to process"""
- pass
+ return not self.slot
+
+ def _check_if_closing(self, spider: Spider) -> None:
+ assert self.slot is not None # typing
+ if self.slot.closing and self.slot.is_idle():
+ self.slot.closing.callback(spider)
+
+ def enqueue_scrape(
+ self, result: Union[Response, Failure], request: Request, spider: Spider
+ ) -> Deferred:
+ if self.slot is None:
+ raise RuntimeError("Scraper slot not assigned")
+ dfd = self.slot.add_response_request(result, request)
- def _scrape(self, result: Union[Response, Failure], request: Request,
- spider: Spider) ->Deferred:
+ def finish_scraping(_: Any) -> Any:
+ assert self.slot is not None
+ self.slot.finish_response(result, request)
+ self._check_if_closing(spider)
+ self._scrape_next(spider)
+ return _
+
+ dfd.addBoth(finish_scraping)
+ dfd.addErrback(
+ lambda f: logger.error(
+ "Scraper bug processing %(request)s",
+ {"request": request},
+ exc_info=failure_to_exc_info(f),
+ extra={"spider": spider},
+ )
+ )
+ self._scrape_next(spider)
+ return dfd
+
+ def _scrape_next(self, spider: Spider) -> None:
+ assert self.slot is not None # typing
+ while self.slot.queue:
+ response, request, deferred = self.slot.next_response_request_deferred()
+ self._scrape(response, request, spider).chainDeferred(deferred)
+
+ def _scrape(
+ self, result: Union[Response, Failure], request: Request, spider: Spider
+ ) -> Deferred:
"""
Handle the downloaded response or failure through the spider callback/errback
"""
- pass
+ if not isinstance(result, (Response, Failure)):
+ raise TypeError(
+ f"Incorrect type: expected Response or Failure, got {type(result)}: {result!r}"
+ )
+ dfd = self._scrape2(
+ result, request, spider
+ ) # returns spider's processed output
+ dfd.addErrback(self.handle_spider_error, request, result, spider)
+ dfd.addCallback(self.handle_spider_output, request, result, spider)
+ return dfd
- def _scrape2(self, result: Union[Response, Failure], request: Request,
- spider: Spider) ->Deferred:
+ def _scrape2(
+ self, result: Union[Response, Failure], request: Request, spider: Spider
+ ) -> Deferred:
"""
        Handle the different cases of the request's result being a Response or a Failure
"""
- pass
+ if isinstance(result, Response):
+ return self.spidermw.scrape_response(
+ self.call_spider, result, request, spider
+ )
+ # else result is a Failure
+ dfd = self.call_spider(result, request, spider)
+ return dfd.addErrback(self._log_download_errors, result, request, spider)
+
+ def call_spider(
+ self, result: Union[Response, Failure], request: Request, spider: Spider
+ ) -> Deferred:
+ if isinstance(result, Response):
+ if getattr(result, "request", None) is None:
+ result.request = request
+ callback = result.request.callback or spider._parse
+ warn_on_generator_with_return_value(spider, callback)
+ dfd = defer_succeed(result)
+ dfd.addCallbacks(
+ callback=callback, callbackKeywords=result.request.cb_kwargs
+ )
+ else: # result is a Failure
+ # TODO: properly type adding this attribute to a Failure
+ result.request = request # type: ignore[attr-defined]
+ dfd = defer_fail(result)
+ if request.errback:
+ warn_on_generator_with_return_value(spider, request.errback)
+ dfd.addErrback(request.errback)
+ return dfd.addCallback(iterate_spider_output)
+
+ def handle_spider_error(
+ self,
+ _failure: Failure,
+ request: Request,
+ response: Union[Response, Failure],
+ spider: Spider,
+ ) -> None:
+ exc = _failure.value
+ if isinstance(exc, CloseSpider):
+ assert self.crawler.engine is not None # typing
+ self.crawler.engine.close_spider(spider, exc.reason or "cancelled")
+ return
+ logkws = self.logformatter.spider_error(_failure, request, response, spider)
+ logger.log(
+ *logformatter_adapter(logkws),
+ exc_info=failure_to_exc_info(_failure),
+ extra={"spider": spider},
+ )
+ self.signals.send_catch_log(
+ signal=signals.spider_error,
+ failure=_failure,
+ response=response,
+ spider=spider,
+ )
+ assert self.crawler.stats
+ self.crawler.stats.inc_value(
+ f"spider_exceptions/{_failure.value.__class__.__name__}", spider=spider
+ )
- def _process_spidermw_output(self, output: Any, request: Request,
- response: Response, spider: Spider) ->Optional[Deferred]:
+ def handle_spider_output(
+ self,
+ result: Union[Iterable, AsyncIterable],
+ request: Request,
+ response: Union[Response, Failure],
+ spider: Spider,
+ ) -> Deferred:
+ if not result:
+ return defer_succeed(None)
+ it: Union[Generator, AsyncGenerator]
+ if isinstance(result, AsyncIterable):
+ it = aiter_errback(
+ result, self.handle_spider_error, request, response, spider
+ )
+ dfd = parallel_async(
+ it,
+ self.concurrent_items,
+ self._process_spidermw_output,
+ request,
+ response,
+ spider,
+ )
+ else:
+ it = iter_errback(
+ result, self.handle_spider_error, request, response, spider
+ )
+ dfd = parallel(
+ it,
+ self.concurrent_items,
+ self._process_spidermw_output,
+ request,
+ response,
+ spider,
+ )
+ return dfd
+
+ def _process_spidermw_output(
+ self, output: Any, request: Request, response: Response, spider: Spider
+ ) -> Optional[Deferred]:
"""Process each Request/Item (given in the output parameter) returned
from the given spider
"""
- pass
+ assert self.slot is not None # typing
+ if isinstance(output, Request):
+ assert self.crawler.engine is not None # typing
+ self.crawler.engine.crawl(request=output)
+ elif is_item(output):
+ self.slot.itemproc_size += 1
+ dfd = self.itemproc.process_item(output, spider)
+ dfd.addBoth(self._itemproc_finished, output, response, spider)
+ return dfd
+ elif output is None:
+ pass
+ else:
+ typename = type(output).__name__
+ logger.error(
+ "Spider must return request, item, or None, got %(typename)r in %(request)s",
+ {"request": request, "typename": typename},
+ extra={"spider": spider},
+ )
+ return None
- def _log_download_errors(self, spider_failure: Failure,
- download_failure: Failure, request: Request, spider: Spider) ->Union[
- Failure, None]:
+ def _log_download_errors(
+ self,
+ spider_failure: Failure,
+ download_failure: Failure,
+ request: Request,
+ spider: Spider,
+ ) -> Union[Failure, None]:
"""Log and silence errors that come from the engine (typically download
        errors that got propagated through here).
@@ -97,9 +326,67 @@ class Scraper:
download_failure: the value passed into _scrape2() from
ExecutionEngine._handle_downloader_output() as "result"
"""
- pass
+ if not download_failure.check(IgnoreRequest):
+ if download_failure.frames:
+ logkws = self.logformatter.download_error(
+ download_failure, request, spider
+ )
+ logger.log(
+ *logformatter_adapter(logkws),
+ extra={"spider": spider},
+ exc_info=failure_to_exc_info(download_failure),
+ )
+ else:
+ errmsg = download_failure.getErrorMessage()
+ if errmsg:
+ logkws = self.logformatter.download_error(
+ download_failure, request, spider, errmsg
+ )
+ logger.log(
+ *logformatter_adapter(logkws),
+ extra={"spider": spider},
+ )
+
+ if spider_failure is not download_failure:
+ return spider_failure
+ return None
- def _itemproc_finished(self, output: Any, item: Any, response: Response,
- spider: Spider) ->Deferred:
+ def _itemproc_finished(
+ self, output: Any, item: Any, response: Response, spider: Spider
+ ) -> Deferred:
"""ItemProcessor finished for the given ``item`` and returned ``output``"""
- pass
+ assert self.slot is not None # typing
+ self.slot.itemproc_size -= 1
+ if isinstance(output, Failure):
+ ex = output.value
+ if isinstance(ex, DropItem):
+ logkws = self.logformatter.dropped(item, ex, response, spider)
+ if logkws is not None:
+ logger.log(*logformatter_adapter(logkws), extra={"spider": spider})
+ return self.signals.send_catch_log_deferred(
+ signal=signals.item_dropped,
+ item=item,
+ response=response,
+ spider=spider,
+ exception=output.value,
+ )
+ assert ex
+ logkws = self.logformatter.item_error(item, ex, response, spider)
+ logger.log(
+ *logformatter_adapter(logkws),
+ extra={"spider": spider},
+ exc_info=failure_to_exc_info(output),
+ )
+ return self.signals.send_catch_log_deferred(
+ signal=signals.item_error,
+ item=item,
+ response=response,
+ spider=spider,
+ failure=output,
+ )
+ logkws = self.logformatter.scraped(output, response, spider)
+ if logkws is not None:
+ logger.log(*logformatter_adapter(logkws), extra={"spider": spider})
+ return self.signals.send_catch_log_deferred(
+ signal=signals.item_scraped, item=output, response=response, spider=spider
+ )
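The Slot methods added above implement the scraper's backpressure accounting: each queued response contributes max(len(body), MIN_RESPONSE_SIZE) to active_size, and needs_backout() trips once active_size exceeds max_active_size. A simplified stand-in showing the effect (SlotModel is illustrative, not Scrapy's Slot):

    MIN_RESPONSE_SIZE = 1024

    class SlotModel:
        def __init__(self, max_active_size: int = 5_000_000) -> None:
            self.max_active_size = max_active_size
            self.active_size = 0

        def add(self, body: bytes) -> None:
            # every queued response counts for at least MIN_RESPONSE_SIZE
            self.active_size += max(len(body), MIN_RESPONSE_SIZE)

        def finish(self, body: bytes) -> None:
            self.active_size -= max(len(body), MIN_RESPONSE_SIZE)

        def needs_backout(self) -> bool:
            return self.active_size > self.max_active_size

    slot = SlotModel(max_active_size=2048)
    slot.add(b"x" * 10)          # tiny body still counts as MIN_RESPONSE_SIZE
    slot.add(b"y" * 4096)        # large body counts at its real size
    print(slot.needs_backout())  # True: 1024 + 4096 > 2048
    slot.finish(b"y" * 4096)
    print(slot.needs_backout())  # False again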
diff --git a/scrapy/core/spidermw.py b/scrapy/core/spidermw.py
index 9922755bf..dcf1a6dbc 100644
--- a/scrapy/core/spidermw.py
+++ b/scrapy/core/spidermw.py
@@ -6,9 +6,23 @@ See documentation in docs/topics/spider-middleware.rst
import logging
from inspect import isasyncgenfunction, iscoroutine
from itertools import islice
-from typing import Any, AsyncGenerator, AsyncIterable, Callable, Generator, Iterable, List, Optional, Tuple, Union, cast
+from typing import (
+ Any,
+ AsyncGenerator,
+ AsyncIterable,
+ Callable,
+ Generator,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ Union,
+ cast,
+)
+
from twisted.internet.defer import Deferred, inlineCallbacks
from twisted.python.failure import Failure
+
from scrapy import Request, Spider
from scrapy.exceptions import _InvalidOutput
from scrapy.http import Response
@@ -16,15 +30,317 @@ from scrapy.middleware import MiddlewareManager
from scrapy.settings import BaseSettings
from scrapy.utils.asyncgen import as_async_generator, collect_asyncgen
from scrapy.utils.conf import build_component_list
-from scrapy.utils.defer import deferred_f_from_coro_f, deferred_from_coro, maybe_deferred_to_future, mustbe_deferred
+from scrapy.utils.defer import (
+ deferred_f_from_coro_f,
+ deferred_from_coro,
+ maybe_deferred_to_future,
+ mustbe_deferred,
+)
from scrapy.utils.python import MutableAsyncChain, MutableChain
+
logger = logging.getLogger(__name__)
+
+
ScrapeFunc = Callable[[Union[Response, Failure], Request, Spider], Any]
+def _isiterable(o: Any) -> bool:
+ return isinstance(o, (Iterable, AsyncIterable))
+
+
class SpiderMiddlewareManager(MiddlewareManager):
- component_name = 'spider middleware'
+ component_name = "spider middleware"
def __init__(self, *middlewares: Any):
super().__init__(*middlewares)
self.downgrade_warning_done = False
+
+ @classmethod
+ def _get_mwlist_from_settings(cls, settings: BaseSettings) -> List[Any]:
+ return build_component_list(settings.getwithbase("SPIDER_MIDDLEWARES"))
+
+ def _add_middleware(self, mw: Any) -> None:
+ super()._add_middleware(mw)
+ if hasattr(mw, "process_spider_input"):
+ self.methods["process_spider_input"].append(mw.process_spider_input)
+ if hasattr(mw, "process_start_requests"):
+ self.methods["process_start_requests"].appendleft(mw.process_start_requests)
+ process_spider_output = self._get_async_method_pair(mw, "process_spider_output")
+ self.methods["process_spider_output"].appendleft(process_spider_output)
+ process_spider_exception = getattr(mw, "process_spider_exception", None)
+ self.methods["process_spider_exception"].appendleft(process_spider_exception)
+
+ def _process_spider_input(
+ self,
+ scrape_func: ScrapeFunc,
+ response: Response,
+ request: Request,
+ spider: Spider,
+ ) -> Any:
+ for method in self.methods["process_spider_input"]:
+ method = cast(Callable, method)
+ try:
+ result = method(response=response, spider=spider)
+ if result is not None:
+ msg = (
+ f"{method.__qualname__} must return None "
+ f"or raise an exception, got {type(result)}"
+ )
+ raise _InvalidOutput(msg)
+ except _InvalidOutput:
+ raise
+ except Exception:
+ return scrape_func(Failure(), request, spider)
+ return scrape_func(response, request, spider)
+
+ def _evaluate_iterable(
+ self,
+ response: Response,
+ spider: Spider,
+ iterable: Union[Iterable, AsyncIterable],
+ exception_processor_index: int,
+ recover_to: Union[MutableChain, MutableAsyncChain],
+ ) -> Union[Generator, AsyncGenerator]:
+ def process_sync(iterable: Iterable) -> Generator:
+ try:
+ for r in iterable:
+ yield r
+ except Exception as ex:
+ exception_result = self._process_spider_exception(
+ response, spider, Failure(ex), exception_processor_index
+ )
+ if isinstance(exception_result, Failure):
+ raise
+ recover_to.extend(exception_result)
+
+ async def process_async(iterable: AsyncIterable) -> AsyncGenerator:
+ try:
+ async for r in iterable:
+ yield r
+ except Exception as ex:
+ exception_result = self._process_spider_exception(
+ response, spider, Failure(ex), exception_processor_index
+ )
+ if isinstance(exception_result, Failure):
+ raise
+ recover_to.extend(exception_result)
+
+ if isinstance(iterable, AsyncIterable):
+ return process_async(iterable)
+ return process_sync(iterable)
+
+ def _process_spider_exception(
+ self,
+ response: Response,
+ spider: Spider,
+ _failure: Failure,
+ start_index: int = 0,
+ ) -> Union[Failure, MutableChain]:
+ exception = _failure.value
+ # don't handle _InvalidOutput exception
+ if isinstance(exception, _InvalidOutput):
+ return _failure
+ method_list = islice(
+ self.methods["process_spider_exception"], start_index, None
+ )
+ for method_index, method in enumerate(method_list, start=start_index):
+ if method is None:
+ continue
+ method = cast(Callable, method)
+ result = method(response=response, exception=exception, spider=spider)
+ if _isiterable(result):
+ # stop exception handling by handing control over to the
+ # process_spider_output chain if an iterable has been returned
+ dfd: Deferred = self._process_spider_output(
+ response, spider, result, method_index + 1
+ )
+ # _process_spider_output() returns a Deferred only because of downgrading so this can be
+ # simplified when downgrading is removed.
+ if dfd.called:
+ # the result is available immediately if _process_spider_output didn't do downgrading
+ return cast(MutableChain, dfd.result)
+ # we forbid waiting here because otherwise we would need to return a deferred from
+ # _process_spider_exception too, which complicates the architecture
+ msg = f"Async iterable returned from {method.__qualname__} cannot be downgraded"
+ raise _InvalidOutput(msg)
+ elif result is None:
+ continue
+ else:
+ msg = (
+ f"{method.__qualname__} must return None "
+ f"or an iterable, got {type(result)}"
+ )
+ raise _InvalidOutput(msg)
+ return _failure
+
+ # This method cannot be made async def, as _process_spider_exception relies on the Deferred result
+ # being available immediately which doesn't work when it's a wrapped coroutine.
+ # It also needs @inlineCallbacks only because of downgrading so it can be removed when downgrading is removed.
+ @inlineCallbacks
+ def _process_spider_output(
+ self,
+ response: Response,
+ spider: Spider,
+ result: Union[Iterable, AsyncIterable],
+ start_index: int = 0,
+ ) -> Generator[Deferred, Any, Union[MutableChain, MutableAsyncChain]]:
+ # items in this iterable do not need to go through the process_spider_output
+ # chain, they went through it already from the process_spider_exception method
+ recovered: Union[MutableChain, MutableAsyncChain]
+ last_result_is_async = isinstance(result, AsyncIterable)
+ if last_result_is_async:
+ recovered = MutableAsyncChain()
+ else:
+ recovered = MutableChain()
+
+ # There are three cases for the middleware: def foo, async def foo, def foo + async def foo_async.
+ # 1. def foo. Sync iterables are passed as is, async ones are downgraded.
+ # 2. async def foo. Sync iterables are upgraded, async ones are passed as is.
+ # 3. def foo + async def foo_async. Iterables are passed to the respective method.
+ # Storing methods and method tuples in the same list is weird but we should be able to roll this back
+ # when we drop this compatibility feature.
+
+ method_list = islice(self.methods["process_spider_output"], start_index, None)
+ for method_index, method_pair in enumerate(method_list, start=start_index):
+ if method_pair is None:
+ continue
+ need_upgrade = need_downgrade = False
+ if isinstance(method_pair, tuple):
+ # This tuple handling is only needed until _async compatibility methods are removed.
+ method_sync, method_async = method_pair
+ method = method_async if last_result_is_async else method_sync
+ else:
+ method = method_pair
+ if not last_result_is_async and isasyncgenfunction(method):
+ need_upgrade = True
+ elif last_result_is_async and not isasyncgenfunction(method):
+ need_downgrade = True
+ try:
+ if need_upgrade:
+ # Iterable -> AsyncIterable
+ result = as_async_generator(result)
+ elif need_downgrade:
+ if not self.downgrade_warning_done:
+ logger.warning(
+ f"Async iterable passed to {method.__qualname__} "
+ f"was downgraded to a non-async one"
+ )
+ self.downgrade_warning_done = True
+ assert isinstance(result, AsyncIterable)
+ # AsyncIterable -> Iterable
+ result = yield deferred_from_coro(collect_asyncgen(result))
+ if isinstance(recovered, AsyncIterable):
+ recovered_collected = yield deferred_from_coro(
+ collect_asyncgen(recovered)
+ )
+ recovered = MutableChain(recovered_collected)
+ # might fail directly if the output value is not a generator
+ result = method(response=response, result=result, spider=spider)
+ except Exception as ex:
+ exception_result = self._process_spider_exception(
+ response, spider, Failure(ex), method_index + 1
+ )
+ if isinstance(exception_result, Failure):
+ raise
+ return exception_result
+ if _isiterable(result):
+ result = self._evaluate_iterable(
+ response, spider, result, method_index + 1, recovered
+ )
+ else:
+ if iscoroutine(result):
+ result.close() # Silence warning about not awaiting
+ msg = (
+ f"{method.__qualname__} must be an asynchronous "
+ f"generator (i.e. use yield)"
+ )
+ else:
+ msg = (
+ f"{method.__qualname__} must return an iterable, got "
+ f"{type(result)}"
+ )
+ raise _InvalidOutput(msg)
+ last_result_is_async = isinstance(result, AsyncIterable)
+
+ if last_result_is_async:
+ return MutableAsyncChain(result, recovered)
+ return MutableChain(result, recovered) # type: ignore[arg-type]
+
+ async def _process_callback_output(
+ self, response: Response, spider: Spider, result: Union[Iterable, AsyncIterable]
+ ) -> Union[MutableChain, MutableAsyncChain]:
+ recovered: Union[MutableChain, MutableAsyncChain]
+ if isinstance(result, AsyncIterable):
+ recovered = MutableAsyncChain()
+ else:
+ recovered = MutableChain()
+ result = self._evaluate_iterable(response, spider, result, 0, recovered)
+ result = await maybe_deferred_to_future(
+ self._process_spider_output(response, spider, result)
+ )
+ if isinstance(result, AsyncIterable):
+ return MutableAsyncChain(result, recovered)
+ if isinstance(recovered, AsyncIterable):
+ recovered_collected = await collect_asyncgen(recovered)
+ recovered = MutableChain(recovered_collected)
+ return MutableChain(result, recovered)
+
+ def scrape_response(
+ self,
+ scrape_func: ScrapeFunc,
+ response: Response,
+ request: Request,
+ spider: Spider,
+ ) -> Deferred:
+ async def process_callback_output(
+ result: Union[Iterable, AsyncIterable]
+ ) -> Union[MutableChain, MutableAsyncChain]:
+ return await self._process_callback_output(response, spider, result)
+
+ def process_spider_exception(_failure: Failure) -> Union[Failure, MutableChain]:
+ return self._process_spider_exception(response, spider, _failure)
+
+ dfd = mustbe_deferred(
+ self._process_spider_input, scrape_func, response, request, spider
+ )
+ dfd.addCallbacks(
+ callback=deferred_f_from_coro_f(process_callback_output),
+ errback=process_spider_exception,
+ )
+ return dfd
+
+ def process_start_requests(
+ self, start_requests: Iterable[Request], spider: Spider
+ ) -> Deferred:
+ return self._process_chain("process_start_requests", start_requests, spider)
+
+ # This method is only needed until _async compatibility methods are removed.
+ @staticmethod
+ def _get_async_method_pair(
+ mw: Any, methodname: str
+ ) -> Union[None, Callable, Tuple[Callable, Callable]]:
+ normal_method: Optional[Callable] = getattr(mw, methodname, None)
+ methodname_async = methodname + "_async"
+ async_method: Optional[Callable] = getattr(mw, methodname_async, None)
+ if not async_method:
+ return normal_method
+ if not normal_method:
+ logger.error(
+ f"Middleware {mw.__qualname__} has {methodname_async} "
+ f"without {methodname}, skipping this method."
+ )
+ return None
+ if not isasyncgenfunction(async_method):
+ logger.error(
+ f"{async_method.__qualname__} is not "
+ f"an async generator function, skipping this method."
+ )
+ return normal_method
+ if isasyncgenfunction(normal_method):
+ logger.error(
+ f"{normal_method.__qualname__} is an async "
+ f"generator function while {methodname_async} exists, "
+ f"skipping both methods."
+ )
+ return None
+ return normal_method, async_method
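
The manager code above resolves each middleware's process_spider_output into either a single method or a (sync, async) pair, so chains that already produce async iterables do not have to be downgraded. Below is a minimal sketch (not part of the patch; the class name is illustrative) of a middleware that provides both variants:

# Hypothetical "universal" spider middleware: the manager above picks the
# sync or async variant depending on what the previous step in the chain produced.
class UniversalSpiderMiddleware:
    def process_spider_output(self, response, result, spider):
        # invoked when the incoming result is a regular iterable
        for item_or_request in result:
            yield item_or_request

    async def process_spider_output_async(self, response, result, spider):
        # invoked when the incoming result is an async iterable
        async for item_or_request in result:
            yield item_or_request
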
diff --git a/scrapy/crawler.py b/scrapy/crawler.py
index 4271118c2..6f54e62e9 100644
--- a/scrapy/crawler.py
+++ b/scrapy/crawler.py
@@ -1,16 +1,27 @@
from __future__ import annotations
+
import logging
import pprint
import signal
import warnings
from typing import TYPE_CHECKING, Any, Dict, Generator, Optional, Set, Type, Union, cast
-from twisted.internet.defer import Deferred, DeferredList, inlineCallbacks, maybeDeferred
+
+from twisted.internet.defer import (
+ Deferred,
+ DeferredList,
+ inlineCallbacks,
+ maybeDeferred,
+)
from zope.interface.exceptions import DoesNotImplement
+
try:
+ # zope >= 5.0 only supports MultipleInvalid
from zope.interface.exceptions import MultipleInvalid
except ImportError:
MultipleInvalid = None
+
from zope.interface.verify import verifyClass
+
from scrapy import Spider, signals
from scrapy.addons import AddonManager
from scrapy.core.engine import ExecutionEngine
@@ -21,33 +32,55 @@ from scrapy.logformatter import LogFormatter
from scrapy.settings import BaseSettings, Settings, overridden_settings
from scrapy.signalmanager import SignalManager
from scrapy.statscollectors import StatsCollector
-from scrapy.utils.log import LogCounterHandler, configure_logging, get_scrapy_root_handler, install_scrapy_root_handler, log_reactor_info, log_scrapy_info
+from scrapy.utils.log import (
+ LogCounterHandler,
+ configure_logging,
+ get_scrapy_root_handler,
+ install_scrapy_root_handler,
+ log_reactor_info,
+ log_scrapy_info,
+)
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.ossignal import install_shutdown_handlers, signal_names
-from scrapy.utils.reactor import install_reactor, is_asyncio_reactor_installed, verify_installed_asyncio_event_loop, verify_installed_reactor
+from scrapy.utils.reactor import (
+ install_reactor,
+ is_asyncio_reactor_installed,
+ verify_installed_asyncio_event_loop,
+ verify_installed_reactor,
+)
+
if TYPE_CHECKING:
from scrapy.utils.request import RequestFingerprinter
+
+
logger = logging.getLogger(__name__)
class Crawler:
-
- def __init__(self, spidercls: Type[Spider], settings: Union[None, Dict[
- str, Any], Settings]=None, init_reactor: bool=False):
+ def __init__(
+ self,
+ spidercls: Type[Spider],
+ settings: Union[None, Dict[str, Any], Settings] = None,
+ init_reactor: bool = False,
+ ):
if isinstance(spidercls, Spider):
- raise ValueError(
- 'The spidercls argument must be a class, not an object')
+ raise ValueError("The spidercls argument must be a class, not an object")
+
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
+
self.spidercls: Type[Spider] = spidercls
self.settings: Settings = settings.copy()
self.spidercls.update_settings(self.settings)
self._update_root_log_handler()
+
self.addons: AddonManager = AddonManager(self)
self.signals: SignalManager = SignalManager(self)
+
self._init_reactor: bool = init_reactor
self.crawling: bool = False
self._started: bool = False
+
self.extensions: Optional[ExtensionManager] = None
self.stats: Optional[StatsCollector] = None
self.logformatter: Optional[LogFormatter] = None
@@ -55,11 +88,97 @@ class Crawler:
self.spider: Optional[Spider] = None
self.engine: Optional[ExecutionEngine] = None
+ def _update_root_log_handler(self) -> None:
+ if get_scrapy_root_handler() is not None:
+ # scrapy root handler already installed: update it with new settings
+ install_scrapy_root_handler(self.settings)
+
+ def _apply_settings(self) -> None:
+ if self.settings.frozen:
+ return
+
+ self.addons.load_settings(self.settings)
+ self.stats = load_object(self.settings["STATS_CLASS"])(self)
+
+ handler = LogCounterHandler(self, level=self.settings.get("LOG_LEVEL"))
+ logging.root.addHandler(handler)
+ # lambda is assigned to Crawler attribute because this way it is not
+ # garbage collected after leaving the scope
+ self.__remove_handler = lambda: logging.root.removeHandler(handler)
+ self.signals.connect(self.__remove_handler, signals.engine_stopped)
+
+ lf_cls: Type[LogFormatter] = load_object(self.settings["LOG_FORMATTER"])
+ self.logformatter = lf_cls.from_crawler(self)
+
+ self.request_fingerprinter = create_instance(
+ load_object(self.settings["REQUEST_FINGERPRINTER_CLASS"]),
+ settings=self.settings,
+ crawler=self,
+ )
+
+ reactor_class: str = self.settings["TWISTED_REACTOR"]
+ event_loop: str = self.settings["ASYNCIO_EVENT_LOOP"]
+ if self._init_reactor:
+ # this needs to be done after the spider settings are merged,
+ # but before something imports twisted.internet.reactor
+ if reactor_class:
+ install_reactor(reactor_class, event_loop)
+ else:
+ from twisted.internet import reactor # noqa: F401
+ log_reactor_info()
+ if reactor_class:
+ verify_installed_reactor(reactor_class)
+ if is_asyncio_reactor_installed() and event_loop:
+ verify_installed_asyncio_event_loop(event_loop)
+
+ self.extensions = ExtensionManager.from_crawler(self)
+ self.settings.freeze()
+
+ d = dict(overridden_settings(self.settings))
+ logger.info(
+ "Overridden settings:\n%(settings)s", {"settings": pprint.pformat(d)}
+ )
+
+ @inlineCallbacks
+ def crawl(self, *args: Any, **kwargs: Any) -> Generator[Deferred, Any, None]:
+ if self.crawling:
+ raise RuntimeError("Crawling already taking place")
+ if self._started:
+ warnings.warn(
+ "Running Crawler.crawl() more than once is deprecated.",
+ ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+ self.crawling = self._started = True
+
+ try:
+ self.spider = self._create_spider(*args, **kwargs)
+ self._apply_settings()
+ self._update_root_log_handler()
+ self.engine = self._create_engine()
+ start_requests = iter(self.spider.start_requests())
+ yield self.engine.open_spider(self.spider, start_requests)
+ yield maybeDeferred(self.engine.start)
+ except Exception:
+ self.crawling = False
+ if self.engine is not None:
+ yield self.engine.close()
+ raise
+
+ def _create_spider(self, *args: Any, **kwargs: Any) -> Spider:
+ return self.spidercls.from_crawler(self, *args, **kwargs)
+
+ def _create_engine(self) -> ExecutionEngine:
+ return ExecutionEngine(self, lambda _: self.stop())
+
@inlineCallbacks
- def stop(self) ->Generator[Deferred, Any, None]:
+ def stop(self) -> Generator[Deferred, Any, None]:
"""Starts a graceful stop of the crawler and returns a deferred that is
fired when the crawler is stopped."""
- pass
+ if self.crawling:
+ self.crawling = False
+ assert self.engine
+ yield maybeDeferred(self.engine.stop)
class CrawlerRunner:
@@ -74,16 +193,34 @@ class CrawlerRunner:
accordingly) unless writing scripts that manually handle the crawling
process. See :ref:`run-from-script` for an example.
"""
- crawlers = property(lambda self: self._crawlers, doc=
- 'Set of :class:`crawlers <scrapy.crawler.Crawler>` started by :meth:`crawl` and managed by this class.'
- )
+
+ crawlers = property(
+ lambda self: self._crawlers,
+ doc="Set of :class:`crawlers <scrapy.crawler.Crawler>` started by "
+ ":meth:`crawl` and managed by this class.",
+ )
@staticmethod
def _get_spider_loader(settings: BaseSettings):
"""Get SpiderLoader instance from settings"""
- pass
-
- def __init__(self, settings: Union[Dict[str, Any], Settings, None]=None):
+ cls_path = settings.get("SPIDER_LOADER_CLASS")
+ loader_cls = load_object(cls_path)
+ excs = (
+ (DoesNotImplement, MultipleInvalid) if MultipleInvalid else DoesNotImplement
+ )
+ try:
+ verifyClass(ISpiderLoader, loader_cls)
+ except excs:
+ warnings.warn(
+ "SPIDER_LOADER_CLASS (previously named SPIDER_MANAGER_CLASS) does "
+ "not fully implement scrapy.interfaces.ISpiderLoader interface. "
+ "Please add all missing methods to avoid unexpected runtime errors.",
+ category=ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+ return loader_cls.from_settings(settings.frozencopy())
+
+ def __init__(self, settings: Union[Dict[str, Any], Settings, None] = None):
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
self.settings = settings
@@ -92,8 +229,12 @@ class CrawlerRunner:
self._active: Set[Deferred] = set()
self.bootstrap_failed = False
- def crawl(self, crawler_or_spidercls: Union[Type[Spider], str, Crawler],
- *args: Any, **kwargs: Any) ->Deferred:
+ def crawl(
+ self,
+ crawler_or_spidercls: Union[Type[Spider], str, Crawler],
+ *args: Any,
+ **kwargs: Any,
+ ) -> Deferred:
"""
Run a crawler with the provided arguments.
@@ -115,10 +256,30 @@ class CrawlerRunner:
:param kwargs: keyword arguments to initialize the spider
"""
- pass
-
- def create_crawler(self, crawler_or_spidercls: Union[Type[Spider], str,
- Crawler]) ->Crawler:
+ if isinstance(crawler_or_spidercls, Spider):
+ raise ValueError(
+ "The crawler_or_spidercls argument cannot be a spider object, "
+ "it must be a spider class (or a Crawler object)"
+ )
+ crawler = self.create_crawler(crawler_or_spidercls)
+ return self._crawl(crawler, *args, **kwargs)
+
+ def _crawl(self, crawler: Crawler, *args: Any, **kwargs: Any) -> Deferred:
+ self.crawlers.add(crawler)
+ d = crawler.crawl(*args, **kwargs)
+ self._active.add(d)
+
+ def _done(result: Any) -> Any:
+ self.crawlers.discard(crawler)
+ self._active.discard(d)
+ self.bootstrap_failed |= not getattr(crawler, "spider", None)
+ return result
+
+ return d.addBoth(_done)
+
+ def create_crawler(
+ self, crawler_or_spidercls: Union[Type[Spider], str, Crawler]
+ ) -> Crawler:
"""
Return a :class:`~scrapy.crawler.Crawler` object.
@@ -129,25 +290,39 @@ class CrawlerRunner:
a spider with this name in a Scrapy project (using spider loader),
then creates a Crawler instance for it.
"""
- pass
-
- def stop(self) ->Deferred:
+ if isinstance(crawler_or_spidercls, Spider):
+ raise ValueError(
+ "The crawler_or_spidercls argument cannot be a spider object, "
+ "it must be a spider class (or a Crawler object)"
+ )
+ if isinstance(crawler_or_spidercls, Crawler):
+ return crawler_or_spidercls
+ return self._create_crawler(crawler_or_spidercls)
+
+ def _create_crawler(self, spidercls: Union[str, Type[Spider]]) -> Crawler:
+ if isinstance(spidercls, str):
+ spidercls = self.spider_loader.load(spidercls)
+ # temporary cast until self.spider_loader is typed
+ return Crawler(cast(Type[Spider], spidercls), self.settings)
+
+ def stop(self) -> Deferred:
"""
Stops simultaneously all the crawling jobs taking place.
Returns a deferred that is fired when they all have ended.
"""
- pass
+ return DeferredList([c.stop() for c in list(self.crawlers)])
@inlineCallbacks
- def join(self) ->Generator[Deferred, Any, None]:
+ def join(self) -> Generator[Deferred, Any, None]:
"""
join()
Returns a deferred that is fired when all managed :attr:`crawlers` have
completed their executions.
"""
- pass
+ while self._active:
+ yield DeferredList(self._active)
class CrawlerProcess(CrawlerRunner):
@@ -174,15 +349,50 @@ class CrawlerProcess(CrawlerRunner):
process. See :ref:`run-from-script` for an example.
"""
- def __init__(self, settings: Union[Dict[str, Any], Settings, None]=None,
- install_root_handler: bool=True):
+ def __init__(
+ self,
+ settings: Union[Dict[str, Any], Settings, None] = None,
+ install_root_handler: bool = True,
+ ):
super().__init__(settings)
configure_logging(self.settings, install_root_handler)
log_scrapy_info(self.settings)
self._initialized_reactor = False
- def start(self, stop_after_crawl: bool=True, install_signal_handlers:
- bool=True) ->None:
+ def _signal_shutdown(self, signum: int, _: Any) -> None:
+ from twisted.internet import reactor
+
+ install_shutdown_handlers(self._signal_kill)
+ signame = signal_names[signum]
+ logger.info(
+ "Received %(signame)s, shutting down gracefully. Send again to force ",
+ {"signame": signame},
+ )
+ reactor.callFromThread(self._graceful_stop_reactor)
+
+ def _signal_kill(self, signum: int, _: Any) -> None:
+ from twisted.internet import reactor
+
+ install_shutdown_handlers(signal.SIG_IGN)
+ signame = signal_names[signum]
+ logger.info(
+ "Received %(signame)s twice, forcing unclean shutdown", {"signame": signame}
+ )
+ reactor.callFromThread(self._stop_reactor)
+
+ def _create_crawler(self, spidercls: Union[Type[Spider], str]) -> Crawler:
+ if isinstance(spidercls, str):
+ spidercls = self.spider_loader.load(spidercls)
+ init_reactor = not self._initialized_reactor
+ self._initialized_reactor = True
+ # temporary cast until self.spider_loader is typed
+ return Crawler(
+ cast(Type[Spider], spidercls), self.settings, init_reactor=init_reactor
+ )
+
+ def start(
+ self, stop_after_crawl: bool = True, install_signal_handlers: bool = True
+ ) -> None:
"""
This method starts a :mod:`~twisted.internet.reactor`, adjusts its pool
size to :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache
@@ -197,4 +407,36 @@ class CrawlerProcess(CrawlerRunner):
:param bool install_signal_handlers: whether to install the OS signal
handlers from Twisted and Scrapy (default: True)
"""
- pass
+ from twisted.internet import reactor
+
+ if stop_after_crawl:
+ d = self.join()
+ # Don't start the reactor if the deferreds are already fired
+ if d.called:
+ return
+ d.addBoth(self._stop_reactor)
+
+ resolver_class = load_object(self.settings["DNS_RESOLVER"])
+ resolver = create_instance(resolver_class, self.settings, self, reactor=reactor)
+ resolver.install_on_reactor()
+ tp = reactor.getThreadPool()
+ tp.adjustPoolsize(maxthreads=self.settings.getint("REACTOR_THREADPOOL_MAXSIZE"))
+ reactor.addSystemEventTrigger("before", "shutdown", self.stop)
+ if install_signal_handlers:
+ reactor.addSystemEventTrigger(
+ "after", "startup", install_shutdown_handlers, self._signal_shutdown
+ )
+ reactor.run(installSignalHandlers=install_signal_handlers) # blocking call
+
+ def _graceful_stop_reactor(self) -> Deferred:
+ d = self.stop()
+ d.addBoth(self._stop_reactor)
+ return d
+
+ def _stop_reactor(self, _: Any = None) -> None:
+ from twisted.internet import reactor
+
+ try:
+ reactor.stop()
+ except RuntimeError: # raised if already stopped or in shutdown stage
+ pass
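
CrawlerProcess.start() above installs the DNS resolver, sizes the reactor thread pool and registers shutdown handlers before blocking on reactor.run(). A minimal run-from-script sketch using this API (the spider and settings values are illustrative, not part of the patch):

import scrapy
from scrapy.crawler import CrawlerProcess


class ExampleSpider(scrapy.Spider):
    name = "example"  # placeholder spider
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield {"title": response.css("title::text").get()}


if __name__ == "__main__":
    process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
    process.crawl(ExampleSpider)
    process.start()  # blocks until the crawl finishes, then stops the reactor
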
diff --git a/scrapy/downloadermiddlewares/ajaxcrawl.py b/scrapy/downloadermiddlewares/ajaxcrawl.py
index c8df05f69..04ae719de 100644
--- a/scrapy/downloadermiddlewares/ajaxcrawl.py
+++ b/scrapy/downloadermiddlewares/ajaxcrawl.py
@@ -1,8 +1,11 @@
import logging
import re
+
from w3lib import html
+
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
+
logger = logging.getLogger(__name__)
@@ -13,20 +16,57 @@ class AjaxCrawlMiddleware:
"""
def __init__(self, settings):
- if not settings.getbool('AJAXCRAWL_ENABLED'):
+ if not settings.getbool("AJAXCRAWL_ENABLED"):
raise NotConfigured
- self.lookup_bytes = settings.getint('AJAXCRAWL_MAXSIZE', 32768)
+
+ # XXX: Google parses at least first 100k bytes; scrapy's redirect
+ # middleware parses first 4k. 4k turns out to be insufficient
+ # for this middleware, and parsing 100k could be slow.
+ # We use something in between (32K) by default.
+ self.lookup_bytes = settings.getint("AJAXCRAWL_MAXSIZE", 32768)
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler.settings)
+
+ def process_response(self, request, response, spider):
+ if not isinstance(response, HtmlResponse) or response.status != 200:
+ return response
+
+ if request.method != "GET":
+ # other HTTP methods are either not safe or don't have a body
+ return response
+
+ if "ajax_crawlable" in request.meta: # prevent loops
+ return response
+
+ if not self._has_ajax_crawlable_variant(response):
+ return response
+
+ # scrapy already handles #! links properly
+ ajax_crawl_request = request.replace(url=request.url + "#!")
+ logger.debug(
+ "Downloading AJAX crawlable %(ajax_crawl_request)s instead of %(request)s",
+ {"ajax_crawl_request": ajax_crawl_request, "request": request},
+ extra={"spider": spider},
+ )
+
+ ajax_crawl_request.meta["ajax_crawlable"] = True
+ return ajax_crawl_request
def _has_ajax_crawlable_variant(self, response):
"""
Return True if a page without hash fragment could be "AJAX crawlable"
according to https://developers.google.com/webmasters/ajax-crawling/docs/getting-started.
"""
- pass
+ body = response.text[: self.lookup_bytes]
+ return _has_ajaxcrawlable_meta(body)
+# XXX: move it to w3lib?
_ajax_crawlable_re = re.compile(
- '<meta\\s+name=["\\\']fragment["\\\']\\s+content=["\\\']!["\\\']/?>')
+ r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']/?>'
+)
def _has_ajaxcrawlable_meta(text):
@@ -40,4 +80,16 @@ def _has_ajaxcrawlable_meta(text):
>>> _has_ajaxcrawlable_meta('<html></html>')
False
"""
- pass
+
+ # Stripping scripts and comments is slow (about 20x slower than
+ # just checking if a string is in text); this is a quick fail-fast
+ # path that should work for most pages.
+ if "fragment" not in text:
+ return False
+ if "content" not in text:
+ return False
+
+ text = html.remove_tags_with_content(text, ("script", "noscript"))
+ text = html.replace_entities(text)
+ text = html.remove_comments(text)
+ return _ajax_crawlable_re.search(text) is not None
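
The middleware above only rewrites GET requests whose 200 HTML response carries the <meta name="fragment" content="!"> marker, retrying them with "#!" appended to the URL. A quick check of the detection helper, mirroring its doctests:

from scrapy.downloadermiddlewares.ajaxcrawl import _has_ajaxcrawlable_meta

# pages advertising the AJAX-crawlable scheme are detected...
assert _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head></html>')
# ...plain pages are not
assert not _has_ajaxcrawlable_meta("<html></html>")
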
diff --git a/scrapy/downloadermiddlewares/cookies.py b/scrapy/downloadermiddlewares/cookies.py
index 3b97e0653..6495157d7 100644
--- a/scrapy/downloadermiddlewares/cookies.py
+++ b/scrapy/downloadermiddlewares/cookies.py
@@ -1,15 +1,25 @@
import logging
from collections import defaultdict
+
from tldextract import TLDExtract
+
from scrapy.exceptions import NotConfigured
from scrapy.http import Response
from scrapy.http.cookies import CookieJar
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_unicode
+
logger = logging.getLogger(__name__)
+
+
_split_domain = TLDExtract(include_psl_private_domains=True)
+def _is_public_domain(domain):
+ parts = _split_domain(domain)
+ return not parts.domain
+
+
class CookiesMiddleware:
"""This middleware enables working with sites that need cookies"""
@@ -17,15 +27,118 @@ class CookiesMiddleware:
self.jars = defaultdict(CookieJar)
self.debug = debug
+ @classmethod
+ def from_crawler(cls, crawler):
+ if not crawler.settings.getbool("COOKIES_ENABLED"):
+ raise NotConfigured
+ return cls(crawler.settings.getbool("COOKIES_DEBUG"))
+
+ def _process_cookies(self, cookies, *, jar, request):
+ for cookie in cookies:
+ cookie_domain = cookie.domain
+ if cookie_domain.startswith("."):
+ cookie_domain = cookie_domain[1:]
+
+ request_domain = urlparse_cached(request).hostname.lower()
+
+ if cookie_domain and _is_public_domain(cookie_domain):
+ if cookie_domain != request_domain:
+ continue
+ cookie.domain = request_domain
+
+ jar.set_cookie_if_ok(cookie, request)
+
+ def process_request(self, request, spider):
+ if request.meta.get("dont_merge_cookies", False):
+ return
+
+ cookiejarkey = request.meta.get("cookiejar")
+ jar = self.jars[cookiejarkey]
+ cookies = self._get_request_cookies(jar, request)
+ self._process_cookies(cookies, jar=jar, request=request)
+
+ # set Cookie header
+ request.headers.pop("Cookie", None)
+ jar.add_cookie_header(request)
+ self._debug_cookie(request, spider)
+
+ def process_response(self, request, response, spider):
+ if request.meta.get("dont_merge_cookies", False):
+ return response
+
+ # extract cookies from Set-Cookie and drop invalid/expired cookies
+ cookiejarkey = request.meta.get("cookiejar")
+ jar = self.jars[cookiejarkey]
+ cookies = jar.make_cookies(response, request)
+ self._process_cookies(cookies, jar=jar, request=request)
+
+ self._debug_set_cookie(response, spider)
+
+ return response
+
+ def _debug_cookie(self, request, spider):
+ if self.debug:
+ cl = [
+ to_unicode(c, errors="replace")
+ for c in request.headers.getlist("Cookie")
+ ]
+ if cl:
+ cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
+ msg = f"Sending cookies to: {request}\n{cookies}"
+ logger.debug(msg, extra={"spider": spider})
+
+ def _debug_set_cookie(self, response, spider):
+ if self.debug:
+ cl = [
+ to_unicode(c, errors="replace")
+ for c in response.headers.getlist("Set-Cookie")
+ ]
+ if cl:
+ cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
+ msg = f"Received cookies from: {response}\n{cookies}"
+ logger.debug(msg, extra={"spider": spider})
+
def _format_cookie(self, cookie, request):
"""
Given a dict consisting of cookie components, return its string representation.
Decode from bytes if necessary.
"""
- pass
+ decoded = {}
+ for key in ("name", "value", "path", "domain"):
+ if cookie.get(key) is None:
+ if key in ("name", "value"):
+ msg = f"Invalid cookie found in request {request}: {cookie} ('{key}' is missing)"
+ logger.warning(msg)
+ return
+ continue
+ if isinstance(cookie[key], (bool, float, int, str)):
+ decoded[key] = str(cookie[key])
+ else:
+ try:
+ decoded[key] = cookie[key].decode("utf8")
+ except UnicodeDecodeError:
+ logger.warning(
+ "Non UTF-8 encoded cookie found in request %s: %s",
+ request,
+ cookie,
+ )
+ decoded[key] = cookie[key].decode("latin1", errors="replace")
+
+ cookie_str = f"{decoded.pop('name')}={decoded.pop('value')}"
+ for key, value in decoded.items(): # path, domain
+ cookie_str += f"; {key.capitalize()}={value}"
+ return cookie_str
def _get_request_cookies(self, jar, request):
"""
Extract cookies from the Request.cookies attribute
"""
- pass
+ if not request.cookies:
+ return []
+ if isinstance(request.cookies, dict):
+ cookies = ({"name": k, "value": v} for k, v in request.cookies.items())
+ else:
+ cookies = request.cookies
+ formatted = filter(None, (self._format_cookie(c, request) for c in cookies))
+ response = Response(request.url, headers={"Set-Cookie": formatted})
+ return jar.make_cookies(response, request)
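
_format_cookie() above normalizes dict-style request cookies into "name=value" strings (plus optional Path/Domain attributes) before they are replayed through a synthetic Set-Cookie response. A small sketch of that normalization; the cookie values are illustrative:

from scrapy.downloadermiddlewares.cookies import CookiesMiddleware
from scrapy.http import Request

mw = CookiesMiddleware()
request = Request(
    "https://example.com",
    cookies=[{"name": "currency", "value": "USD", "path": "/", "domain": "example.com"}],
)
# expected output: currency=USD; Path=/; Domain=example.com
print(mw._format_cookie(request.cookies[0], request))
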
diff --git a/scrapy/downloadermiddlewares/defaultheaders.py b/scrapy/downloadermiddlewares/defaultheaders.py
index d104ee821..cdacc7368 100644
--- a/scrapy/downloadermiddlewares/defaultheaders.py
+++ b/scrapy/downloadermiddlewares/defaultheaders.py
@@ -3,10 +3,19 @@ DefaultHeaders downloader middleware
See documentation in docs/topics/downloader-middleware.rst
"""
+
from scrapy.utils.python import without_none_values
class DefaultHeadersMiddleware:
-
def __init__(self, headers):
self._headers = headers
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ headers = without_none_values(crawler.settings["DEFAULT_REQUEST_HEADERS"])
+ return cls(headers.items())
+
+ def process_request(self, request, spider):
+ for k, v in self._headers:
+ request.headers.setdefault(k, v)
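
process_request() above uses Headers.setdefault(), so headers already present on a request are never overwritten by the configured defaults. A sketch of that behaviour (header values are illustrative):

from scrapy.downloadermiddlewares.defaultheaders import DefaultHeadersMiddleware
from scrapy.http import Request

mw = DefaultHeadersMiddleware({"Accept-Language": "en"}.items())
request = Request("https://example.com", headers={"Accept-Language": "de"})
mw.process_request(request, spider=None)
assert request.headers.get("Accept-Language") == b"de"  # per-request value wins
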
diff --git a/scrapy/downloadermiddlewares/downloadtimeout.py b/scrapy/downloadermiddlewares/downloadtimeout.py
index 222bc5eb8..a926ecf56 100644
--- a/scrapy/downloadermiddlewares/downloadtimeout.py
+++ b/scrapy/downloadermiddlewares/downloadtimeout.py
@@ -3,10 +3,23 @@ Download timeout middleware
See documentation in docs/topics/downloader-middleware.rst
"""
+
from scrapy import signals
class DownloadTimeoutMiddleware:
-
def __init__(self, timeout=180):
self._timeout = timeout
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ o = cls(crawler.settings.getfloat("DOWNLOAD_TIMEOUT"))
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ return o
+
+ def spider_opened(self, spider):
+ self._timeout = getattr(spider, "download_timeout", self._timeout)
+
+ def process_request(self, request, spider):
+ if self._timeout:
+ request.meta.setdefault("download_timeout", self._timeout)
diff --git a/scrapy/downloadermiddlewares/httpauth.py b/scrapy/downloadermiddlewares/httpauth.py
index ec0f5cc32..de5a81388 100644
--- a/scrapy/downloadermiddlewares/httpauth.py
+++ b/scrapy/downloadermiddlewares/httpauth.py
@@ -4,7 +4,9 @@ HTTP basic auth downloader middleware
See documentation in docs/topics/downloader-middleware.rst
"""
import warnings
+
from w3lib.http import basic_auth_header
+
from scrapy import signals
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.httpobj import urlparse_cached
@@ -14,3 +16,37 @@ from scrapy.utils.url import url_is_from_any_domain
class HttpAuthMiddleware:
"""Set Basic HTTP Authorization header
(http_user and http_pass spider class attributes)"""
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ o = cls()
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ return o
+
+ def spider_opened(self, spider):
+ usr = getattr(spider, "http_user", "")
+ pwd = getattr(spider, "http_pass", "")
+ if usr or pwd:
+ self.auth = basic_auth_header(usr, pwd)
+ if not hasattr(spider, "http_auth_domain"):
+ warnings.warn(
+ "Using HttpAuthMiddleware without http_auth_domain is deprecated and can cause security "
+ "problems if the spider makes requests to several different domains. http_auth_domain "
+ "will be set to the domain of the first request, please set it to the correct value "
+ "explicitly.",
+ category=ScrapyDeprecationWarning,
+ )
+ self.domain_unset = True
+ else:
+ self.domain = spider.http_auth_domain
+ self.domain_unset = False
+
+ def process_request(self, request, spider):
+ auth = getattr(self, "auth", None)
+ if auth and b"Authorization" not in request.headers:
+ domain = urlparse_cached(request).hostname
+ if self.domain_unset:
+ self.domain = domain
+ self.domain_unset = False
+ if not self.domain or url_is_from_any_domain(request.url, [self.domain]):
+ request.headers[b"Authorization"] = auth
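
HttpAuthMiddleware above reads its credentials from spider attributes at spider_opened time and, per the deprecation warning, expects http_auth_domain to be set explicitly so the Authorization header is only sent to that domain. A hedged spider sketch (names and URLs are placeholders):

import scrapy


class AuthedSpider(scrapy.Spider):
    name = "authed"
    http_user = "user"                          # combined into a Basic auth header
    http_pass = "secret"
    http_auth_domain = "intranet.example.com"   # restricts which requests get the header
    start_urls = ["https://intranet.example.com/"]

    def parse(self, response):
        yield {"status": response.status}
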
diff --git a/scrapy/downloadermiddlewares/httpcache.py b/scrapy/downloadermiddlewares/httpcache.py
index 2ffedebca..a521cde7a 100644
--- a/scrapy/downloadermiddlewares/httpcache.py
+++ b/scrapy/downloadermiddlewares/httpcache.py
@@ -1,8 +1,18 @@
from email.utils import formatdate
from typing import Optional, Type, TypeVar
+
from twisted.internet import defer
-from twisted.internet.error import ConnectError, ConnectionDone, ConnectionLost, ConnectionRefusedError, DNSLookupError, TCPTimedOutError, TimeoutError
+from twisted.internet.error import (
+ ConnectError,
+ ConnectionDone,
+ ConnectionLost,
+ ConnectionRefusedError,
+ DNSLookupError,
+ TCPTimedOutError,
+ TimeoutError,
+)
from twisted.web.client import ResponseFailed
+
from scrapy import signals
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest, NotConfigured
@@ -12,19 +22,129 @@ from scrapy.settings import Settings
from scrapy.spiders import Spider
from scrapy.statscollectors import StatsCollector
from scrapy.utils.misc import load_object
-HttpCacheMiddlewareTV = TypeVar('HttpCacheMiddlewareTV', bound=
- 'HttpCacheMiddleware')
+
+HttpCacheMiddlewareTV = TypeVar("HttpCacheMiddlewareTV", bound="HttpCacheMiddleware")
class HttpCacheMiddleware:
- DOWNLOAD_EXCEPTIONS = (defer.TimeoutError, TimeoutError, DNSLookupError,
- ConnectionRefusedError, ConnectionDone, ConnectError,
- ConnectionLost, TCPTimedOutError, ResponseFailed, OSError)
+ DOWNLOAD_EXCEPTIONS = (
+ defer.TimeoutError,
+ TimeoutError,
+ DNSLookupError,
+ ConnectionRefusedError,
+ ConnectionDone,
+ ConnectError,
+ ConnectionLost,
+ TCPTimedOutError,
+ ResponseFailed,
+ OSError,
+ )
- def __init__(self, settings: Settings, stats: StatsCollector) ->None:
- if not settings.getbool('HTTPCACHE_ENABLED'):
+ def __init__(self, settings: Settings, stats: StatsCollector) -> None:
+ if not settings.getbool("HTTPCACHE_ENABLED"):
raise NotConfigured
- self.policy = load_object(settings['HTTPCACHE_POLICY'])(settings)
- self.storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
- self.ignore_missing = settings.getbool('HTTPCACHE_IGNORE_MISSING')
+ self.policy = load_object(settings["HTTPCACHE_POLICY"])(settings)
+ self.storage = load_object(settings["HTTPCACHE_STORAGE"])(settings)
+ self.ignore_missing = settings.getbool("HTTPCACHE_IGNORE_MISSING")
self.stats = stats
+
+ @classmethod
+ def from_crawler(
+ cls: Type[HttpCacheMiddlewareTV], crawler: Crawler
+ ) -> HttpCacheMiddlewareTV:
+ assert crawler.stats
+ o = cls(crawler.settings, crawler.stats)
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+ return o
+
+ def spider_opened(self, spider: Spider) -> None:
+ self.storage.open_spider(spider)
+
+ def spider_closed(self, spider: Spider) -> None:
+ self.storage.close_spider(spider)
+
+ def process_request(self, request: Request, spider: Spider) -> Optional[Response]:
+ if request.meta.get("dont_cache", False):
+ return None
+
+ # Skip uncacheable requests
+ if not self.policy.should_cache_request(request):
+ request.meta["_dont_cache"] = True # flag as uncacheable
+ return None
+
+ # Look for cached response and check if expired
+ cachedresponse = self.storage.retrieve_response(spider, request)
+ if cachedresponse is None:
+ self.stats.inc_value("httpcache/miss", spider=spider)
+ if self.ignore_missing:
+ self.stats.inc_value("httpcache/ignore", spider=spider)
+ raise IgnoreRequest(f"Ignored request not in cache: {request}")
+ return None # first time request
+
+ # Return cached response only if not expired
+ cachedresponse.flags.append("cached")
+ if self.policy.is_cached_response_fresh(cachedresponse, request):
+ self.stats.inc_value("httpcache/hit", spider=spider)
+ return cachedresponse
+
+ # Keep a reference to cached response to avoid a second cache lookup on
+ # process_response hook
+ request.meta["cached_response"] = cachedresponse
+
+ return None
+
+ def process_response(
+ self, request: Request, response: Response, spider: Spider
+ ) -> Response:
+ if request.meta.get("dont_cache", False):
+ return response
+
+ # Skip cached responses and uncacheable requests
+ if "cached" in response.flags or "_dont_cache" in request.meta:
+ request.meta.pop("_dont_cache", None)
+ return response
+
+ # RFC2616 requires origin server to set Date header,
+ # https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.18
+ if "Date" not in response.headers:
+ response.headers["Date"] = formatdate(usegmt=True)
+
+ # Do not validate first-hand responses
+ cachedresponse = request.meta.pop("cached_response", None)
+ if cachedresponse is None:
+ self.stats.inc_value("httpcache/firsthand", spider=spider)
+ self._cache_response(spider, response, request, cachedresponse)
+ return response
+
+ if self.policy.is_cached_response_valid(cachedresponse, response, request):
+ self.stats.inc_value("httpcache/revalidate", spider=spider)
+ return cachedresponse
+
+ self.stats.inc_value("httpcache/invalidate", spider=spider)
+ self._cache_response(spider, response, request, cachedresponse)
+ return response
+
+ def process_exception(
+ self, request: Request, exception: Exception, spider: Spider
+ ) -> Optional[Response]:
+ cachedresponse = request.meta.pop("cached_response", None)
+ if cachedresponse is not None and isinstance(
+ exception, self.DOWNLOAD_EXCEPTIONS
+ ):
+ self.stats.inc_value("httpcache/errorrecovery", spider=spider)
+ return cachedresponse
+ return None
+
+ def _cache_response(
+ self,
+ spider: Spider,
+ response: Response,
+ request: Request,
+ cachedresponse: Optional[Response],
+ ) -> None:
+ if self.policy.should_cache_response(response, request):
+ self.stats.inc_value("httpcache/store", spider=spider)
+ self.storage.store_response(spider, request, response)
+ else:
+ self.stats.inc_value("httpcache/uncacheable", spider=spider)
diff --git a/scrapy/downloadermiddlewares/httpcompression.py b/scrapy/downloadermiddlewares/httpcompression.py
index cc3614d22..816be25a1 100644
--- a/scrapy/downloadermiddlewares/httpcompression.py
+++ b/scrapy/downloadermiddlewares/httpcompression.py
@@ -1,26 +1,36 @@
import warnings
from logging import getLogger
+
from scrapy import signals
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Response, TextResponse
from scrapy.responsetypes import responsetypes
-from scrapy.utils._compression import _DecompressionMaxSizeExceeded, _inflate, _unbrotli, _unzstd
+from scrapy.utils._compression import (
+ _DecompressionMaxSizeExceeded,
+ _inflate,
+ _unbrotli,
+ _unzstd,
+)
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.gz import gunzip
+
logger = getLogger(__name__)
-ACCEPTED_ENCODINGS = [b'gzip', b'deflate']
+
+ACCEPTED_ENCODINGS = [b"gzip", b"deflate"]
+
try:
- import brotli
+ import brotli # noqa: F401
except ImportError:
pass
else:
- ACCEPTED_ENCODINGS.append(b'br')
+ ACCEPTED_ENCODINGS.append(b"br")
+
try:
- import zstandard
+ import zstandard # noqa: F401
except ImportError:
pass
else:
- ACCEPTED_ENCODINGS.append(b'zstd')
+ ACCEPTED_ENCODINGS.append(b"zstd")
class HttpCompressionMiddleware:
@@ -34,6 +44,94 @@ class HttpCompressionMiddleware:
self._warn_size = 33554432
return
self.stats = crawler.stats
- self._max_size = crawler.settings.getint('DOWNLOAD_MAXSIZE')
- self._warn_size = crawler.settings.getint('DOWNLOAD_WARNSIZE')
+ self._max_size = crawler.settings.getint("DOWNLOAD_MAXSIZE")
+ self._warn_size = crawler.settings.getint("DOWNLOAD_WARNSIZE")
crawler.signals.connect(self.open_spider, signals.spider_opened)
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ if not crawler.settings.getbool("COMPRESSION_ENABLED"):
+ raise NotConfigured
+ try:
+ return cls(crawler=crawler)
+ except TypeError:
+ warnings.warn(
+ "HttpCompressionMiddleware subclasses must either modify "
+ "their '__init__' method to support a 'crawler' parameter or "
+ "reimplement their 'from_crawler' method.",
+ ScrapyDeprecationWarning,
+ )
+ mw = cls()
+ mw.stats = crawler.stats
+ mw._max_size = crawler.settings.getint("DOWNLOAD_MAXSIZE")
+ mw._warn_size = crawler.settings.getint("DOWNLOAD_WARNSIZE")
+ crawler.signals.connect(mw.open_spider, signals.spider_opened)
+ return mw
+
+ def open_spider(self, spider):
+ if hasattr(spider, "download_maxsize"):
+ self._max_size = spider.download_maxsize
+ if hasattr(spider, "download_warnsize"):
+ self._warn_size = spider.download_warnsize
+
+ def process_request(self, request, spider):
+ request.headers.setdefault("Accept-Encoding", b", ".join(ACCEPTED_ENCODINGS))
+
+ def process_response(self, request, response, spider):
+ if request.method == "HEAD":
+ return response
+ if isinstance(response, Response):
+ content_encoding = response.headers.getlist("Content-Encoding")
+ if content_encoding:
+ encoding = content_encoding.pop()
+ max_size = request.meta.get("download_maxsize", self._max_size)
+ warn_size = request.meta.get("download_warnsize", self._warn_size)
+ try:
+ decoded_body = self._decode(
+ response.body, encoding.lower(), max_size
+ )
+ except _DecompressionMaxSizeExceeded:
+ raise IgnoreRequest(
+ f"Ignored response {response} because its body "
+ f"({len(response.body)} B) exceeded DOWNLOAD_MAXSIZE "
+ f"({max_size} B) during decompression."
+ )
+ if len(response.body) < warn_size <= len(decoded_body):
+ logger.warning(
+ f"{response} body size after decompression "
+ f"({len(decoded_body)} B) is larger than the "
+ f"download warning size ({warn_size} B)."
+ )
+ if self.stats:
+ self.stats.inc_value(
+ "httpcompression/response_bytes",
+ len(decoded_body),
+ spider=spider,
+ )
+ self.stats.inc_value(
+ "httpcompression/response_count", spider=spider
+ )
+ respcls = responsetypes.from_args(
+ headers=response.headers, url=response.url, body=decoded_body
+ )
+ kwargs = dict(cls=respcls, body=decoded_body)
+ if issubclass(respcls, TextResponse):
+ # force recalculating the encoding until we make sure the
+ # responsetypes guessing is reliable
+ kwargs["encoding"] = None
+ response = response.replace(**kwargs)
+ if not content_encoding:
+ del response.headers["Content-Encoding"]
+
+ return response
+
+ def _decode(self, body, encoding, max_size):
+ if encoding == b"gzip" or encoding == b"x-gzip":
+ return gunzip(body, max_size=max_size)
+ if encoding == b"deflate":
+ return _inflate(body, max_size=max_size)
+ if encoding == b"br" and b"br" in ACCEPTED_ENCODINGS:
+ return _unbrotli(body, max_size=max_size)
+ if encoding == b"zstd" and b"zstd" in ACCEPTED_ENCODINGS:
+ return _unzstd(body, max_size=max_size)
+ return body
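
_decode() above dispatches on the Content-Encoding value and passes the per-request size limit down to each decompressor. A small round trip through the gzip branch, using the same gunzip() helper the middleware calls:

import gzip

from scrapy.utils.gz import gunzip

body = gzip.compress(b"<html>hello</html>")
# max_size bounds the decompressed size, mirroring the DOWNLOAD_MAXSIZE enforcement above
assert gunzip(body, max_size=1024) == b"<html>hello</html>"
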
diff --git a/scrapy/downloadermiddlewares/httpproxy.py b/scrapy/downloadermiddlewares/httpproxy.py
index 3fb8b5cbc..522237674 100644
--- a/scrapy/downloadermiddlewares/httpproxy.py
+++ b/scrapy/downloadermiddlewares/httpproxy.py
@@ -1,18 +1,83 @@
import base64
from urllib.parse import unquote, urlunparse
from urllib.request import _parse_proxy, getproxies, proxy_bypass
+
from scrapy.exceptions import NotConfigured
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_bytes
class HttpProxyMiddleware:
-
- def __init__(self, auth_encoding='latin-1'):
+ def __init__(self, auth_encoding="latin-1"):
self.auth_encoding = auth_encoding
self.proxies = {}
for type_, url in getproxies().items():
try:
self.proxies[type_] = self._get_proxy(url, type_)
+ # some values such as '/var/run/docker.sock' can't be parsed
+ # by _parse_proxy and as such should be skipped
except ValueError:
continue
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ if not crawler.settings.getbool("HTTPPROXY_ENABLED"):
+ raise NotConfigured
+ auth_encoding = crawler.settings.get("HTTPPROXY_AUTH_ENCODING")
+ return cls(auth_encoding)
+
+ def _basic_auth_header(self, username, password):
+ user_pass = to_bytes(
+ f"{unquote(username)}:{unquote(password)}", encoding=self.auth_encoding
+ )
+ return base64.b64encode(user_pass)
+
+ def _get_proxy(self, url, orig_type):
+ proxy_type, user, password, hostport = _parse_proxy(url)
+ proxy_url = urlunparse((proxy_type or orig_type, hostport, "", "", "", ""))
+
+ if user:
+ creds = self._basic_auth_header(user, password)
+ else:
+ creds = None
+
+ return creds, proxy_url
+
+ def process_request(self, request, spider):
+ creds, proxy_url, scheme = None, None, None
+ if "proxy" in request.meta:
+ if request.meta["proxy"] is not None:
+ creds, proxy_url = self._get_proxy(request.meta["proxy"], "")
+ elif self.proxies:
+ parsed = urlparse_cached(request)
+ _scheme = parsed.scheme
+ if (
+ # 'no_proxy' is only supported by http schemes
+ _scheme not in ("http", "https")
+ or not proxy_bypass(parsed.hostname)
+ ) and _scheme in self.proxies:
+ scheme = _scheme
+ creds, proxy_url = self.proxies[scheme]
+
+ self._set_proxy_and_creds(request, proxy_url, creds, scheme)
+
+ def _set_proxy_and_creds(self, request, proxy_url, creds, scheme):
+ if scheme:
+ request.meta["_scheme_proxy"] = True
+ if proxy_url:
+ request.meta["proxy"] = proxy_url
+ elif request.meta.get("proxy") is not None:
+ request.meta["proxy"] = None
+ if creds:
+ request.headers[b"Proxy-Authorization"] = b"Basic " + creds
+ request.meta["_auth_proxy"] = proxy_url
+ elif "_auth_proxy" in request.meta:
+ if proxy_url != request.meta["_auth_proxy"]:
+ if b"Proxy-Authorization" in request.headers:
+ del request.headers[b"Proxy-Authorization"]
+ del request.meta["_auth_proxy"]
+ elif b"Proxy-Authorization" in request.headers:
+ if proxy_url:
+ request.meta["_auth_proxy"] = proxy_url
+ else:
+ del request.headers[b"Proxy-Authorization"]
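
HttpProxyMiddleware above honours a per-request request.meta["proxy"] and moves inline credentials into a Proxy-Authorization header. A sketch of that path (the proxy URL and credentials are illustrative):

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
from scrapy.http import Request

mw = HttpProxyMiddleware()
request = Request(
    "https://example.com",
    meta={"proxy": "http://user:pass@proxy.example:8080"},
)
mw.process_request(request, spider=None)
assert request.meta["proxy"] == "http://proxy.example:8080"           # credentials stripped
assert request.headers[b"Proxy-Authorization"].startswith(b"Basic ")  # moved into the header
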
diff --git a/scrapy/downloadermiddlewares/offsite.py b/scrapy/downloadermiddlewares/offsite.py
index d5ebe2f64..1e5026925 100644
--- a/scrapy/downloadermiddlewares/offsite.py
+++ b/scrapy/downloadermiddlewares/offsite.py
@@ -1,18 +1,77 @@
import logging
import re
import warnings
+
from scrapy import signals
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.httpobj import urlparse_cached
+
logger = logging.getLogger(__name__)
class OffsiteMiddleware:
+ @classmethod
+ def from_crawler(cls, crawler):
+ o = cls(crawler.stats)
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ crawler.signals.connect(o.request_scheduled, signal=signals.request_scheduled)
+ return o
def __init__(self, stats):
self.stats = stats
self.domains_seen = set()
+ def spider_opened(self, spider):
+ self.host_regex = self.get_host_regex(spider)
+
+ def request_scheduled(self, request, spider):
+ self.process_request(request, spider)
+
+ def process_request(self, request, spider):
+ if request.dont_filter or self.should_follow(request, spider):
+ return None
+ domain = urlparse_cached(request).hostname
+ if domain and domain not in self.domains_seen:
+ self.domains_seen.add(domain)
+ logger.debug(
+ "Filtered offsite request to %(domain)r: %(request)s",
+ {"domain": domain, "request": request},
+ extra={"spider": spider},
+ )
+ self.stats.inc_value("offsite/domains", spider=spider)
+ self.stats.inc_value("offsite/filtered", spider=spider)
+ raise IgnoreRequest
+
+ def should_follow(self, request, spider):
+ regex = self.host_regex
+ # hostname can be None for wrong urls (like javascript links)
+ host = urlparse_cached(request).hostname or ""
+ return bool(regex.search(host))
+
def get_host_regex(self, spider):
"""Override this method to implement a different offsite policy"""
- pass
+ allowed_domains = getattr(spider, "allowed_domains", None)
+ if not allowed_domains:
+ return re.compile("") # allow all by default
+ url_pattern = re.compile(r"^https?://.*$")
+ port_pattern = re.compile(r":\d+$")
+ domains = []
+ for domain in allowed_domains:
+ if domain is None:
+ continue
+ if url_pattern.match(domain):
+ message = (
+ "allowed_domains accepts only domains, not URLs. "
+ f"Ignoring URL entry {domain} in allowed_domains."
+ )
+ warnings.warn(message)
+ elif port_pattern.search(domain):
+ message = (
+ "allowed_domains accepts only domains without ports. "
+ f"Ignoring entry {domain} in allowed_domains."
+ )
+ warnings.warn(message)
+ else:
+ domains.append(re.escape(domain))
+ regex = rf'^(.*\.)?({"|".join(domains)})$'
+ return re.compile(regex)
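
get_host_regex() above compiles allowed_domains into a single anchored pattern that accepts subdomains while rejecting look-alike hosts. The same pattern, evaluated standalone with illustrative domains:

import re

domains = [re.escape(d) for d in ["example.com", "example.org"]]
host_regex = re.compile(rf'^(.*\.)?({"|".join(domains)})$')
assert host_regex.search("sub.example.com")           # subdomains are allowed
assert host_regex.search("example.org")
assert not host_regex.search("example.com.evil.net")  # suffix tricks are rejected
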
diff --git a/scrapy/downloadermiddlewares/redirect.py b/scrapy/downloadermiddlewares/redirect.py
index dce283f2f..63be1d0ca 100644
--- a/scrapy/downloadermiddlewares/redirect.py
+++ b/scrapy/downloadermiddlewares/redirect.py
@@ -1,21 +1,120 @@
import logging
from urllib.parse import urljoin, urlparse
+
from w3lib.url import safe_url_string
+
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import HtmlResponse
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.response import get_meta_refresh
+
logger = logging.getLogger(__name__)
+def _build_redirect_request(source_request, *, url, **kwargs):
+ redirect_request = source_request.replace(
+ url=url,
+ **kwargs,
+ cookies=None,
+ )
+ if "_scheme_proxy" in redirect_request.meta:
+ source_request_scheme = urlparse_cached(source_request).scheme
+ redirect_request_scheme = urlparse_cached(redirect_request).scheme
+ if source_request_scheme != redirect_request_scheme:
+ redirect_request.meta.pop("_scheme_proxy")
+ redirect_request.meta.pop("proxy", None)
+ redirect_request.meta.pop("_auth_proxy", None)
+ redirect_request.headers.pop(b"Proxy-Authorization", None)
+ has_cookie_header = "Cookie" in redirect_request.headers
+ has_authorization_header = "Authorization" in redirect_request.headers
+ if has_cookie_header or has_authorization_header:
+ default_ports = {"http": 80, "https": 443}
+
+ parsed_source_request = urlparse_cached(source_request)
+ source_scheme, source_host, source_port = (
+ parsed_source_request.scheme,
+ parsed_source_request.hostname,
+ parsed_source_request.port
+ or default_ports.get(parsed_source_request.scheme),
+ )
+
+ parsed_redirect_request = urlparse_cached(redirect_request)
+ redirect_scheme, redirect_host, redirect_port = (
+ parsed_redirect_request.scheme,
+ parsed_redirect_request.hostname,
+ parsed_redirect_request.port
+ or default_ports.get(parsed_redirect_request.scheme),
+ )
+
+ if has_cookie_header and (
+ (source_scheme != redirect_scheme and redirect_scheme != "https")
+ or source_host != redirect_host
+ ):
+ del redirect_request.headers["Cookie"]
+
+ # https://fetch.spec.whatwg.org/#ref-for-cors-non-wildcard-request-header-name
+ if has_authorization_header and (
+ source_scheme != redirect_scheme
+ or source_host != redirect_host
+ or source_port != redirect_port
+ ):
+ del redirect_request.headers["Authorization"]
+
+ return redirect_request
+
+
class BaseRedirectMiddleware:
- enabled_setting = 'REDIRECT_ENABLED'
+ enabled_setting = "REDIRECT_ENABLED"
def __init__(self, settings):
if not settings.getbool(self.enabled_setting):
raise NotConfigured
- self.max_redirect_times = settings.getint('REDIRECT_MAX_TIMES')
- self.priority_adjust = settings.getint('REDIRECT_PRIORITY_ADJUST')
+
+ self.max_redirect_times = settings.getint("REDIRECT_MAX_TIMES")
+ self.priority_adjust = settings.getint("REDIRECT_PRIORITY_ADJUST")
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler.settings)
+
+ def _redirect(self, redirected, request, spider, reason):
+ ttl = request.meta.setdefault("redirect_ttl", self.max_redirect_times)
+ redirects = request.meta.get("redirect_times", 0) + 1
+
+ if ttl and redirects <= self.max_redirect_times:
+ redirected.meta["redirect_times"] = redirects
+ redirected.meta["redirect_ttl"] = ttl - 1
+ redirected.meta["redirect_urls"] = request.meta.get("redirect_urls", []) + [
+ request.url
+ ]
+ redirected.meta["redirect_reasons"] = request.meta.get(
+ "redirect_reasons", []
+ ) + [reason]
+ redirected.dont_filter = request.dont_filter
+ redirected.priority = request.priority + self.priority_adjust
+ logger.debug(
+ "Redirecting (%(reason)s) to %(redirected)s from %(request)s",
+ {"reason": reason, "redirected": redirected, "request": request},
+ extra={"spider": spider},
+ )
+ return redirected
+ logger.debug(
+ "Discarding %(request)s: max redirections reached",
+ {"request": request},
+ extra={"spider": spider},
+ )
+ raise IgnoreRequest("max redirections reached")
+
+ def _redirect_request_using_get(self, request, redirect_url):
+ redirect_request = _build_redirect_request(
+ request,
+ url=redirect_url,
+ method="GET",
+ body="",
+ )
+ redirect_request.headers.pop("Content-Type", None)
+ redirect_request.headers.pop("Content-Length", None)
+ return redirect_request
class RedirectMiddleware(BaseRedirectMiddleware):
@@ -24,11 +123,59 @@ class RedirectMiddleware(BaseRedirectMiddleware):
and meta-refresh html tag.
"""
+ def process_response(self, request, response, spider):
+ if (
+ request.meta.get("dont_redirect", False)
+ or response.status in getattr(spider, "handle_httpstatus_list", [])
+ or response.status in request.meta.get("handle_httpstatus_list", [])
+ or request.meta.get("handle_httpstatus_all", False)
+ ):
+ return response
+
+ allowed_status = (301, 302, 303, 307, 308)
+ if "Location" not in response.headers or response.status not in allowed_status:
+ return response
+
+ location = safe_url_string(response.headers["Location"])
+ if response.headers["Location"].startswith(b"//"):
+ request_scheme = urlparse(request.url).scheme
+ location = request_scheme + "://" + location.lstrip("/")
+
+ redirected_url = urljoin(request.url, location)
+ if urlparse(redirected_url).scheme not in {"http", "https"}:
+ return response
+
+ if response.status in (301, 307, 308) or request.method == "HEAD":
+ redirected = _build_redirect_request(request, url=redirected_url)
+ return self._redirect(redirected, request, spider, response.status)
+
+ redirected = self._redirect_request_using_get(request, redirected_url)
+ return self._redirect(redirected, request, spider, response.status)
+
class MetaRefreshMiddleware(BaseRedirectMiddleware):
- enabled_setting = 'METAREFRESH_ENABLED'
+ enabled_setting = "METAREFRESH_ENABLED"
def __init__(self, settings):
super().__init__(settings)
- self._ignore_tags = settings.getlist('METAREFRESH_IGNORE_TAGS')
- self._maxdelay = settings.getint('METAREFRESH_MAXDELAY')
+ self._ignore_tags = settings.getlist("METAREFRESH_IGNORE_TAGS")
+ self._maxdelay = settings.getint("METAREFRESH_MAXDELAY")
+
+ def process_response(self, request, response, spider):
+ if (
+ request.meta.get("dont_redirect", False)
+ or request.method == "HEAD"
+ or not isinstance(response, HtmlResponse)
+ or urlparse_cached(request).scheme not in {"http", "https"}
+ ):
+ return response
+
+ interval, url = get_meta_refresh(response, ignore_tags=self._ignore_tags)
+ if not url:
+ return response
+ if urlparse(url).scheme not in {"http", "https"}:
+ return response
+ if interval < self._maxdelay:
+ redirected = self._redirect_request_using_get(request, url)
+ return self._redirect(redirected, request, spider, "meta refresh")
+ return response
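
_build_redirect_request() above drops the Cookie header when a redirect changes host (or leaves https) and drops Authorization on any cross-origin redirect. A sketch of the Authorization rule; the URLs are illustrative:

from scrapy.downloadermiddlewares.redirect import _build_redirect_request
from scrapy.http import Request

source = Request("https://example.com/a", headers={"Authorization": "Bearer token"})
same_origin = _build_redirect_request(source, url="https://example.com/b")
cross_origin = _build_redirect_request(source, url="https://other.example/b")
assert b"Authorization" in same_origin.headers       # same scheme/host/port: kept
assert b"Authorization" not in cross_origin.headers  # different host: stripped
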
diff --git a/scrapy/downloadermiddlewares/retry.py b/scrapy/downloadermiddlewares/retry.py
index 38df481a3..380623cea 100644
--- a/scrapy/downloadermiddlewares/retry.py
+++ b/scrapy/downloadermiddlewares/retry.py
@@ -12,6 +12,7 @@ once the spider has finished crawling all regular (non failed) pages.
import warnings
from logging import Logger, getLogger
from typing import Optional, Type, Union
+
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.http.request import Request
from scrapy.settings import Settings
@@ -19,17 +20,41 @@ from scrapy.spiders import Spider
from scrapy.utils.misc import load_object
from scrapy.utils.python import global_object_name
from scrapy.utils.response import response_status_message
+
retry_logger = getLogger(__name__)
+def backwards_compatibility_getattr(self, name):
+ if name == "EXCEPTIONS_TO_RETRY":
+ warnings.warn(
+ "Attribute RetryMiddleware.EXCEPTIONS_TO_RETRY is deprecated. "
+ "Use the RETRY_EXCEPTIONS setting instead.",
+ ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+ return tuple(
+ load_object(x) if isinstance(x, str) else x
+ for x in Settings().getlist("RETRY_EXCEPTIONS")
+ )
+ raise AttributeError(
+ f"{self.__class__.__name__!r} object has no attribute {name!r}"
+ )
+
+
class BackwardsCompatibilityMetaclass(type):
__getattr__ = backwards_compatibility_getattr
-def get_retry_request(request: Request, *, spider: Spider, reason: Union[
- str, Exception, Type[Exception]]='unspecified', max_retry_times:
- Optional[int]=None, priority_adjust: Optional[int]=None, logger: Logger
- =retry_logger, stats_base_key: str='retry'):
+def get_retry_request(
+ request: Request,
+ *,
+ spider: Spider,
+ reason: Union[str, Exception, Type[Exception]] = "unspecified",
+ max_retry_times: Optional[int] = None,
+ priority_adjust: Optional[int] = None,
+ logger: Logger = retry_logger,
+ stats_base_key: str = "retry",
+):
"""
Returns a new :class:`~scrapy.Request` object to retry the specified
request, or ``None`` if retries of the specified request have been
@@ -70,22 +95,90 @@ def get_retry_request(request: Request, *, spider: Spider, reason: Union[
*stats_base_key* is a string to be used as the base key for the
retry-related job stats
"""
- pass
+ settings = spider.crawler.settings
+ assert spider.crawler.stats
+ stats = spider.crawler.stats
+ retry_times = request.meta.get("retry_times", 0) + 1
+ if max_retry_times is None:
+ max_retry_times = request.meta.get("max_retry_times")
+ if max_retry_times is None:
+ max_retry_times = settings.getint("RETRY_TIMES")
+ if retry_times <= max_retry_times:
+ logger.debug(
+ "Retrying %(request)s (failed %(retry_times)d times): %(reason)s",
+ {"request": request, "retry_times": retry_times, "reason": reason},
+ extra={"spider": spider},
+ )
+ new_request: Request = request.copy()
+ new_request.meta["retry_times"] = retry_times
+ new_request.dont_filter = True
+ if priority_adjust is None:
+ priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
+ new_request.priority = request.priority + priority_adjust
+
+ if callable(reason):
+ reason = reason()
+ if isinstance(reason, Exception):
+ reason = global_object_name(reason.__class__)
+
+ stats.inc_value(f"{stats_base_key}/count")
+ stats.inc_value(f"{stats_base_key}/reason_count/{reason}")
+ return new_request
+ stats.inc_value(f"{stats_base_key}/max_reached")
+ logger.error(
+ "Gave up retrying %(request)s (failed %(retry_times)d times): " "%(reason)s",
+ {"request": request, "retry_times": retry_times, "reason": reason},
+ extra={"spider": spider},
+ )
+ return None
class RetryMiddleware(metaclass=BackwardsCompatibilityMetaclass):
-
def __init__(self, settings):
- if not settings.getbool('RETRY_ENABLED'):
+ if not settings.getbool("RETRY_ENABLED"):
raise NotConfigured
- self.max_retry_times = settings.getint('RETRY_TIMES')
- self.retry_http_codes = set(int(x) for x in settings.getlist(
- 'RETRY_HTTP_CODES'))
- self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')
+ self.max_retry_times = settings.getint("RETRY_TIMES")
+ self.retry_http_codes = set(
+ int(x) for x in settings.getlist("RETRY_HTTP_CODES")
+ )
+ self.priority_adjust = settings.getint("RETRY_PRIORITY_ADJUST")
+
try:
- self.exceptions_to_retry = self.__getattribute__(
- 'EXCEPTIONS_TO_RETRY')
+ self.exceptions_to_retry = self.__getattribute__("EXCEPTIONS_TO_RETRY")
except AttributeError:
- self.exceptions_to_retry = tuple(load_object(x) if isinstance(x,
- str) else x for x in settings.getlist('RETRY_EXCEPTIONS'))
+ # If EXCEPTIONS_TO_RETRY is not "overridden"
+ self.exceptions_to_retry = tuple(
+ load_object(x) if isinstance(x, str) else x
+ for x in settings.getlist("RETRY_EXCEPTIONS")
+ )
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler.settings)
+
+ def process_response(self, request, response, spider):
+ if request.meta.get("dont_retry", False):
+ return response
+ if response.status in self.retry_http_codes:
+ reason = response_status_message(response.status)
+ return self._retry(request, reason, spider) or response
+ return response
+
+ def process_exception(self, request, exception, spider):
+ if isinstance(exception, self.exceptions_to_retry) and not request.meta.get(
+ "dont_retry", False
+ ):
+ return self._retry(request, exception, spider)
+
+ def _retry(self, request, reason, spider):
+ max_retry_times = request.meta.get("max_retry_times", self.max_retry_times)
+ priority_adjust = request.meta.get("priority_adjust", self.priority_adjust)
+ return get_retry_request(
+ request,
+ reason=reason,
+ spider=spider,
+ max_retry_times=max_retry_times,
+ priority_adjust=priority_adjust,
+ )
+
__getattr__ = backwards_compatibility_getattr
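
Besides powering `RetryMiddleware._retry()`, `get_retry_request()` is a public helper that spider callbacks can call directly when a response is technically successful but unusable. A sketch of that documented usage (the spider name and the "empty" reason are illustrative):

```python
from scrapy import Spider
from scrapy.downloadermiddlewares.retry import get_retry_request


class RetryOnEmptySpider(Spider):
    name = "retry_on_empty"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        if not response.text.strip():
            # Returns a retried copy of the request, or None once
            # max_retry_times has been exhausted.
            retry = get_retry_request(response.request, spider=self, reason="empty")
            if retry:
                yield retry
            return
        yield {"url": response.url, "length": len(response.text)}
```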
diff --git a/scrapy/downloadermiddlewares/robotstxt.py b/scrapy/downloadermiddlewares/robotstxt.py
index e1699b10f..6cab27c5a 100644
--- a/scrapy/downloadermiddlewares/robotstxt.py
+++ b/scrapy/downloadermiddlewares/robotstxt.py
@@ -3,14 +3,18 @@ This is a middleware to respect robots.txt policies. To activate it you must
enable this middleware and enable the ROBOTSTXT_OBEY setting.
"""
+
import logging
+
from twisted.internet.defer import Deferred, maybeDeferred
+
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import load_object
+
logger = logging.getLogger(__name__)
@@ -18,13 +22,100 @@ class RobotsTxtMiddleware:
DOWNLOAD_PRIORITY = 1000
def __init__(self, crawler):
- if not crawler.settings.getbool('ROBOTSTXT_OBEY'):
+ if not crawler.settings.getbool("ROBOTSTXT_OBEY"):
raise NotConfigured
- self._default_useragent = crawler.settings.get('USER_AGENT', 'Scrapy')
- self._robotstxt_useragent = crawler.settings.get('ROBOTSTXT_USER_AGENT'
- , None)
+ self._default_useragent = crawler.settings.get("USER_AGENT", "Scrapy")
+ self._robotstxt_useragent = crawler.settings.get("ROBOTSTXT_USER_AGENT", None)
self.crawler = crawler
self._parsers = {}
- self._parserimpl = load_object(crawler.settings.get('ROBOTSTXT_PARSER')
+ self._parserimpl = load_object(crawler.settings.get("ROBOTSTXT_PARSER"))
+
+ # check if parser dependencies are met, this should throw an error otherwise.
+ self._parserimpl.from_crawler(self.crawler, b"")
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler)
+
+ def process_request(self, request, spider):
+ if request.meta.get("dont_obey_robotstxt"):
+ return
+ if request.url.startswith("data:") or request.url.startswith("file:"):
+ return
+ d = maybeDeferred(self.robot_parser, request, spider)
+ d.addCallback(self.process_request_2, request, spider)
+ return d
+
+ def process_request_2(self, rp, request, spider):
+ if rp is None:
+ return
+
+ useragent = self._robotstxt_useragent
+ if not useragent:
+ useragent = request.headers.get(b"User-Agent", self._default_useragent)
+ if not rp.allowed(request.url, useragent):
+ logger.debug(
+ "Forbidden by robots.txt: %(request)s",
+ {"request": request},
+ extra={"spider": spider},
)
- self._parserimpl.from_crawler(self.crawler, b'')
+ self.crawler.stats.inc_value("robotstxt/forbidden")
+ raise IgnoreRequest("Forbidden by robots.txt")
+
+ def robot_parser(self, request, spider):
+ url = urlparse_cached(request)
+ netloc = url.netloc
+
+ if netloc not in self._parsers:
+ self._parsers[netloc] = Deferred()
+ robotsurl = f"{url.scheme}://{url.netloc}/robots.txt"
+ robotsreq = Request(
+ robotsurl,
+ priority=self.DOWNLOAD_PRIORITY,
+ meta={"dont_obey_robotstxt": True},
+ callback=NO_CALLBACK,
+ )
+ dfd = self.crawler.engine.download(robotsreq)
+ dfd.addCallback(self._parse_robots, netloc, spider)
+ dfd.addErrback(self._logerror, robotsreq, spider)
+ dfd.addErrback(self._robots_error, netloc)
+ self.crawler.stats.inc_value("robotstxt/request_count")
+
+ if isinstance(self._parsers[netloc], Deferred):
+ d = Deferred()
+
+ def cb(result):
+ d.callback(result)
+ return result
+
+ self._parsers[netloc].addCallback(cb)
+ return d
+ return self._parsers[netloc]
+
+ def _logerror(self, failure, request, spider):
+ if failure.type is not IgnoreRequest:
+ logger.error(
+ "Error downloading %(request)s: %(f_exception)s",
+ {"request": request, "f_exception": failure.value},
+ exc_info=failure_to_exc_info(failure),
+ extra={"spider": spider},
+ )
+ return failure
+
+ def _parse_robots(self, response, netloc, spider):
+ self.crawler.stats.inc_value("robotstxt/response_count")
+ self.crawler.stats.inc_value(
+ f"robotstxt/response_status_count/{response.status}"
+ )
+ rp = self._parserimpl.from_crawler(self.crawler, response.body)
+ rp_dfd = self._parsers[netloc]
+ self._parsers[netloc] = rp
+ rp_dfd.callback(rp)
+
+ def _robots_error(self, failure, netloc):
+ if failure.type is not IgnoreRequest:
+ key = f"robotstxt/exception_count/{failure.type}"
+ self.crawler.stats.inc_value(key)
+ rp_dfd = self._parsers[netloc]
+ self._parsers[netloc] = None
+ rp_dfd.callback(None)
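
`robot_parser()` keeps one cache entry per netloc: the first request stores a placeholder `Deferred` and schedules the robots.txt download, concurrent requests chain onto that placeholder, and once parsing finishes the placeholder is swapped for the parser (or `None` on error). A simplified, Scrapy-independent sketch of that memoization pattern (class and method names are invented for illustration; error handling is omitted):

```python
from twisted.internet import defer


class OncePerKey:
    """Cache an asynchronous result per key; concurrent callers share one fetch."""

    def __init__(self, fetch):
        self._fetch = fetch  # callable(key) -> Deferred
        self._cache = {}

    def get(self, key):
        if key not in self._cache:
            self._cache[key] = defer.Deferred()  # placeholder for waiters
            self._fetch(key).addCallback(self._store, key)
        entry = self._cache[key]
        if isinstance(entry, defer.Deferred):
            waiter = defer.Deferred()
            entry.addCallback(lambda value: (waiter.callback(value), value)[1])
            return waiter
        return defer.succeed(entry)

    def _store(self, value, key):
        placeholder, self._cache[key] = self._cache[key], value
        placeholder.callback(value)  # release everyone waiting on this key
        return value
```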
diff --git a/scrapy/downloadermiddlewares/stats.py b/scrapy/downloadermiddlewares/stats.py
index 571687317..a0f62e262 100644
--- a/scrapy/downloadermiddlewares/stats.py
+++ b/scrapy/downloadermiddlewares/stats.py
@@ -1,10 +1,60 @@
from twisted.web import http
+
from scrapy.exceptions import NotConfigured
from scrapy.utils.python import global_object_name, to_bytes
from scrapy.utils.request import request_httprepr
-class DownloaderStats:
+def get_header_size(headers):
+ size = 0
+ for key, value in headers.items():
+ if isinstance(value, (list, tuple)):
+ for v in value:
+ size += len(b": ") + len(key) + len(v)
+ return size + len(b"\r\n") * (len(headers.keys()) - 1)
+
+
+def get_status_size(response_status):
+ return len(to_bytes(http.RESPONSES.get(response_status, b""))) + 15
+ # resp.status + b"\r\n" + b"HTTP/1.1 <100-599> "
+
+class DownloaderStats:
def __init__(self, stats):
self.stats = stats
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ if not crawler.settings.getbool("DOWNLOADER_STATS"):
+ raise NotConfigured
+ return cls(crawler.stats)
+
+ def process_request(self, request, spider):
+ self.stats.inc_value("downloader/request_count", spider=spider)
+ self.stats.inc_value(
+ f"downloader/request_method_count/{request.method}", spider=spider
+ )
+ reqlen = len(request_httprepr(request))
+ self.stats.inc_value("downloader/request_bytes", reqlen, spider=spider)
+
+ def process_response(self, request, response, spider):
+ self.stats.inc_value("downloader/response_count", spider=spider)
+ self.stats.inc_value(
+ f"downloader/response_status_count/{response.status}", spider=spider
+ )
+ reslen = (
+ len(response.body)
+ + get_header_size(response.headers)
+ + get_status_size(response.status)
+ + 4
+ )
+ # response.body + b"\r\n"+ response.header + b"\r\n" + response.status
+ self.stats.inc_value("downloader/response_bytes", reslen, spider=spider)
+ return response
+
+ def process_exception(self, request, exception, spider):
+ ex_class = global_object_name(exception.__class__)
+ self.stats.inc_value("downloader/exception_count", spider=spider)
+ self.stats.inc_value(
+ f"downloader/exception_type_count/{ex_class}", spider=spider
+ )
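
The two helpers estimate on-the-wire size from Scrapy-style headers (bytes keys mapped to lists of bytes values) without re-serializing the message. A small check of the arithmetic, assuming the patched module exposes `get_header_size`/`get_status_size` at module level as shown above:

```python
from scrapy.downloadermiddlewares.stats import get_header_size, get_status_size

headers = {
    b"Content-Type": [b"text/html; charset=utf-8"],
    b"Cache-Control": [b"no-cache"],
}
raw = b"\r\n".join(b"%s: %s" % (key, values[0]) for key, values in headers.items())
assert get_header_size(headers) == len(raw)                 # 63 bytes
assert get_status_size(200) == len(b"HTTP/1.1 200 OK\r\n")  # 17 bytes
```

Note that the header estimate adds one `\r\n` per distinct key, so it slightly undercounts when a single key carries several values.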
diff --git a/scrapy/downloadermiddlewares/useragent.py b/scrapy/downloadermiddlewares/useragent.py
index 5199b0472..856a275ab 100644
--- a/scrapy/downloadermiddlewares/useragent.py
+++ b/scrapy/downloadermiddlewares/useragent.py
@@ -1,9 +1,23 @@
"""Set User-Agent header per spider or use a default value from settings"""
+
from scrapy import signals
class UserAgentMiddleware:
"""This middleware allows spiders to override the user_agent"""
- def __init__(self, user_agent='Scrapy'):
+ def __init__(self, user_agent="Scrapy"):
self.user_agent = user_agent
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ o = cls(crawler.settings["USER_AGENT"])
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ return o
+
+ def spider_opened(self, spider):
+ self.user_agent = getattr(spider, "user_agent", self.user_agent)
+
+ def process_request(self, request, spider):
+ if self.user_agent:
+ request.headers.setdefault(b"User-Agent", self.user_agent)
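
Because `process_request` uses `setdefault`, a User-Agent header set explicitly on a request is left untouched; otherwise the spider-level `user_agent` attribute picked up in `spider_opened` overrides the `USER_AGENT` setting. An illustrative spider (name and UA string are placeholders):

```python
from scrapy import Spider


class PoliteSpider(Spider):
    name = "polite"
    # Picked up by UserAgentMiddleware.spider_opened and used as the default
    # User-Agent for every request this spider issues.
    user_agent = "polite-bot/1.0 (+https://example.com/bot)"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        yield {"url": response.url, "status": response.status}
```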
diff --git a/scrapy/dupefilters.py b/scrapy/dupefilters.py
index 684ffcbe6..0b20f53b9 100644
--- a/scrapy/dupefilters.py
+++ b/scrapy/dupefilters.py
@@ -1,23 +1,45 @@
from __future__ import annotations
+
import logging
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Set
from warnings import warn
+
from twisted.internet.defer import Deferred
+
from scrapy.http.request import Request
from scrapy.settings import BaseSettings
from scrapy.spiders import Spider
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.job import job_dir
-from scrapy.utils.request import RequestFingerprinter, RequestFingerprinterProtocol, referer_str
+from scrapy.utils.request import (
+ RequestFingerprinter,
+ RequestFingerprinterProtocol,
+ referer_str,
+)
+
if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
from typing_extensions import Self
+
from scrapy.crawler import Crawler
class BaseDupeFilter:
+ @classmethod
+ def from_settings(cls, settings: BaseSettings) -> Self:
+ return cls()
- def log(self, request: Request, spider: Spider) ->None:
+ def request_seen(self, request: Request) -> bool:
+ return False
+
+ def open(self) -> Optional[Deferred]:
+ pass
+
+ def close(self, reason: str) -> Optional[Deferred]:
+ pass
+
+ def log(self, request: Request, spider: Spider) -> None:
"""Log that a request has been filtered"""
pass
@@ -25,17 +47,96 @@ class BaseDupeFilter:
class RFPDupeFilter(BaseDupeFilter):
"""Request Fingerprint duplicates filter"""
- def __init__(self, path: Optional[str]=None, debug: bool=False, *,
- fingerprinter: Optional[RequestFingerprinterProtocol]=None) ->None:
+ def __init__(
+ self,
+ path: Optional[str] = None,
+ debug: bool = False,
+ *,
+ fingerprinter: Optional[RequestFingerprinterProtocol] = None,
+ ) -> None:
self.file = None
- self.fingerprinter: RequestFingerprinterProtocol = (fingerprinter or
- RequestFingerprinter())
+ self.fingerprinter: RequestFingerprinterProtocol = (
+ fingerprinter or RequestFingerprinter()
+ )
self.fingerprints: Set[str] = set()
self.logdupes = True
self.debug = debug
self.logger = logging.getLogger(__name__)
if path:
- self.file = Path(path, 'requests.seen').open('a+', encoding='utf-8'
- )
+ self.file = Path(path, "requests.seen").open("a+", encoding="utf-8")
self.file.seek(0)
self.fingerprints.update(x.rstrip() for x in self.file)
+
+ @classmethod
+ def from_settings(
+ cls,
+ settings: BaseSettings,
+ *,
+ fingerprinter: Optional[RequestFingerprinterProtocol] = None,
+ ) -> Self:
+ debug = settings.getbool("DUPEFILTER_DEBUG")
+ try:
+ return cls(job_dir(settings), debug, fingerprinter=fingerprinter)
+ except TypeError:
+ warn(
+ "RFPDupeFilter subclasses must either modify their '__init__' "
+ "method to support a 'fingerprinter' parameter or reimplement "
+ "the 'from_settings' class method.",
+ ScrapyDeprecationWarning,
+ )
+ result = cls(job_dir(settings), debug)
+ result.fingerprinter = fingerprinter or RequestFingerprinter()
+ return result
+
+ @classmethod
+ def from_crawler(cls, crawler: Crawler) -> Self:
+ assert crawler.request_fingerprinter
+ try:
+ return cls.from_settings(
+ crawler.settings,
+ fingerprinter=crawler.request_fingerprinter,
+ )
+ except TypeError:
+ warn(
+ "RFPDupeFilter subclasses must either modify their overridden "
+ "'__init__' method and 'from_settings' class method to "
+ "support a 'fingerprinter' parameter, or reimplement the "
+ "'from_crawler' class method.",
+ ScrapyDeprecationWarning,
+ )
+ result = cls.from_settings(crawler.settings)
+ result.fingerprinter = crawler.request_fingerprinter
+ return result
+
+ def request_seen(self, request: Request) -> bool:
+ fp = self.request_fingerprint(request)
+ if fp in self.fingerprints:
+ return True
+ self.fingerprints.add(fp)
+ if self.file:
+ self.file.write(fp + "\n")
+ return False
+
+ def request_fingerprint(self, request: Request) -> str:
+ return self.fingerprinter.fingerprint(request).hex()
+
+ def close(self, reason: str) -> None:
+ if self.file:
+ self.file.close()
+
+ def log(self, request: Request, spider: Spider) -> None:
+ if self.debug:
+ msg = "Filtered duplicate request: %(request)s (referer: %(referer)s)"
+ args = {"request": request, "referer": referer_str(request)}
+ self.logger.debug(msg, args, extra={"spider": spider})
+ elif self.logdupes:
+ msg = (
+ "Filtered duplicate request: %(request)s"
+ " - no more duplicates will be shown"
+ " (see DUPEFILTER_DEBUG to show all duplicates)"
+ )
+ self.logger.debug(msg, {"request": request}, extra={"spider": spider})
+ self.logdupes = False
+
+ assert spider.crawler.stats
+ spider.crawler.stats.inc_value("dupefilter/filtered", spider=spider)
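
`request_seen()` reduces each request to a hex fingerprint and checks it against an in-memory set (plus the `requests.seen` file when a job directory is configured). The same reduction can be reproduced outside a crawl with the default `RequestFingerprinter` the filter falls back to; a minimal sketch:

```python
from scrapy.http import Request
from scrapy.utils.request import RequestFingerprinter

fingerprinter = RequestFingerprinter()


def hex_fingerprint(request: Request) -> str:
    # Same reduction RFPDupeFilter.request_fingerprint() performs.
    return fingerprinter.fingerprint(request).hex()


seen = set()
for url in ["https://example.com/a", "https://example.com/a", "https://example.com/b"]:
    fp = hex_fingerprint(Request(url))
    print(url, "duplicate" if fp in seen else "new")
    seen.add(fp)
```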
diff --git a/scrapy/exceptions.py b/scrapy/exceptions.py
index 5dee6b7da..6d188c489 100644
--- a/scrapy/exceptions.py
+++ b/scrapy/exceptions.py
@@ -6,9 +6,12 @@ new exceptions here without documenting them there.
"""
from typing import Any
+# Internal
+
class NotConfigured(Exception):
"""Indicates a missing configuration situation"""
+
pass
@@ -17,22 +20,27 @@ class _InvalidOutput(TypeError):
Indicates an invalid value has been returned by a middleware's processing method.
Internal and undocumented, it should not be raised or caught by user code.
"""
+
pass
+# HTTP and crawling
+
+
class IgnoreRequest(Exception):
"""Indicates a decision was made not to process a request"""
class DontCloseSpider(Exception):
"""Request the spider not to be closed yet"""
+
pass
class CloseSpider(Exception):
"""Raise this from callbacks to request the spider to be closed"""
- def __init__(self, reason: str='cancelled'):
+ def __init__(self, reason: str = "cancelled"):
super().__init__()
self.reason = reason
@@ -44,26 +52,34 @@ class StopDownload(Exception):
should be handled by the request errback. Note that 'fail' is a keyword-only argument.
"""
- def __init__(self, *, fail: bool=True):
+ def __init__(self, *, fail: bool = True):
super().__init__()
self.fail = fail
+# Items
+
+
class DropItem(Exception):
"""Drop item from the item pipeline"""
+
pass
class NotSupported(Exception):
"""Indicates a feature or method is not supported"""
+
pass
+# Commands
+
+
class UsageError(Exception):
"""To indicate a command-line usage error"""
def __init__(self, *a: Any, **kw: Any):
- self.print_help = kw.pop('print_help', True)
+ self.print_help = kw.pop("print_help", True)
super().__init__(*a, **kw)
@@ -71,9 +87,11 @@ class ScrapyDeprecationWarning(Warning):
"""Warning category for deprecated features, since the default
DeprecationWarning is silenced on Python 2.7+
"""
+
pass
class ContractFail(AssertionError):
"""Error raised in case of a failing contract"""
+
pass
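
Of these, `CloseSpider` and `DropItem` are the ones meant to be raised from user code (callbacks and item pipelines respectively); the rest are mostly raised or handled by the framework. A hedged sketch of both, assuming dict items and an illustrative page budget that is not a built-in setting:

```python
from scrapy import Spider
from scrapy.exceptions import CloseSpider, DropItem


class BudgetSpider(Spider):
    name = "budget"
    start_urls = ["https://example.com/"]
    max_pages = 100  # illustrative budget, not a Scrapy setting

    def parse(self, response):
        if self.crawler.stats.get_value("response_received_count", 0) > self.max_pages:
            raise CloseSpider("page_budget_exhausted")
        yield {"url": response.url, "price": response.css("span.price::text").get()}


class RequirePricePipeline:
    def process_item(self, item, spider):
        if not item.get("price"):
            raise DropItem(f"missing price in {item!r}")
        return item
```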
diff --git a/scrapy/exporters.py b/scrapy/exporters.py
index d22653341..f85f1dad8 100644
--- a/scrapy/exporters.py
+++ b/scrapy/exporters.py
@@ -1,6 +1,7 @@
"""
Item Exporters are used to export/serialize items into different formats.
"""
+
import csv
import io
import marshal
@@ -8,17 +9,26 @@ import pickle
import pprint
from collections.abc import Mapping
from xml.sax.saxutils import XMLGenerator
+
from itemadapter import ItemAdapter, is_item
+
from scrapy.item import Item
from scrapy.utils.python import is_listlike, to_bytes, to_unicode
from scrapy.utils.serialize import ScrapyJSONEncoder
-__all__ = ['BaseItemExporter', 'PprintItemExporter', 'PickleItemExporter',
- 'CsvItemExporter', 'XmlItemExporter', 'JsonLinesItemExporter',
- 'JsonItemExporter', 'MarshalItemExporter']
+__all__ = [
+ "BaseItemExporter",
+ "PprintItemExporter",
+ "PickleItemExporter",
+ "CsvItemExporter",
+ "XmlItemExporter",
+ "JsonLinesItemExporter",
+ "JsonItemExporter",
+ "MarshalItemExporter",
+]
-class BaseItemExporter:
+class BaseItemExporter:
def __init__(self, *, dont_fail=False, **kwargs):
self._kwargs = kwargs
self._configure(kwargs, dont_fail=dont_fail)
@@ -28,72 +38,257 @@ class BaseItemExporter:
If dont_fail is set, it won't raise an exception on unexpected options
(useful for using with keyword arguments in subclasses ``__init__`` methods)
"""
+ self.encoding = options.pop("encoding", None)
+ self.fields_to_export = options.pop("fields_to_export", None)
+ self.export_empty_fields = options.pop("export_empty_fields", False)
+ self.indent = options.pop("indent", None)
+ if not dont_fail and options:
+ raise TypeError(f"Unexpected options: {', '.join(options.keys())}")
+
+ def export_item(self, item):
+ raise NotImplementedError
+
+ def serialize_field(self, field, name, value):
+ serializer = field.get("serializer", lambda x: x)
+ return serializer(value)
+
+ def start_exporting(self):
+ pass
+
+ def finish_exporting(self):
pass
- def _get_serialized_fields(self, item, default_value=None,
- include_empty=None):
+ def _get_serialized_fields(self, item, default_value=None, include_empty=None):
"""Return the fields to export as an iterable of tuples
(name, serialized_value)
"""
- pass
+ item = ItemAdapter(item)
+ if include_empty is None:
+ include_empty = self.export_empty_fields
-class JsonLinesItemExporter(BaseItemExporter):
+ if self.fields_to_export is None:
+ if include_empty:
+ field_iter = item.field_names()
+ else:
+ field_iter = item.keys()
+ elif isinstance(self.fields_to_export, Mapping):
+ if include_empty:
+ field_iter = self.fields_to_export.items()
+ else:
+ field_iter = (
+ (x, y) for x, y in self.fields_to_export.items() if x in item
+ )
+ else:
+ if include_empty:
+ field_iter = self.fields_to_export
+ else:
+ field_iter = (x for x in self.fields_to_export if x in item)
+
+ for field_name in field_iter:
+ if isinstance(field_name, str):
+ item_field, output_field = field_name, field_name
+ else:
+ item_field, output_field = field_name
+ if item_field in item:
+ field_meta = item.get_field_meta(item_field)
+ value = self.serialize_field(field_meta, output_field, item[item_field])
+ else:
+ value = default_value
+ yield output_field, value
+
+
+class JsonLinesItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
- self._kwargs.setdefault('ensure_ascii', not self.encoding)
+ self._kwargs.setdefault("ensure_ascii", not self.encoding)
self.encoder = ScrapyJSONEncoder(**self._kwargs)
+ def export_item(self, item):
+ itemdict = dict(self._get_serialized_fields(item))
+ data = self.encoder.encode(itemdict) + "\n"
+ self.file.write(to_bytes(data, self.encoding))
-class JsonItemExporter(BaseItemExporter):
+class JsonItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(dont_fail=True, **kwargs)
self.file = file
- json_indent = (self.indent if self.indent is not None and self.
- indent > 0 else None)
- self._kwargs.setdefault('indent', json_indent)
- self._kwargs.setdefault('ensure_ascii', not self.encoding)
+        # there is a small difference between the behaviour of JsonItemExporter.indent
+ # and ScrapyJSONEncoder.indent. ScrapyJSONEncoder.indent=None is needed to prevent
+ # the addition of newlines everywhere
+ json_indent = (
+ self.indent if self.indent is not None and self.indent > 0 else None
+ )
+ self._kwargs.setdefault("indent", json_indent)
+ self._kwargs.setdefault("ensure_ascii", not self.encoding)
self.encoder = ScrapyJSONEncoder(**self._kwargs)
self.first_item = True
+ def _beautify_newline(self):
+ if self.indent is not None:
+ self.file.write(b"\n")
-class XmlItemExporter(BaseItemExporter):
+ def _add_comma_after_first(self):
+ if self.first_item:
+ self.first_item = False
+ else:
+ self.file.write(b",")
+ self._beautify_newline()
+ def start_exporting(self):
+ self.file.write(b"[")
+ self._beautify_newline()
+
+ def finish_exporting(self):
+ self._beautify_newline()
+ self.file.write(b"]")
+
+ def export_item(self, item):
+ itemdict = dict(self._get_serialized_fields(item))
+ data = to_bytes(self.encoder.encode(itemdict), self.encoding)
+ self._add_comma_after_first()
+ self.file.write(data)
+
+
+class XmlItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
- self.item_element = kwargs.pop('item_element', 'item')
- self.root_element = kwargs.pop('root_element', 'items')
+ self.item_element = kwargs.pop("item_element", "item")
+ self.root_element = kwargs.pop("root_element", "items")
super().__init__(**kwargs)
if not self.encoding:
- self.encoding = 'utf-8'
+ self.encoding = "utf-8"
self.xg = XMLGenerator(file, encoding=self.encoding)
+ def _beautify_newline(self, new_item=False):
+ if self.indent is not None and (self.indent > 0 or new_item):
+ self.xg.characters("\n")
-class CsvItemExporter(BaseItemExporter):
+ def _beautify_indent(self, depth=1):
+ if self.indent:
+ self.xg.characters(" " * self.indent * depth)
- def __init__(self, file, include_headers_line=True, join_multivalued=
- ',', errors=None, **kwargs):
+ def start_exporting(self):
+ self.xg.startDocument()
+ self.xg.startElement(self.root_element, {})
+ self._beautify_newline(new_item=True)
+
+ def export_item(self, item):
+ self._beautify_indent(depth=1)
+ self.xg.startElement(self.item_element, {})
+ self._beautify_newline()
+ for name, value in self._get_serialized_fields(item, default_value=""):
+ self._export_xml_field(name, value, depth=2)
+ self._beautify_indent(depth=1)
+ self.xg.endElement(self.item_element)
+ self._beautify_newline(new_item=True)
+
+ def finish_exporting(self):
+ self.xg.endElement(self.root_element)
+ self.xg.endDocument()
+
+ def _export_xml_field(self, name, serialized_value, depth):
+ self._beautify_indent(depth=depth)
+ self.xg.startElement(name, {})
+ if hasattr(serialized_value, "items"):
+ self._beautify_newline()
+ for subname, value in serialized_value.items():
+ self._export_xml_field(subname, value, depth=depth + 1)
+ self._beautify_indent(depth=depth)
+ elif is_listlike(serialized_value):
+ self._beautify_newline()
+ for value in serialized_value:
+ self._export_xml_field("value", value, depth=depth + 1)
+ self._beautify_indent(depth=depth)
+ elif isinstance(serialized_value, str):
+ self.xg.characters(serialized_value)
+ else:
+ self.xg.characters(str(serialized_value))
+ self.xg.endElement(name)
+ self._beautify_newline()
+
+
+class CsvItemExporter(BaseItemExporter):
+ def __init__(
+ self,
+ file,
+ include_headers_line=True,
+ join_multivalued=",",
+ errors=None,
+ **kwargs,
+ ):
super().__init__(dont_fail=True, **kwargs)
if not self.encoding:
- self.encoding = 'utf-8'
+ self.encoding = "utf-8"
self.include_headers_line = include_headers_line
- self.stream = io.TextIOWrapper(file, line_buffering=False,
- write_through=True, encoding=self.encoding, newline='', errors=
- errors)
+ self.stream = io.TextIOWrapper(
+ file,
+ line_buffering=False,
+ write_through=True,
+ encoding=self.encoding,
+ newline="", # Windows needs this https://github.com/scrapy/scrapy/issues/3034
+ errors=errors,
+ )
self.csv_writer = csv.writer(self.stream, **self._kwargs)
self._headers_not_written = True
self._join_multivalued = join_multivalued
+ def serialize_field(self, field, name, value):
+ serializer = field.get("serializer", self._join_if_needed)
+ return serializer(value)
+
+ def _join_if_needed(self, value):
+ if isinstance(value, (list, tuple)):
+ try:
+ return self._join_multivalued.join(value)
+ except TypeError: # list in value may not contain strings
+ pass
+ return value
+
+ def export_item(self, item):
+ if self._headers_not_written:
+ self._headers_not_written = False
+ self._write_headers_and_set_fields_to_export(item)
+
+ fields = self._get_serialized_fields(item, default_value="", include_empty=True)
+ values = list(self._build_row(x for _, x in fields))
+ self.csv_writer.writerow(values)
+
+ def finish_exporting(self):
+ self.stream.detach() # Avoid closing the wrapped file.
+
+ def _build_row(self, values):
+ for s in values:
+ try:
+ yield to_unicode(s, self.encoding)
+ except TypeError:
+ yield s
+
+ def _write_headers_and_set_fields_to_export(self, item):
+ if self.include_headers_line:
+ if not self.fields_to_export:
+ # use declared field names, or keys if the item is a dict
+ self.fields_to_export = ItemAdapter(item).field_names()
+ if isinstance(self.fields_to_export, Mapping):
+ fields = self.fields_to_export.values()
+ else:
+ fields = self.fields_to_export
+ row = list(self._build_row(fields))
+ self.csv_writer.writerow(row)
-class PickleItemExporter(BaseItemExporter):
+class PickleItemExporter(BaseItemExporter):
def __init__(self, file, protocol=4, **kwargs):
super().__init__(**kwargs)
self.file = file
self.protocol = protocol
+ def export_item(self, item):
+ d = dict(self._get_serialized_fields(item))
+ pickle.dump(d, self.file, self.protocol)
+
class MarshalItemExporter(BaseItemExporter):
"""Exports items in a Python-specific binary format (see
@@ -108,13 +303,19 @@ class MarshalItemExporter(BaseItemExporter):
super().__init__(**kwargs)
self.file = file
+ def export_item(self, item):
+ marshal.dump(dict(self._get_serialized_fields(item)), self.file)
-class PprintItemExporter(BaseItemExporter):
+class PprintItemExporter(BaseItemExporter):
def __init__(self, file, **kwargs):
super().__init__(**kwargs)
self.file = file
+ def export_item(self, item):
+ itemdict = dict(self._get_serialized_fields(item))
+ self.file.write(to_bytes(pprint.pformat(itemdict) + "\n"))
+
class PythonItemExporter(BaseItemExporter):
"""This is a base class for item exporters that extends
@@ -125,3 +326,31 @@ class PythonItemExporter(BaseItemExporter):
.. _msgpack: https://pypi.org/project/msgpack/
"""
+
+ def _configure(self, options, dont_fail=False):
+ super()._configure(options, dont_fail)
+ if not self.encoding:
+ self.encoding = "utf-8"
+
+ def serialize_field(self, field, name, value):
+ serializer = field.get("serializer", self._serialize_value)
+ return serializer(value)
+
+ def _serialize_value(self, value):
+ if isinstance(value, Item):
+ return self.export_item(value)
+ if is_item(value):
+ return dict(self._serialize_item(value))
+ if is_listlike(value):
+ return [self._serialize_value(v) for v in value]
+ if isinstance(value, (str, bytes)):
+ return to_unicode(value, encoding=self.encoding)
+ return value
+
+ def _serialize_item(self, item):
+ for key, value in ItemAdapter(item).items():
+ yield key, self._serialize_value(value)
+
+ def export_item(self, item):
+ result = dict(self._get_serialized_fields(item))
+ return result
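
All exporters write to a binary file-like object and are driven through `start_exporting()` / `export_item()` / `finish_exporting()`, which is exactly what the feed export machinery does. A minimal direct use of two of them (file names and field names are arbitrary; dict items work because everything goes through `ItemAdapter`):

```python
from scrapy.exporters import CsvItemExporter, JsonItemExporter

items = [
    {"name": "widget", "price": "9.99"},
    {"name": "gadget", "price": "19.99"},
]

with open("items.csv", "wb") as f:
    exporter = CsvItemExporter(f, fields_to_export=["name", "price"])
    exporter.start_exporting()
    for item in items:
        exporter.export_item(item)
    exporter.finish_exporting()  # detaches the TextIOWrapper, leaves f open

with open("items.json", "wb") as f:
    exporter = JsonItemExporter(f, indent=2)
    exporter.start_exporting()
    for item in items:
        exporter.export_item(item)
    exporter.finish_exporting()  # writes the closing "]"
```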
diff --git a/scrapy/extension.py b/scrapy/extension.py
index 27464cc17..4e365cfa1 100644
--- a/scrapy/extension.py
+++ b/scrapy/extension.py
@@ -8,4 +8,8 @@ from scrapy.utils.conf import build_component_list
class ExtensionManager(MiddlewareManager):
- component_name = 'extension'
+ component_name = "extension"
+
+ @classmethod
+ def _get_mwlist_from_settings(cls, settings):
+ return build_component_list(settings.getwithbase("EXTENSIONS"))
diff --git a/scrapy/extensions/closespider.py b/scrapy/extensions/closespider.py
index a01d48a8e..4307b4170 100644
--- a/scrapy/extensions/closespider.py
+++ b/scrapy/extensions/closespider.py
@@ -3,44 +3,110 @@ conditions are met.
See documentation in docs/topics/extensions.rst
"""
+
import logging
from collections import defaultdict
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
+
logger = logging.getLogger(__name__)
class CloseSpider:
-
def __init__(self, crawler):
self.crawler = crawler
- self.close_on = {'timeout': crawler.settings.getfloat(
- 'CLOSESPIDER_TIMEOUT'), 'itemcount': crawler.settings.getint(
- 'CLOSESPIDER_ITEMCOUNT'), 'pagecount': crawler.settings.getint(
- 'CLOSESPIDER_PAGECOUNT'), 'errorcount': crawler.settings.getint
- ('CLOSESPIDER_ERRORCOUNT'), 'timeout_no_item': crawler.settings
- .getint('CLOSESPIDER_TIMEOUT_NO_ITEM')}
+
+ self.close_on = {
+ "timeout": crawler.settings.getfloat("CLOSESPIDER_TIMEOUT"),
+ "itemcount": crawler.settings.getint("CLOSESPIDER_ITEMCOUNT"),
+ "pagecount": crawler.settings.getint("CLOSESPIDER_PAGECOUNT"),
+ "errorcount": crawler.settings.getint("CLOSESPIDER_ERRORCOUNT"),
+ "timeout_no_item": crawler.settings.getint("CLOSESPIDER_TIMEOUT_NO_ITEM"),
+ }
+
if not any(self.close_on.values()):
raise NotConfigured
+
self.counter = defaultdict(int)
- if self.close_on.get('errorcount'):
- crawler.signals.connect(self.error_count, signal=signals.
- spider_error)
- if self.close_on.get('pagecount'):
- crawler.signals.connect(self.page_count, signal=signals.
- response_received)
- if self.close_on.get('timeout'):
- crawler.signals.connect(self.spider_opened, signal=signals.
- spider_opened)
- if self.close_on.get('itemcount'):
- crawler.signals.connect(self.item_scraped, signal=signals.
- item_scraped)
- if self.close_on.get('timeout_no_item'):
- self.timeout_no_item = self.close_on['timeout_no_item']
+
+ if self.close_on.get("errorcount"):
+ crawler.signals.connect(self.error_count, signal=signals.spider_error)
+ if self.close_on.get("pagecount"):
+ crawler.signals.connect(self.page_count, signal=signals.response_received)
+ if self.close_on.get("timeout"):
+ crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
+ if self.close_on.get("itemcount"):
+ crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
+ if self.close_on.get("timeout_no_item"):
+ self.timeout_no_item = self.close_on["timeout_no_item"]
+ self.items_in_period = 0
+ crawler.signals.connect(
+ self.spider_opened_no_item, signal=signals.spider_opened
+ )
+ crawler.signals.connect(
+ self.item_scraped_no_item, signal=signals.item_scraped
+ )
+ crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler)
+
+ def error_count(self, failure, response, spider):
+ self.counter["errorcount"] += 1
+ if self.counter["errorcount"] == self.close_on["errorcount"]:
+ self.crawler.engine.close_spider(spider, "closespider_errorcount")
+
+ def page_count(self, response, request, spider):
+ self.counter["pagecount"] += 1
+ if self.counter["pagecount"] == self.close_on["pagecount"]:
+ self.crawler.engine.close_spider(spider, "closespider_pagecount")
+
+ def spider_opened(self, spider):
+ from twisted.internet import reactor
+
+ self.task = reactor.callLater(
+ self.close_on["timeout"],
+ self.crawler.engine.close_spider,
+ spider,
+ reason="closespider_timeout",
+ )
+
+ def item_scraped(self, item, spider):
+ self.counter["itemcount"] += 1
+ if self.counter["itemcount"] == self.close_on["itemcount"]:
+ self.crawler.engine.close_spider(spider, "closespider_itemcount")
+
+ def spider_closed(self, spider):
+ task = getattr(self, "task", False)
+ if task and task.active():
+ task.cancel()
+
+ task_no_item = getattr(self, "task_no_item", False)
+ if task_no_item and task_no_item.running:
+ task_no_item.stop()
+
+ def spider_opened_no_item(self, spider):
+ from twisted.internet import task
+
+ self.task_no_item = task.LoopingCall(self._count_items_produced, spider)
+ self.task_no_item.start(self.timeout_no_item, now=False)
+
+ logger.info(
+ f"Spider will stop when no items are produced after "
+ f"{self.timeout_no_item} seconds."
+ )
+
+ def item_scraped_no_item(self, item, spider):
+ self.items_in_period += 1
+
+ def _count_items_produced(self, spider):
+ if self.items_in_period >= 1:
self.items_in_period = 0
- crawler.signals.connect(self.spider_opened_no_item, signal=
- signals.spider_opened)
- crawler.signals.connect(self.item_scraped_no_item, signal=
- signals.item_scraped)
- crawler.signals.connect(self.spider_closed, signal=signals.
- spider_closed)
+ else:
+ logger.info(
+ f"Closing spider since no items were produced in the last "
+ f"{self.timeout_no_item} seconds."
+ )
+ self.crawler.engine.close_spider(spider, "closespider_timeout_no_item")
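
Every trigger is settings-driven: any non-zero value wires up the corresponding signal handler in `__init__`. A per-spider configuration sketch (the numbers are illustrative); whichever limit is hit first closes the spider with the matching `closespider_*` finish reason:

```python
from scrapy import Spider


class BoundedSpider(Spider):
    name = "bounded"
    start_urls = ["https://example.com/"]
    custom_settings = {
        "CLOSESPIDER_TIMEOUT": 3600,         # hard stop after one hour
        "CLOSESPIDER_ITEMCOUNT": 1000,       # ...or after 1000 scraped items
        "CLOSESPIDER_ERRORCOUNT": 10,        # ...or after 10 spider errors
        "CLOSESPIDER_TIMEOUT_NO_ITEM": 300,  # ...or 5 minutes without any item
    }

    def parse(self, response):
        yield {"url": response.url}
```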
diff --git a/scrapy/extensions/corestats.py b/scrapy/extensions/corestats.py
index c8451087e..302a615f2 100644
--- a/scrapy/extensions/corestats.py
+++ b/scrapy/extensions/corestats.py
@@ -2,11 +2,46 @@
Extension for collecting core stats like items scraped and start/finish times
"""
from datetime import datetime, timezone
+
from scrapy import signals
class CoreStats:
-
def __init__(self, stats):
self.stats = stats
self.start_time = None
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ o = cls(crawler.stats)
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+ crawler.signals.connect(o.item_scraped, signal=signals.item_scraped)
+ crawler.signals.connect(o.item_dropped, signal=signals.item_dropped)
+ crawler.signals.connect(o.response_received, signal=signals.response_received)
+ return o
+
+ def spider_opened(self, spider):
+ self.start_time = datetime.now(tz=timezone.utc)
+ self.stats.set_value("start_time", self.start_time, spider=spider)
+
+ def spider_closed(self, spider, reason):
+ finish_time = datetime.now(tz=timezone.utc)
+ elapsed_time = finish_time - self.start_time
+ elapsed_time_seconds = elapsed_time.total_seconds()
+ self.stats.set_value(
+ "elapsed_time_seconds", elapsed_time_seconds, spider=spider
+ )
+ self.stats.set_value("finish_time", finish_time, spider=spider)
+ self.stats.set_value("finish_reason", reason, spider=spider)
+
+ def item_scraped(self, item, spider):
+ self.stats.inc_value("item_scraped_count", spider=spider)
+
+ def response_received(self, spider):
+ self.stats.inc_value("response_received_count", spider=spider)
+
+ def item_dropped(self, item, spider, exception):
+ reason = exception.__class__.__name__
+ self.stats.inc_value("item_dropped_count", spider=spider)
+ self.stats.inc_value(f"item_dropped_reasons_count/{reason}", spider=spider)
diff --git a/scrapy/extensions/debug.py b/scrapy/extensions/debug.py
index cac078bc7..1b6c7777f 100644
--- a/scrapy/extensions/debug.py
+++ b/scrapy/extensions/debug.py
@@ -3,32 +3,64 @@ Extensions for debugging Scrapy
See documentation in docs/topics/extensions.rst
"""
+
import logging
import signal
import sys
import threading
import traceback
from pdb import Pdb
+
from scrapy.utils.engine import format_engine_status
from scrapy.utils.trackref import format_live_refs
+
logger = logging.getLogger(__name__)
class StackTraceDump:
-
def __init__(self, crawler=None):
self.crawler = crawler
try:
signal.signal(signal.SIGUSR2, self.dump_stacktrace)
signal.signal(signal.SIGQUIT, self.dump_stacktrace)
except AttributeError:
+ # win32 platforms don't support SIGUSR signals
pass
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler)
-class Debugger:
+ def dump_stacktrace(self, signum, frame):
+ log_args = {
+ "stackdumps": self._thread_stacks(),
+ "enginestatus": format_engine_status(self.crawler.engine),
+ "liverefs": format_live_refs(),
+ }
+ logger.info(
+ "Dumping stack trace and engine status\n"
+ "%(enginestatus)s\n%(liverefs)s\n%(stackdumps)s",
+ log_args,
+ extra={"crawler": self.crawler},
+ )
+ def _thread_stacks(self):
+ id2name = dict((th.ident, th.name) for th in threading.enumerate())
+ dumps = ""
+ for id_, frame in sys._current_frames().items():
+ name = id2name.get(id_, "")
+ dump = "".join(traceback.format_stack(frame))
+ dumps += f"# Thread: {name}({id_})\n{dump}\n"
+ return dumps
+
+
+class Debugger:
def __init__(self):
try:
signal.signal(signal.SIGUSR2, self._enter_debugger)
except AttributeError:
+ # win32 platforms don't support SIGUSR signals
pass
+
+ def _enter_debugger(self, signum, frame):
+ Pdb().set_trace(frame.f_back)
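
`StackTraceDump` only installs its handlers on POSIX platforms, and it must be listed in the `EXTENSIONS` setting if the project does not already enable it. A hedged settings sketch (the order value `0` simply enables the extension):

```python
# settings.py (illustrative)
EXTENSIONS = {
    "scrapy.extensions.debug.StackTraceDump": 0,
}
# While a crawl runs, `kill -QUIT <pid>` (or `kill -USR2 <pid>`) makes the
# extension log the engine status, live references and a per-thread stack dump.
```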
diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py
index a30ae2dbd..4e846d1bd 100644
--- a/scrapy/extensions/feedexport.py
+++ b/scrapy/extensions/feedexport.py
@@ -3,6 +3,7 @@ Feed Exports extension
See documentation in docs/topics/feed-exports.rst
"""
+
import logging
import re
import sys
@@ -12,10 +13,12 @@ from pathlib import Path, PureWindowsPath
from tempfile import NamedTemporaryFile
from typing import IO, Any, Callable, Dict, List, Optional, Tuple, Union
from urllib.parse import unquote, urlparse
+
from twisted.internet import defer, threads
from twisted.internet.defer import DeferredList
from w3lib.url import file_uri_to_path
from zope.interface import Interface, implementer
+
from scrapy import Spider, signals
from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
from scrapy.extensions.postprocessing import PostProcessingManager
@@ -27,14 +30,32 @@ from scrapy.utils.ftp import ftp_store_file
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import create_instance, load_object
from scrapy.utils.python import get_func_args, without_none_values
+
logger = logging.getLogger(__name__)
+
try:
- import boto3
+ import boto3 # noqa: F401
+
IS_BOTO3_AVAILABLE = True
except ImportError:
IS_BOTO3_AVAILABLE = False
+def build_storage(builder, uri, *args, feed_options=None, preargs=(), **kwargs):
+ argument_names = get_func_args(builder)
+ if "feed_options" in argument_names:
+ kwargs["feed_options"] = feed_options
+ else:
+ warnings.warn(
+ f"{builder.__qualname__} does not support the 'feed_options' keyword argument. Add a "
+ "'feed_options' parameter to its signature to remove this "
+ "warning. This parameter will become mandatory in a future "
+ "version of Scrapy.",
+ category=ScrapyDeprecationWarning,
+ )
+ return builder(*preargs, uri, *args, **kwargs)
+
+
class ItemFilter:
"""
This will be used by FeedExporter to decide if an item should be allowed
@@ -43,18 +64,21 @@ class ItemFilter:
:param feed_options: feed specific options passed from FeedExporter
:type feed_options: dict
"""
+
feed_options: Optional[dict]
item_classes: Tuple
- def __init__(self, feed_options: Optional[dict]) ->None:
+ def __init__(self, feed_options: Optional[dict]) -> None:
self.feed_options = feed_options
if feed_options is not None:
- self.item_classes = tuple(load_object(item_class) for
- item_class in feed_options.get('item_classes') or ())
+ self.item_classes = tuple(
+ load_object(item_class)
+ for item_class in feed_options.get("item_classes") or ()
+ )
else:
self.item_classes = tuple()
- def accepts(self, item: Any) ->bool:
+ def accepts(self, item: Any) -> bool:
"""
Return ``True`` if `item` should be exported or ``False`` otherwise.
@@ -63,7 +87,9 @@ class ItemFilter:
:return: `True` if accepted, `False` otherwise
:rtype: bool
"""
- pass
+ if self.item_classes:
+ return isinstance(item, self.item_classes)
+ return True # accept all items by default
class IFeedStorage(Interface):
@@ -76,175 +102,437 @@ class IFeedStorage(Interface):
def open(spider):
"""Open the storage for the given spider. It must return a file-like
object that will be used for the exporters"""
- pass
def store(file):
"""Store the given file stream"""
- pass
@implementer(IFeedStorage)
class BlockingFeedStorage:
- pass
+ def open(self, spider):
+ path = spider.crawler.settings["FEED_TEMPDIR"]
+ if path and not Path(path).is_dir():
+ raise OSError("Not a Directory: " + str(path))
+
+ return NamedTemporaryFile(prefix="feed-", dir=path)
+
+ def store(self, file):
+ return threads.deferToThread(self._store_in_thread, file)
+
+ def _store_in_thread(self, file):
+ raise NotImplementedError
@implementer(IFeedStorage)
class StdoutFeedStorage:
-
def __init__(self, uri, _stdout=None, *, feed_options=None):
if not _stdout:
_stdout = sys.stdout.buffer
self._stdout = _stdout
- if feed_options and feed_options.get('overwrite', False) is True:
+ if feed_options and feed_options.get("overwrite", False) is True:
logger.warning(
- 'Standard output (stdout) storage does not support overwriting. To suppress this warning, remove the overwrite option from your FEEDS setting, or set it to False.'
- )
+ "Standard output (stdout) storage does not support "
+ "overwriting. To suppress this warning, remove the "
+ "overwrite option from your FEEDS setting, or set "
+ "it to False."
+ )
+
+ def open(self, spider):
+ return self._stdout
+
+ def store(self, file):
+ pass
@implementer(IFeedStorage)
class FileFeedStorage:
-
def __init__(self, uri, *, feed_options=None):
self.path = file_uri_to_path(uri)
feed_options = feed_options or {}
- self.write_mode = 'wb' if feed_options.get('overwrite', False
- ) else 'ab'
+ self.write_mode = "wb" if feed_options.get("overwrite", False) else "ab"
+ def open(self, spider) -> IO[Any]:
+ dirname = Path(self.path).parent
+ if dirname and not dirname.exists():
+ dirname.mkdir(parents=True)
+ return Path(self.path).open(self.write_mode)
-class S3FeedStorage(BlockingFeedStorage):
+ def store(self, file):
+ file.close()
- def __init__(self, uri, access_key=None, secret_key=None, acl=None,
- endpoint_url=None, *, feed_options=None, session_token=None,
- region_name=None):
+
+class S3FeedStorage(BlockingFeedStorage):
+ def __init__(
+ self,
+ uri,
+ access_key=None,
+ secret_key=None,
+ acl=None,
+ endpoint_url=None,
+ *,
+ feed_options=None,
+ session_token=None,
+ region_name=None,
+ ):
if not is_botocore_available():
- raise NotConfigured('missing botocore library')
+ raise NotConfigured("missing botocore library")
u = urlparse(uri)
self.bucketname = u.hostname
self.access_key = u.username or access_key
self.secret_key = u.password or secret_key
self.session_token = session_token
- self.keyname = u.path[1:]
+ self.keyname = u.path[1:] # remove first "/"
self.acl = acl
self.endpoint_url = endpoint_url
self.region_name = region_name
+
if IS_BOTO3_AVAILABLE:
import boto3.session
+
session = boto3.session.Session()
- self.s3_client = session.client('s3', aws_access_key_id=self.
- access_key, aws_secret_access_key=self.secret_key,
- aws_session_token=self.session_token, endpoint_url=self.
- endpoint_url, region_name=self.region_name)
+
+ self.s3_client = session.client(
+ "s3",
+ aws_access_key_id=self.access_key,
+ aws_secret_access_key=self.secret_key,
+ aws_session_token=self.session_token,
+ endpoint_url=self.endpoint_url,
+ region_name=self.region_name,
+ )
else:
warnings.warn(
- '`botocore` usage has been deprecated for S3 feed export, please use `boto3` to avoid problems'
- , category=ScrapyDeprecationWarning)
+ "`botocore` usage has been deprecated for S3 feed "
+ "export, please use `boto3` to avoid problems",
+ category=ScrapyDeprecationWarning,
+ )
+
import botocore.session
+
session = botocore.session.get_session()
- self.s3_client = session.create_client('s3', aws_access_key_id=
- self.access_key, aws_secret_access_key=self.secret_key,
- aws_session_token=self.session_token, endpoint_url=self.
- endpoint_url, region_name=self.region_name)
- if feed_options and feed_options.get('overwrite', True) is False:
+
+ self.s3_client = session.create_client(
+ "s3",
+ aws_access_key_id=self.access_key,
+ aws_secret_access_key=self.secret_key,
+ aws_session_token=self.session_token,
+ endpoint_url=self.endpoint_url,
+ region_name=self.region_name,
+ )
+
+ if feed_options and feed_options.get("overwrite", True) is False:
logger.warning(
- 'S3 does not support appending to files. To suppress this warning, remove the overwrite option from your FEEDS setting or set it to True.'
- )
+ "S3 does not support appending to files. To "
+ "suppress this warning, remove the overwrite "
+ "option from your FEEDS setting or set it to True."
+ )
+
+ @classmethod
+ def from_crawler(cls, crawler, uri, *, feed_options=None):
+ return build_storage(
+ cls,
+ uri,
+ access_key=crawler.settings["AWS_ACCESS_KEY_ID"],
+ secret_key=crawler.settings["AWS_SECRET_ACCESS_KEY"],
+ session_token=crawler.settings["AWS_SESSION_TOKEN"],
+ acl=crawler.settings["FEED_STORAGE_S3_ACL"] or None,
+ endpoint_url=crawler.settings["AWS_ENDPOINT_URL"] or None,
+ region_name=crawler.settings["AWS_REGION_NAME"] or None,
+ feed_options=feed_options,
+ )
+
+ def _store_in_thread(self, file):
+ file.seek(0)
+ if IS_BOTO3_AVAILABLE:
+ kwargs = {"ExtraArgs": {"ACL": self.acl}} if self.acl else {}
+ self.s3_client.upload_fileobj(
+ Bucket=self.bucketname, Key=self.keyname, Fileobj=file, **kwargs
+ )
+ else:
+ kwargs = {"ACL": self.acl} if self.acl else {}
+ self.s3_client.put_object(
+ Bucket=self.bucketname, Key=self.keyname, Body=file, **kwargs
+ )
+ file.close()
class GCSFeedStorage(BlockingFeedStorage):
-
def __init__(self, uri, project_id, acl):
self.project_id = project_id
self.acl = acl
u = urlparse(uri)
self.bucket_name = u.hostname
- self.blob_name = u.path[1:]
+ self.blob_name = u.path[1:] # remove first "/"
+ @classmethod
+ def from_crawler(cls, crawler, uri):
+ return cls(
+ uri,
+ crawler.settings["GCS_PROJECT_ID"],
+ crawler.settings["FEED_STORAGE_GCS_ACL"] or None,
+ )
-class FTPFeedStorage(BlockingFeedStorage):
+ def _store_in_thread(self, file):
+ file.seek(0)
+ from google.cloud.storage import Client
- def __init__(self, uri: str, use_active_mode: bool=False, *,
- feed_options: Optional[Dict[str, Any]]=None):
+ client = Client(project=self.project_id)
+ bucket = client.get_bucket(self.bucket_name)
+ blob = bucket.blob(self.blob_name)
+ blob.upload_from_file(file, predefined_acl=self.acl)
+
+
+class FTPFeedStorage(BlockingFeedStorage):
+ def __init__(
+ self,
+ uri: str,
+ use_active_mode: bool = False,
+ *,
+ feed_options: Optional[Dict[str, Any]] = None,
+ ):
u = urlparse(uri)
if not u.hostname:
- raise ValueError(f'Got a storage URI without a hostname: {uri}')
+ raise ValueError(f"Got a storage URI without a hostname: {uri}")
self.host: str = u.hostname
- self.port: int = int(u.port or '21')
- self.username: str = u.username or ''
- self.password: str = unquote(u.password or '')
+ self.port: int = int(u.port or "21")
+ self.username: str = u.username or ""
+ self.password: str = unquote(u.password or "")
self.path: str = u.path
self.use_active_mode: bool = use_active_mode
- self.overwrite: bool = not feed_options or feed_options.get('overwrite'
- , True)
+ self.overwrite: bool = not feed_options or feed_options.get("overwrite", True)
+
+ @classmethod
+ def from_crawler(cls, crawler, uri, *, feed_options=None):
+ return build_storage(
+ cls,
+ uri,
+ crawler.settings.getbool("FEED_STORAGE_FTP_ACTIVE"),
+ feed_options=feed_options,
+ )
+
+ def _store_in_thread(self, file):
+ ftp_store_file(
+ path=self.path,
+ file=file,
+ host=self.host,
+ port=self.port,
+ username=self.username,
+ password=self.password,
+ use_active_mode=self.use_active_mode,
+ overwrite=self.overwrite,
+ )
class FeedSlot:
-
- def __init__(self, storage, uri, format, store_empty, batch_id,
- uri_template, filter, feed_options, spider, exporters, settings,
- crawler):
+ def __init__(
+ self,
+ storage,
+ uri,
+ format,
+ store_empty,
+ batch_id,
+ uri_template,
+ filter,
+ feed_options,
+ spider,
+ exporters,
+ settings,
+ crawler,
+ ):
self.file = None
self.exporter = None
self.storage = storage
+ # feed params
self.batch_id = batch_id
self.format = format
self.store_empty = store_empty
self.uri_template = uri_template
self.uri = uri
self.filter = filter
+ # exporter params
self.feed_options = feed_options
self.spider = spider
self.exporters = exporters
self.settings = settings
self.crawler = crawler
+ # flags
self.itemcount = 0
self._exporting = False
self._fileloaded = False
+ def start_exporting(self):
+ if not self._fileloaded:
+ self.file = self.storage.open(self.spider)
+ if "postprocessing" in self.feed_options:
+ self.file = PostProcessingManager(
+ self.feed_options["postprocessing"], self.file, self.feed_options
+ )
+ self.exporter = self._get_exporter(
+ file=self.file,
+ format=self.feed_options["format"],
+ fields_to_export=self.feed_options["fields"],
+ encoding=self.feed_options["encoding"],
+ indent=self.feed_options["indent"],
+ **self.feed_options["item_export_kwargs"],
+ )
+ self._fileloaded = True
+
+ if not self._exporting:
+ self.exporter.start_exporting()
+ self._exporting = True
+
+ def _get_instance(self, objcls, *args, **kwargs):
+ return create_instance(objcls, self.settings, self.crawler, *args, **kwargs)
+
+ def _get_exporter(self, file, format, *args, **kwargs):
+ return self._get_instance(self.exporters[format], file, *args, **kwargs)
-_FeedSlot = create_deprecated_class(name='_FeedSlot', new_class=FeedSlot)
+ def finish_exporting(self):
+ if self._exporting:
+ self.exporter.finish_exporting()
+ self._exporting = False
+
+
+_FeedSlot = create_deprecated_class(
+ name="_FeedSlot",
+ new_class=FeedSlot,
+)
class FeedExporter:
_pending_deferreds: List[defer.Deferred] = []
+ @classmethod
+ def from_crawler(cls, crawler):
+ exporter = cls(crawler)
+ crawler.signals.connect(exporter.open_spider, signals.spider_opened)
+ crawler.signals.connect(exporter.close_spider, signals.spider_closed)
+ crawler.signals.connect(exporter.item_scraped, signals.item_scraped)
+ return exporter
+
def __init__(self, crawler):
self.crawler = crawler
self.settings = crawler.settings
self.feeds = {}
self.slots = []
self.filters = {}
- if not self.settings['FEEDS'] and not self.settings['FEED_URI']:
+
+ if not self.settings["FEEDS"] and not self.settings["FEED_URI"]:
raise NotConfigured
- if self.settings['FEED_URI']:
+
+ # Begin: Backward compatibility for FEED_URI and FEED_FORMAT settings
+ if self.settings["FEED_URI"]:
warnings.warn(
- 'The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of the `FEEDS` setting. Please see the `FEEDS` setting docs for more details'
- , category=ScrapyDeprecationWarning, stacklevel=2)
- uri = self.settings['FEED_URI']
- uri = str(uri) if not isinstance(uri, Path) else uri.absolute(
- ).as_uri()
- feed_options = {'format': self.settings.get('FEED_FORMAT',
- 'jsonlines')}
+ "The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of "
+ "the `FEEDS` setting. Please see the `FEEDS` setting docs for more details",
+ category=ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+ uri = self.settings["FEED_URI"]
+ # handle pathlib.Path objects
+ uri = str(uri) if not isinstance(uri, Path) else uri.absolute().as_uri()
+ feed_options = {"format": self.settings.get("FEED_FORMAT", "jsonlines")}
self.feeds[uri] = feed_complete_default_values_from_settings(
- feed_options, self.settings)
+ feed_options, self.settings
+ )
self.filters[uri] = self._load_filter(feed_options)
- for uri, feed_options in self.settings.getdict('FEEDS').items():
- uri = str(uri) if not isinstance(uri, Path) else uri.absolute(
- ).as_uri()
+ # End: Backward compatibility for FEED_URI and FEED_FORMAT settings
+
+ # 'FEEDS' setting takes precedence over 'FEED_URI'
+ for uri, feed_options in self.settings.getdict("FEEDS").items():
+ # handle pathlib.Path objects
+ uri = str(uri) if not isinstance(uri, Path) else uri.absolute().as_uri()
self.feeds[uri] = feed_complete_default_values_from_settings(
- feed_options, self.settings)
+ feed_options, self.settings
+ )
self.filters[uri] = self._load_filter(feed_options)
- self.storages = self._load_components('FEED_STORAGES')
- self.exporters = self._load_components('FEED_EXPORTERS')
+
+ self.storages = self._load_components("FEED_STORAGES")
+ self.exporters = self._load_components("FEED_EXPORTERS")
for uri, feed_options in self.feeds.items():
if not self._storage_supported(uri, feed_options):
raise NotConfigured
if not self._settings_are_valid():
raise NotConfigured
- if not self._exporter_supported(feed_options['format']):
+ if not self._exporter_supported(feed_options["format"]):
raise NotConfigured
- def _start_new_batch(self, batch_id, uri, feed_options, spider,
- uri_template):
+ def open_spider(self, spider):
+ for uri, feed_options in self.feeds.items():
+ uri_params = self._get_uri_params(spider, feed_options["uri_params"])
+ self.slots.append(
+ self._start_new_batch(
+ batch_id=1,
+ uri=uri % uri_params,
+ feed_options=feed_options,
+ spider=spider,
+ uri_template=uri,
+ )
+ )
+
+ async def close_spider(self, spider):
+ for slot in self.slots:
+ self._close_slot(slot, spider)
+
+ # Await all deferreds
+ if self._pending_deferreds:
+ await maybe_deferred_to_future(DeferredList(self._pending_deferreds))
+
+ # Send FEED_EXPORTER_CLOSED signal
+ await maybe_deferred_to_future(
+ self.crawler.signals.send_catch_log_deferred(signals.feed_exporter_closed)
+ )
+
+ def _close_slot(self, slot, spider):
+ def get_file(slot_):
+ if isinstance(slot_.file, PostProcessingManager):
+ slot_.file.close()
+ return slot_.file.file
+ return slot_.file
+
+ if slot.itemcount:
+ # Normal case
+ slot.finish_exporting()
+ elif slot.store_empty and slot.batch_id == 1:
+ # Need to store the empty file
+ slot.start_exporting()
+ slot.finish_exporting()
+ else:
+ # In this case, the file is not stored, so no processing is required.
+ return None
+
+ logmsg = f"{slot.format} feed ({slot.itemcount} items) in: {slot.uri}"
+ d = defer.maybeDeferred(slot.storage.store, get_file(slot))
+
+ d.addCallback(
+ self._handle_store_success, logmsg, spider, type(slot.storage).__name__
+ )
+ d.addErrback(
+ self._handle_store_error, logmsg, spider, type(slot.storage).__name__
+ )
+ self._pending_deferreds.append(d)
+ d.addCallback(
+ lambda _: self.crawler.signals.send_catch_log_deferred(
+ signals.feed_slot_closed, slot=slot
+ )
+ )
+ d.addBoth(lambda _: self._pending_deferreds.remove(d))
+
+ return d
+
+ def _handle_store_error(self, f, logmsg, spider, slot_type):
+ logger.error(
+ "Error storing %s",
+ logmsg,
+ exc_info=failure_to_exc_info(f),
+ extra={"spider": spider},
+ )
+ self.crawler.stats.inc_value(f"feedexport/failed_count/{slot_type}")
+
+ def _handle_store_success(self, f, logmsg, spider, slot_type):
+ logger.info("Stored %s", logmsg, extra={"spider": spider})
+ self.crawler.stats.inc_value(f"feedexport/success_count/{slot_type}")
+
+ def _start_new_batch(self, batch_id, uri, feed_options, spider, uri_template):
"""
Redirect the output data stream to a new file.
Execute multiple times if FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified
@@ -254,14 +542,103 @@ class FeedExporter:
:param spider: user spider
:param uri_template: template of uri which contains %(batch_time)s or %(batch_id)d to create new uri
"""
- pass
+ storage = self._get_storage(uri, feed_options)
+ slot = FeedSlot(
+ storage=storage,
+ uri=uri,
+ format=feed_options["format"],
+ store_empty=feed_options["store_empty"],
+ batch_id=batch_id,
+ uri_template=uri_template,
+ filter=self.filters[uri_template],
+ feed_options=feed_options,
+ spider=spider,
+ exporters=self.exporters,
+ settings=self.settings,
+ crawler=getattr(self, "crawler", None),
+ )
+ return slot
+
+ def item_scraped(self, item, spider):
+ slots = []
+ for slot in self.slots:
+ if not slot.filter.accepts(item):
+ slots.append(
+ slot
+ ) # if slot doesn't accept item, continue with next slot
+ continue
+
+ slot.start_exporting()
+ slot.exporter.export_item(item)
+ slot.itemcount += 1
+ # create new slot for each slot with itemcount == FEED_EXPORT_BATCH_ITEM_COUNT and close the old one
+ if (
+ self.feeds[slot.uri_template]["batch_item_count"]
+ and slot.itemcount >= self.feeds[slot.uri_template]["batch_item_count"]
+ ):
+ uri_params = self._get_uri_params(
+ spider, self.feeds[slot.uri_template]["uri_params"], slot
+ )
+ self._close_slot(slot, spider)
+ slots.append(
+ self._start_new_batch(
+ batch_id=slot.batch_id + 1,
+ uri=slot.uri_template % uri_params,
+ feed_options=self.feeds[slot.uri_template],
+ spider=spider,
+ uri_template=slot.uri_template,
+ )
+ )
+ else:
+ slots.append(slot)
+ self.slots = slots
+
+ def _load_components(self, setting_prefix):
+ conf = without_none_values(self.settings.getwithbase(setting_prefix))
+ d = {}
+ for k, v in conf.items():
+ try:
+ d[k] = load_object(v)
+ except NotConfigured:
+ pass
+ return d
+
+ def _exporter_supported(self, format):
+ if format in self.exporters:
+ return True
+ logger.error("Unknown feed format: %(format)s", {"format": format})
def _settings_are_valid(self):
"""
If FEED_EXPORT_BATCH_ITEM_COUNT setting or FEEDS.batch_item_count is specified uri has to contain
%(batch_time)s or %(batch_id)d to distinguish different files of partial output
"""
- pass
+ for uri_template, values in self.feeds.items():
+ if values["batch_item_count"] and not re.search(
+ r"%\(batch_time\)s|%\(batch_id\)", uri_template
+ ):
+ logger.error(
+ "%%(batch_time)s or %%(batch_id)d must be in the feed URI (%s) if FEED_EXPORT_BATCH_ITEM_COUNT "
+ "setting or FEEDS.batch_item_count is specified and greater than 0. For more info see: "
+ "https://docs.scrapy.org/en/latest/topics/feed-exports.html#feed-export-batch-item-count",
+ uri_template,
+ )
+ return False
+ return True
+
+ def _storage_supported(self, uri, feed_options):
+ scheme = urlparse(uri).scheme
+ if scheme in self.storages or PureWindowsPath(uri).drive:
+ try:
+ self._get_storage(uri, feed_options)
+ return True
+ except NotConfigured as e:
+ logger.error(
+ "Disabled feed storage scheme: %(scheme)s. " "Reason: %(reason)s",
+ {"scheme": scheme, "reason": str(e)},
+ )
+ else:
+ logger.error("Unknown feed storage scheme: %(scheme)s", {"scheme": scheme})
def _get_storage(self, uri, feed_options):
"""Fork of create_instance specific to feed storage classes
@@ -269,4 +646,49 @@ class FeedExporter:
It supports not passing the *feed_options* parameters to classes that
do not support it, and issuing a deprecation warning instead.
"""
- pass
+ feedcls = self.storages.get(urlparse(uri).scheme, self.storages["file"])
+ crawler = getattr(self, "crawler", None)
+
+ def build_instance(builder, *preargs):
+ return build_storage(
+ builder, uri, feed_options=feed_options, preargs=preargs
+ )
+
+ if crawler and hasattr(feedcls, "from_crawler"):
+ instance = build_instance(feedcls.from_crawler, crawler)
+ method_name = "from_crawler"
+ elif hasattr(feedcls, "from_settings"):
+ instance = build_instance(feedcls.from_settings, self.settings)
+ method_name = "from_settings"
+ else:
+ instance = build_instance(feedcls)
+ method_name = "__new__"
+ if instance is None:
+ raise TypeError(f"{feedcls.__qualname__}.{method_name} returned None")
+ return instance
+
+ def _get_uri_params(
+ self,
+ spider: Spider,
+ uri_params_function: Optional[Union[str, Callable[[dict, Spider], dict]]],
+ slot: Optional[FeedSlot] = None,
+ ) -> dict:
+ params = {}
+ for k in dir(spider):
+ params[k] = getattr(spider, k)
+ utc_now = datetime.now(tz=timezone.utc)
+ params["time"] = utc_now.replace(microsecond=0).isoformat().replace(":", "-")
+ params["batch_time"] = utc_now.isoformat().replace(":", "-")
+ params["batch_id"] = slot.batch_id + 1 if slot is not None else 1
+ uripar_function = (
+ load_object(uri_params_function)
+ if uri_params_function
+ else lambda params, _: params
+ )
+ new_params = uripar_function(params, spider)
+ return new_params if new_params is not None else params
+
+ def _load_filter(self, feed_options):
+ # load the item filter if declared else load the default filter class
+ item_filter_class = load_object(feed_options.get("item_filter", ItemFilter))
+ return item_filter_class(feed_options)
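
Note on the batching logic restored above: every new batch substitutes %(batch_id)d and %(batch_time)s into the configured feed URI. A minimal sketch of that substitution (the params dict mirrors what _get_uri_params builds; the values are illustrative):

from datetime import datetime, timezone

utc_now = datetime.now(tz=timezone.utc)
params = {
    "batch_id": 2,  # previous slot's batch_id + 1
    "batch_time": utc_now.isoformat().replace(":", "-"),
    "time": utc_now.replace(microsecond=0).isoformat().replace(":", "-"),
}

# A FEEDS-style URI template; _settings_are_valid() requires one of the
# placeholders whenever batch_item_count is set.
uri_template = "items-%(batch_id)d-%(batch_time)s.json"
print(uri_template % params)
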
diff --git a/scrapy/extensions/httpcache.py b/scrapy/extensions/httpcache.py
index b70d7ec39..7e4f047a8 100644
--- a/scrapy/extensions/httpcache.py
+++ b/scrapy/extensions/httpcache.py
@@ -6,7 +6,9 @@ from importlib import import_module
from pathlib import Path
from time import time
from weakref import WeakKeyDictionary
+
from w3lib.http import headers_dict_to_raw, headers_raw_to_dict
+
from scrapy.http import Headers, Response
from scrapy.http.request import Request
from scrapy.responsetypes import responsetypes
@@ -14,52 +16,343 @@ from scrapy.spiders import Spider
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.project import data_path
from scrapy.utils.python import to_bytes, to_unicode
+
logger = logging.getLogger(__name__)
class DummyPolicy:
-
def __init__(self, settings):
- self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
- self.ignore_http_codes = [int(x) for x in settings.getlist(
- 'HTTPCACHE_IGNORE_HTTP_CODES')]
+ self.ignore_schemes = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
+ self.ignore_http_codes = [
+ int(x) for x in settings.getlist("HTTPCACHE_IGNORE_HTTP_CODES")
+ ]
+
+ def should_cache_request(self, request):
+ return urlparse_cached(request).scheme not in self.ignore_schemes
+
+ def should_cache_response(self, response, request):
+ return response.status not in self.ignore_http_codes
+
+ def is_cached_response_fresh(self, cachedresponse, request):
+ return True
+
+ def is_cached_response_valid(self, cachedresponse, response, request):
+ return True
class RFC2616Policy:
- MAXAGE = 3600 * 24 * 365
+ MAXAGE = 3600 * 24 * 365 # one year
def __init__(self, settings):
- self.always_store = settings.getbool('HTTPCACHE_ALWAYS_STORE')
- self.ignore_schemes = settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
+ self.always_store = settings.getbool("HTTPCACHE_ALWAYS_STORE")
+ self.ignore_schemes = settings.getlist("HTTPCACHE_IGNORE_SCHEMES")
self._cc_parsed = WeakKeyDictionary()
- self.ignore_response_cache_controls = [to_bytes(cc) for cc in
- settings.getlist('HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS')]
+ self.ignore_response_cache_controls = [
+ to_bytes(cc)
+ for cc in settings.getlist("HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS")
+ ]
+ def _parse_cachecontrol(self, r):
+ if r not in self._cc_parsed:
+ cch = r.headers.get(b"Cache-Control", b"")
+ parsed = parse_cachecontrol(cch)
+ if isinstance(r, Response):
+ for key in self.ignore_response_cache_controls:
+ parsed.pop(key, None)
+ self._cc_parsed[r] = parsed
+ return self._cc_parsed[r]
-class DbmCacheStorage:
+ def should_cache_request(self, request):
+ if urlparse_cached(request).scheme in self.ignore_schemes:
+ return False
+ cc = self._parse_cachecontrol(request)
+ # obey user-agent directive "Cache-Control: no-store"
+ if b"no-store" in cc:
+ return False
+ # Any other is eligible for caching
+ return True
+
+ def should_cache_response(self, response, request):
+ # What is cacheable - https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.9.1
+ # Response cacheability - https://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.4
+ # Status code 206 is not included because cache can not deal with partial contents
+ cc = self._parse_cachecontrol(response)
+ # obey directive "Cache-Control: no-store"
+ if b"no-store" in cc:
+ return False
+ # Never cache 304 (Not Modified) responses
+ if response.status == 304:
+ return False
+ # Cache unconditionally if configured to do so
+ if self.always_store:
+ return True
+ # Any hint on response expiration is good
+ if b"max-age" in cc or b"Expires" in response.headers:
+ return True
+        # Firefox falls back to a one-year expiration for these statuses if none is set
+ if response.status in (300, 301, 308):
+ return True
+        # Other statuses without expiration info require at least one validator
+ if response.status in (200, 203, 401):
+ return b"Last-Modified" in response.headers or b"ETag" in response.headers
+        # Anything else is probably not eligible for caching:
+        # it makes no sense to cache responses that do not contain expiration
+        # info and cannot be revalidated
+ return False
+
+ def is_cached_response_fresh(self, cachedresponse, request):
+ cc = self._parse_cachecontrol(cachedresponse)
+ ccreq = self._parse_cachecontrol(request)
+ if b"no-cache" in cc or b"no-cache" in ccreq:
+ return False
+
+ now = time()
+ freshnesslifetime = self._compute_freshness_lifetime(
+ cachedresponse, request, now
+ )
+ currentage = self._compute_current_age(cachedresponse, request, now)
+
+ reqmaxage = self._get_max_age(ccreq)
+ if reqmaxage is not None:
+ freshnesslifetime = min(freshnesslifetime, reqmaxage)
+
+ if currentage < freshnesslifetime:
+ return True
+
+ if b"max-stale" in ccreq and b"must-revalidate" not in cc:
+ # From RFC2616: "Indicates that the client is willing to
+ # accept a response that has exceeded its expiration time.
+ # If max-stale is assigned a value, then the client is
+ # willing to accept a response that has exceeded its
+ # expiration time by no more than the specified number of
+ # seconds. If no value is assigned to max-stale, then the
+ # client is willing to accept a stale response of any age."
+ staleage = ccreq[b"max-stale"]
+ if staleage is None:
+ return True
+
+ try:
+ if currentage < freshnesslifetime + max(0, int(staleage)):
+ return True
+ except ValueError:
+ pass
+
+ # Cached response is stale, try to set validators if any
+ self._set_conditional_validators(request, cachedresponse)
+ return False
+
+ def is_cached_response_valid(self, cachedresponse, response, request):
+ # Use the cached response if the new response is a server error,
+ # as long as the old response didn't specify must-revalidate.
+ if response.status >= 500:
+ cc = self._parse_cachecontrol(cachedresponse)
+ if b"must-revalidate" not in cc:
+ return True
+
+ # Use the cached response if the server says it hasn't changed.
+ return response.status == 304
+
+ def _set_conditional_validators(self, request, cachedresponse):
+ if b"Last-Modified" in cachedresponse.headers:
+ request.headers[b"If-Modified-Since"] = cachedresponse.headers[
+ b"Last-Modified"
+ ]
+
+ if b"ETag" in cachedresponse.headers:
+ request.headers[b"If-None-Match"] = cachedresponse.headers[b"ETag"]
+
+ def _get_max_age(self, cc):
+ try:
+ return max(0, int(cc[b"max-age"]))
+ except (KeyError, ValueError):
+ return None
+
+ def _compute_freshness_lifetime(self, response, request, now):
+ # Reference nsHttpResponseHead::ComputeFreshnessLifetime
+ # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#706
+ cc = self._parse_cachecontrol(response)
+ maxage = self._get_max_age(cc)
+ if maxage is not None:
+ return maxage
+
+ # Parse date header or synthesize it if none exists
+ date = rfc1123_to_epoch(response.headers.get(b"Date")) or now
+
+ # Try HTTP/1.0 Expires header
+ if b"Expires" in response.headers:
+ expires = rfc1123_to_epoch(response.headers[b"Expires"])
+ # When parsing Expires header fails RFC 2616 section 14.21 says we
+ # should treat this as an expiration time in the past.
+ return max(0, expires - date) if expires else 0
+
+        # Fall back to a heuristic based on the Last-Modified header.
+        # This is not in the RFC but mirrors Firefox's caching implementation
+ lastmodified = rfc1123_to_epoch(response.headers.get(b"Last-Modified"))
+ if lastmodified and lastmodified <= date:
+ return (date - lastmodified) / 10
+
+ # This request can be cached indefinitely
+ if response.status in (300, 301, 308):
+ return self.MAXAGE
+ # Insufficient information to compute freshness lifetime
+ return 0
+
+ def _compute_current_age(self, response, request, now):
+ # Reference nsHttpResponseHead::ComputeCurrentAge
+ # https://dxr.mozilla.org/mozilla-central/source/netwerk/protocol/http/nsHttpResponseHead.cpp#658
+ currentage = 0
+ # If Date header is not set we assume it is a fast connection, and
+ # clock is in sync with the server
+ date = rfc1123_to_epoch(response.headers.get(b"Date")) or now
+ if now > date:
+ currentage = now - date
+
+ if b"Age" in response.headers:
+ try:
+ age = int(response.headers[b"Age"])
+ currentage = max(currentage, age)
+ except ValueError:
+ pass
+
+ return currentage
+
+
+class DbmCacheStorage:
def __init__(self, settings):
- self.cachedir = data_path(settings['HTTPCACHE_DIR'], createdir=True)
- self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
- self.dbmodule = import_module(settings['HTTPCACHE_DBM_MODULE'])
+ self.cachedir = data_path(settings["HTTPCACHE_DIR"], createdir=True)
+ self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
+ self.dbmodule = import_module(settings["HTTPCACHE_DBM_MODULE"])
self.db = None
+ def open_spider(self, spider: Spider):
+ dbpath = Path(self.cachedir, f"{spider.name}.db")
+ self.db = self.dbmodule.open(str(dbpath), "c")
-class FilesystemCacheStorage:
+ logger.debug(
+ "Using DBM cache storage in %(cachepath)s",
+ {"cachepath": dbpath},
+ extra={"spider": spider},
+ )
+
+ self._fingerprinter = spider.crawler.request_fingerprinter
+
+ def close_spider(self, spider):
+ self.db.close()
+
+ def retrieve_response(self, spider, request):
+ data = self._read_data(spider, request)
+ if data is None:
+ return # not cached
+ url = data["url"]
+ status = data["status"]
+ headers = Headers(data["headers"])
+ body = data["body"]
+ respcls = responsetypes.from_args(headers=headers, url=url, body=body)
+ response = respcls(url=url, headers=headers, status=status, body=body)
+ return response
+
+ def store_response(self, spider, request, response):
+ key = self._fingerprinter.fingerprint(request).hex()
+ data = {
+ "status": response.status,
+ "url": response.url,
+ "headers": dict(response.headers),
+ "body": response.body,
+ }
+ self.db[f"{key}_data"] = pickle.dumps(data, protocol=4)
+ self.db[f"{key}_time"] = str(time())
+
+ def _read_data(self, spider, request):
+ key = self._fingerprinter.fingerprint(request).hex()
+ db = self.db
+ tkey = f"{key}_time"
+ if tkey not in db:
+ return # not found
+
+ ts = db[tkey]
+ if 0 < self.expiration_secs < time() - float(ts):
+ return # expired
+ return pickle.loads(db[f"{key}_data"])
+
+
+class FilesystemCacheStorage:
def __init__(self, settings):
- self.cachedir = data_path(settings['HTTPCACHE_DIR'])
- self.expiration_secs = settings.getint('HTTPCACHE_EXPIRATION_SECS')
- self.use_gzip = settings.getbool('HTTPCACHE_GZIP')
+ self.cachedir = data_path(settings["HTTPCACHE_DIR"])
+ self.expiration_secs = settings.getint("HTTPCACHE_EXPIRATION_SECS")
+ self.use_gzip = settings.getbool("HTTPCACHE_GZIP")
self._open = gzip.open if self.use_gzip else open
+ def open_spider(self, spider: Spider):
+ logger.debug(
+ "Using filesystem cache storage in %(cachedir)s",
+ {"cachedir": self.cachedir},
+ extra={"spider": spider},
+ )
+
+ assert spider.crawler.request_fingerprinter
+ self._fingerprinter = spider.crawler.request_fingerprinter
+
+ def close_spider(self, spider):
+ pass
+
def retrieve_response(self, spider: Spider, request: Request):
"""Return response if present in cache, or None otherwise."""
- pass
+ metadata = self._read_meta(spider, request)
+ if metadata is None:
+ return # not cached
+ rpath = Path(self._get_request_path(spider, request))
+ with self._open(rpath / "response_body", "rb") as f:
+ body = f.read()
+ with self._open(rpath / "response_headers", "rb") as f:
+ rawheaders = f.read()
+ url = metadata.get("response_url")
+ status = metadata["status"]
+ headers = Headers(headers_raw_to_dict(rawheaders))
+ respcls = responsetypes.from_args(headers=headers, url=url, body=body)
+ response = respcls(url=url, headers=headers, status=status, body=body)
+ return response
def store_response(self, spider: Spider, request: Request, response):
"""Store the given response in the cache."""
- pass
+ rpath = Path(self._get_request_path(spider, request))
+ if not rpath.exists():
+ rpath.mkdir(parents=True)
+ metadata = {
+ "url": request.url,
+ "method": request.method,
+ "status": response.status,
+ "response_url": response.url,
+ "timestamp": time(),
+ }
+ with self._open(rpath / "meta", "wb") as f:
+ f.write(to_bytes(repr(metadata)))
+ with self._open(rpath / "pickled_meta", "wb") as f:
+ pickle.dump(metadata, f, protocol=4)
+ with self._open(rpath / "response_headers", "wb") as f:
+ f.write(headers_dict_to_raw(response.headers))
+ with self._open(rpath / "response_body", "wb") as f:
+ f.write(response.body)
+ with self._open(rpath / "request_headers", "wb") as f:
+ f.write(headers_dict_to_raw(request.headers))
+ with self._open(rpath / "request_body", "wb") as f:
+ f.write(request.body)
+
+ def _get_request_path(self, spider: Spider, request: Request) -> str:
+ key = self._fingerprinter.fingerprint(request).hex()
+ return str(Path(self.cachedir, spider.name, key[0:2], key))
+
+ def _read_meta(self, spider: Spider, request: Request):
+ rpath = Path(self._get_request_path(spider, request))
+ metapath = rpath / "pickled_meta"
+ if not metapath.exists():
+ return # not found
+ mtime = metapath.stat().st_mtime
+ if 0 < self.expiration_secs < time() - mtime:
+ return # expired
+ with self._open(metapath, "rb") as f:
+ return pickle.load(f)
def parse_cachecontrol(header):
@@ -74,4 +367,17 @@ def parse_cachecontrol(header):
True
"""
- pass
+ directives = {}
+ for directive in header.split(b","):
+ key, sep, val = directive.strip().partition(b"=")
+ if key:
+ directives[key.lower()] = val if sep else None
+ return directives
+
+
+def rfc1123_to_epoch(date_str):
+ try:
+ date_str = to_unicode(date_str, encoding="ascii")
+ return mktime_tz(parsedate_tz(date_str))
+ except Exception:
+ return None
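
For reference, the restored parse_cachecontrol splits the header on commas, lowercases directive names, and keeps a value only when an "=" sign is present; a quick check behaves like this (a small sketch, assuming the module path above):

from scrapy.extensions.httpcache import parse_cachecontrol

cc = parse_cachecontrol(b"public, max-age=3600, no-cache")
assert cc == {b"public": None, b"max-age": b"3600", b"no-cache": None}

# RFC2616Policy reads freshness hints from this dict, e.g.:
assert int(cc[b"max-age"]) == 3600  # seconds
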
diff --git a/scrapy/extensions/logstats.py b/scrapy/extensions/logstats.py
index e2864b322..78874a6db 100644
--- a/scrapy/extensions/logstats.py
+++ b/scrapy/extensions/logstats.py
@@ -1,7 +1,10 @@
import logging
+
from twisted.internet import task
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
+
logger = logging.getLogger(__name__)
@@ -13,3 +16,43 @@ class LogStats:
self.interval = interval
self.multiplier = 60.0 / self.interval
self.task = None
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ interval = crawler.settings.getfloat("LOGSTATS_INTERVAL")
+ if not interval:
+ raise NotConfigured
+ o = cls(crawler.stats, interval)
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+ return o
+
+ def spider_opened(self, spider):
+ self.pagesprev = 0
+ self.itemsprev = 0
+
+ self.task = task.LoopingCall(self.log, spider)
+ self.task.start(self.interval)
+
+ def log(self, spider):
+ items = self.stats.get_value("item_scraped_count", 0)
+ pages = self.stats.get_value("response_received_count", 0)
+ irate = (items - self.itemsprev) * self.multiplier
+ prate = (pages - self.pagesprev) * self.multiplier
+ self.pagesprev, self.itemsprev = pages, items
+
+ msg = (
+ "Crawled %(pages)d pages (at %(pagerate)d pages/min), "
+ "scraped %(items)d items (at %(itemrate)d items/min)"
+ )
+ log_args = {
+ "pages": pages,
+ "pagerate": prate,
+ "items": items,
+ "itemrate": irate,
+ }
+ logger.info(msg, log_args, extra={"spider": spider})
+
+ def spider_closed(self, spider, reason):
+ if self.task and self.task.running:
+ self.task.stop()
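
The per-minute rates logged by LogStats are just counter deltas scaled by 60.0 / interval; a small arithmetic sketch with made-up counter values:

interval = 15.0               # LOGSTATS_INTERVAL in seconds (example value)
multiplier = 60.0 / interval  # 4.0

pages_prev, pages = 100, 130
items_prev, items = 40, 52

page_rate = (pages - pages_prev) * multiplier  # 120 pages/min
item_rate = (items - items_prev) * multiplier  # 48 items/min
print(f"Crawled at {page_rate:.0f} pages/min, scraped at {item_rate:.0f} items/min")
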
diff --git a/scrapy/extensions/memdebug.py b/scrapy/extensions/memdebug.py
index 8eb617202..03ede0681 100644
--- a/scrapy/extensions/memdebug.py
+++ b/scrapy/extensions/memdebug.py
@@ -3,13 +3,34 @@ MemoryDebugger extension
See documentation in docs/topics/extensions.rst
"""
+
import gc
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.trackref import live_refs
class MemoryDebugger:
-
def __init__(self, stats):
self.stats = stats
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ if not crawler.settings.getbool("MEMDEBUG_ENABLED"):
+ raise NotConfigured
+ o = cls(crawler.stats)
+ crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+ return o
+
+ def spider_closed(self, spider, reason):
+ gc.collect()
+ self.stats.set_value(
+ "memdebug/gc_garbage_count", len(gc.garbage), spider=spider
+ )
+ for cls, wdict in live_refs.items():
+ if not wdict:
+ continue
+ self.stats.set_value(
+ f"memdebug/live_refs/{cls.__name__}", len(wdict), spider=spider
+ )
diff --git a/scrapy/extensions/memusage.py b/scrapy/extensions/memusage.py
index 81fb5c242..ca766c938 100644
--- a/scrapy/extensions/memusage.py
+++ b/scrapy/extensions/memusage.py
@@ -8,37 +8,134 @@ import socket
import sys
from importlib import import_module
from pprint import pformat
+
from twisted.internet import task
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
from scrapy.utils.engine import get_engine_status
+
logger = logging.getLogger(__name__)
class MemoryUsage:
-
def __init__(self, crawler):
- if not crawler.settings.getbool('MEMUSAGE_ENABLED'):
+ if not crawler.settings.getbool("MEMUSAGE_ENABLED"):
raise NotConfigured
try:
- self.resource = import_module('resource')
+ # stdlib's resource module is only available on unix platforms.
+ self.resource = import_module("resource")
except ImportError:
raise NotConfigured
+
self.crawler = crawler
self.warned = False
- self.notify_mails = crawler.settings.getlist('MEMUSAGE_NOTIFY_MAIL')
- self.limit = crawler.settings.getint('MEMUSAGE_LIMIT_MB') * 1024 * 1024
- self.warning = crawler.settings.getint('MEMUSAGE_WARNING_MB'
- ) * 1024 * 1024
+ self.notify_mails = crawler.settings.getlist("MEMUSAGE_NOTIFY_MAIL")
+ self.limit = crawler.settings.getint("MEMUSAGE_LIMIT_MB") * 1024 * 1024
+ self.warning = crawler.settings.getint("MEMUSAGE_WARNING_MB") * 1024 * 1024
self.check_interval = crawler.settings.getfloat(
- 'MEMUSAGE_CHECK_INTERVAL_SECONDS')
+ "MEMUSAGE_CHECK_INTERVAL_SECONDS"
+ )
self.mail = MailSender.from_settings(crawler.settings)
- crawler.signals.connect(self.engine_started, signal=signals.
- engine_started)
- crawler.signals.connect(self.engine_stopped, signal=signals.
- engine_stopped)
+ crawler.signals.connect(self.engine_started, signal=signals.engine_started)
+ crawler.signals.connect(self.engine_stopped, signal=signals.engine_stopped)
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler)
+
+ def get_virtual_size(self):
+ size = self.resource.getrusage(self.resource.RUSAGE_SELF).ru_maxrss
+ if sys.platform != "darwin":
+ # on macOS ru_maxrss is in bytes, on Linux it is in KB
+ size *= 1024
+ return size
+
+ def engine_started(self):
+ self.crawler.stats.set_value("memusage/startup", self.get_virtual_size())
+ self.tasks = []
+ tsk = task.LoopingCall(self.update)
+ self.tasks.append(tsk)
+ tsk.start(self.check_interval, now=True)
+ if self.limit:
+ tsk = task.LoopingCall(self._check_limit)
+ self.tasks.append(tsk)
+ tsk.start(self.check_interval, now=True)
+ if self.warning:
+ tsk = task.LoopingCall(self._check_warning)
+ self.tasks.append(tsk)
+ tsk.start(self.check_interval, now=True)
+
+ def engine_stopped(self):
+ for tsk in self.tasks:
+ if tsk.running:
+ tsk.stop()
+
+ def update(self):
+ self.crawler.stats.max_value("memusage/max", self.get_virtual_size())
+
+ def _check_limit(self):
+ peak_mem_usage = self.get_virtual_size()
+ if peak_mem_usage > self.limit:
+ self.crawler.stats.set_value("memusage/limit_reached", 1)
+ mem = self.limit / 1024 / 1024
+ logger.error(
+ "Memory usage exceeded %(memusage)dMiB. Shutting down Scrapy...",
+ {"memusage": mem},
+ extra={"crawler": self.crawler},
+ )
+ if self.notify_mails:
+ subj = (
+ f"{self.crawler.settings['BOT_NAME']} terminated: "
+ f"memory usage exceeded {mem}MiB at {socket.gethostname()}"
+ )
+ self._send_report(self.notify_mails, subj)
+ self.crawler.stats.set_value("memusage/limit_notified", 1)
+
+ if self.crawler.engine.spider is not None:
+ self.crawler.engine.close_spider(
+ self.crawler.engine.spider, "memusage_exceeded"
+ )
+ else:
+ self.crawler.stop()
+ else:
+ logger.info(
+ "Peak memory usage is %(virtualsize)dMiB",
+ {"virtualsize": peak_mem_usage / 1024 / 1024},
+ )
+
+ def _check_warning(self):
+ if self.warned: # warn only once
+ return
+ if self.get_virtual_size() > self.warning:
+ self.crawler.stats.set_value("memusage/warning_reached", 1)
+ mem = self.warning / 1024 / 1024
+ logger.warning(
+ "Memory usage reached %(memusage)dMiB",
+ {"memusage": mem},
+ extra={"crawler": self.crawler},
+ )
+ if self.notify_mails:
+ subj = (
+ f"{self.crawler.settings['BOT_NAME']} warning: "
+ f"memory usage reached {mem}MiB at {socket.gethostname()}"
+ )
+ self._send_report(self.notify_mails, subj)
+ self.crawler.stats.set_value("memusage/warning_notified", 1)
+ self.warned = True
def _send_report(self, rcpts, subject):
"""send notification mail with some additional useful info"""
- pass
+ stats = self.crawler.stats
+ s = f"Memory usage at engine startup : {stats.get_value('memusage/startup') / 1024 / 1024}M\r\n"
+ s += f"Maximum memory usage : {stats.get_value('memusage/max') / 1024 / 1024}M\r\n"
+ s += f"Current memory usage : {self.get_virtual_size() / 1024 / 1024}M\r\n"
+
+ s += (
+ "ENGINE STATUS ------------------------------------------------------- \r\n"
+ )
+ s += "\r\n"
+ s += pformat(get_engine_status(self.crawler.engine))
+ s += "\r\n"
+ self.mail.send(rcpts, subject, s)
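
The memory figures above come from resource.getrusage, whose ru_maxrss field is in kilobytes on Linux but bytes on macOS; a standalone sketch of the same conversion (POSIX only, since the resource module is unavailable on Windows):

import resource
import sys

def peak_memory_bytes() -> int:
    # ru_maxrss is reported in KiB on Linux and in bytes on macOS
    size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    if sys.platform != "darwin":
        size *= 1024
    return size

print(f"peak memory: {peak_memory_bytes() / 1024 / 1024:.1f} MiB")
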
diff --git a/scrapy/extensions/periodic_log.py b/scrapy/extensions/periodic_log.py
index 6703689b9..2d557f123 100644
--- a/scrapy/extensions/periodic_log.py
+++ b/scrapy/extensions/periodic_log.py
@@ -1,26 +1,140 @@
import logging
from datetime import datetime, timezone
+
from twisted.internet import task
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.serialize import ScrapyJSONEncoder
+
logger = logging.getLogger(__name__)
class PeriodicLog:
"""Log basic scraping stats periodically"""
- def __init__(self, stats, interval=60.0, ext_stats={}, ext_delta={},
- ext_timing_enabled=False):
+ def __init__(
+ self,
+ stats,
+ interval=60.0,
+ ext_stats={},
+ ext_delta={},
+ ext_timing_enabled=False,
+ ):
self.stats = stats
self.interval = interval
self.multiplier = 60.0 / self.interval
self.task = None
self.encoder = ScrapyJSONEncoder(sort_keys=True, indent=4)
self.ext_stats_enabled = bool(ext_stats)
- self.ext_stats_include = ext_stats.get('include', [])
- self.ext_stats_exclude = ext_stats.get('exclude', [])
+ self.ext_stats_include = ext_stats.get("include", [])
+ self.ext_stats_exclude = ext_stats.get("exclude", [])
self.ext_delta_enabled = bool(ext_delta)
- self.ext_delta_include = ext_delta.get('include', [])
- self.ext_delta_exclude = ext_delta.get('exclude', [])
+ self.ext_delta_include = ext_delta.get("include", [])
+ self.ext_delta_exclude = ext_delta.get("exclude", [])
self.ext_timing_enabled = ext_timing_enabled
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ interval = crawler.settings.getfloat("LOGSTATS_INTERVAL")
+ if not interval:
+ raise NotConfigured
+ try:
+ ext_stats = crawler.settings.getdict("PERIODIC_LOG_STATS")
+ except (TypeError, ValueError):
+ ext_stats = (
+ {"enabled": True}
+ if crawler.settings.getbool("PERIODIC_LOG_STATS")
+ else None
+ )
+ try:
+ ext_delta = crawler.settings.getdict("PERIODIC_LOG_DELTA")
+ except (TypeError, ValueError):
+ ext_delta = (
+ {"enabled": True}
+ if crawler.settings.getbool("PERIODIC_LOG_DELTA")
+ else None
+ )
+
+ ext_timing_enabled = crawler.settings.getbool(
+ "PERIODIC_LOG_TIMING_ENABLED", False
+ )
+ if not (ext_stats or ext_delta or ext_timing_enabled):
+ raise NotConfigured
+ o = cls(
+ crawler.stats,
+ interval,
+ ext_stats,
+ ext_delta,
+ ext_timing_enabled,
+ )
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+ return o
+
+ def spider_opened(self, spider):
+ self.time_prev = datetime.now(tz=timezone.utc)
+ self.delta_prev = {}
+ self.stats_prev = {}
+
+ self.task = task.LoopingCall(self.log)
+ self.task.start(self.interval)
+
+ def log(self):
+ data = {}
+ if self.ext_timing_enabled:
+ data.update(self.log_timing())
+ if self.ext_delta_enabled:
+ data.update(self.log_delta())
+ if self.ext_stats_enabled:
+ data.update(self.log_crawler_stats())
+ logger.info(self.encoder.encode(data))
+
+ def log_delta(self):
+ num_stats = {
+ k: v
+ for k, v in self.stats._stats.items()
+ if isinstance(v, (int, float))
+ and self.param_allowed(k, self.ext_delta_include, self.ext_delta_exclude)
+ }
+ delta = {k: v - self.delta_prev.get(k, 0) for k, v in num_stats.items()}
+ self.delta_prev = num_stats
+ return {"delta": delta}
+
+ def log_timing(self):
+ now = datetime.now(tz=timezone.utc)
+ time = {
+ "log_interval": self.interval,
+ "start_time": self.stats._stats["start_time"],
+ "utcnow": now,
+ "log_interval_real": (now - self.time_prev).total_seconds(),
+ "elapsed": (now - self.stats._stats["start_time"]).total_seconds(),
+ }
+ self.time_prev = now
+ return {"time": time}
+
+ def log_crawler_stats(self):
+ stats = {
+ k: v
+ for k, v in self.stats._stats.items()
+ if self.param_allowed(k, self.ext_stats_include, self.ext_stats_exclude)
+ }
+ return {"stats": stats}
+
+ def param_allowed(self, stat_name, include, exclude):
+ if not include and not exclude:
+ return True
+ for p in exclude:
+ if p in stat_name:
+ return False
+ if exclude and not include:
+ return True
+ for p in include:
+ if p in stat_name:
+ return True
+ return False
+
+ def spider_closed(self, spider, reason):
+ self.log()
+ if self.task and self.task.running:
+ self.task.stop()
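
The include/exclude filtering in param_allowed is a plain substring check with exclusion taking precedence; the stat names below are hypothetical, and the logic is restated from the method above for illustration:

def param_allowed(stat_name, include, exclude):
    if not include and not exclude:
        return True
    if any(p in stat_name for p in exclude):
        return False
    if exclude and not include:
        return True
    return any(p in stat_name for p in include)

assert param_allowed("downloader/request_count", ["downloader/"], ["bytes"])
assert not param_allowed("downloader/request_bytes", ["downloader/"], ["bytes"])
assert not param_allowed("item_scraped_count", ["downloader/"], [])
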
diff --git a/scrapy/extensions/postprocessing.py b/scrapy/extensions/postprocessing.py
index 32f5ff6bc..17969c5b0 100644
--- a/scrapy/extensions/postprocessing.py
+++ b/scrapy/extensions/postprocessing.py
@@ -6,6 +6,7 @@ from gzip import GzipFile
from io import IOBase
from lzma import LZMAFile
from typing import Any, BinaryIO, Dict, List
+
from scrapy.utils.misc import load_object
@@ -22,14 +23,25 @@ class GzipPlugin:
See :py:class:`gzip.GzipFile` for more info about parameters.
"""
- def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) ->None:
+ def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
self.file = file
self.feed_options = feed_options
- compress_level = self.feed_options.get('gzip_compresslevel', 9)
- mtime = self.feed_options.get('gzip_mtime')
- filename = self.feed_options.get('gzip_filename')
- self.gzipfile = GzipFile(fileobj=self.file, mode='wb',
- compresslevel=compress_level, mtime=mtime, filename=filename)
+ compress_level = self.feed_options.get("gzip_compresslevel", 9)
+ mtime = self.feed_options.get("gzip_mtime")
+ filename = self.feed_options.get("gzip_filename")
+ self.gzipfile = GzipFile(
+ fileobj=self.file,
+ mode="wb",
+ compresslevel=compress_level,
+ mtime=mtime,
+ filename=filename,
+ )
+
+ def write(self, data: bytes) -> int:
+ return self.gzipfile.write(data)
+
+ def close(self) -> None:
+ self.gzipfile.close()
class Bz2Plugin:
@@ -43,12 +55,19 @@ class Bz2Plugin:
See :py:class:`bz2.BZ2File` for more info about parameters.
"""
- def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) ->None:
+ def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
self.file = file
self.feed_options = feed_options
- compress_level = self.feed_options.get('bz2_compresslevel', 9)
- self.bz2file = BZ2File(filename=self.file, mode='wb', compresslevel
- =compress_level)
+ compress_level = self.feed_options.get("bz2_compresslevel", 9)
+ self.bz2file = BZ2File(
+ filename=self.file, mode="wb", compresslevel=compress_level
+ )
+
+ def write(self, data: bytes) -> int:
+ return self.bz2file.write(data)
+
+ def close(self) -> None:
+ self.bz2file.close()
class LZMAPlugin:
@@ -68,17 +87,33 @@ class LZMAPlugin:
See :py:class:`lzma.LZMAFile` for more info about parameters.
"""
- def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) ->None:
+ def __init__(self, file: BinaryIO, feed_options: Dict[str, Any]) -> None:
self.file = file
self.feed_options = feed_options
- format = self.feed_options.get('lzma_format')
- check = self.feed_options.get('lzma_check', -1)
- preset = self.feed_options.get('lzma_preset')
- filters = self.feed_options.get('lzma_filters')
- self.lzmafile = LZMAFile(filename=self.file, mode='wb', format=
- format, check=check, preset=preset, filters=filters)
-
+ format = self.feed_options.get("lzma_format")
+ check = self.feed_options.get("lzma_check", -1)
+ preset = self.feed_options.get("lzma_preset")
+ filters = self.feed_options.get("lzma_filters")
+ self.lzmafile = LZMAFile(
+ filename=self.file,
+ mode="wb",
+ format=format,
+ check=check,
+ preset=preset,
+ filters=filters,
+ )
+
+ def write(self, data: bytes) -> int:
+ return self.lzmafile.write(data)
+
+ def close(self) -> None:
+ self.lzmafile.close()
+
+
+# io.IOBase is subclassed here so that exporters can use the PostProcessingManager
+# instance as a file-like writable object. This may be needed by exporters such as
+# CsvItemExporter, which wraps the feed storage with io.TextIOWrapper.
class PostProcessingManager(IOBase):
"""
This will manage and use declared plugins to process data in a
@@ -89,14 +124,15 @@ class PostProcessingManager(IOBase):
:type file: file like object
"""
- def __init__(self, plugins: List[Any], file: BinaryIO, feed_options:
- Dict[str, Any]) ->None:
+ def __init__(
+ self, plugins: List[Any], file: BinaryIO, feed_options: Dict[str, Any]
+ ) -> None:
self.plugins = self._load_plugins(plugins)
self.file = file
self.feed_options = feed_options
self.head_plugin = self._get_head_plugin()
- def write(self, data: bytes) ->int:
+ def write(self, data: bytes) -> int:
"""
Uses all the declared plugins to process data first, then writes
the processed data to target file.
@@ -105,10 +141,26 @@ class PostProcessingManager(IOBase):
:return: returns number of bytes written
:rtype: int
"""
- pass
+ return self.head_plugin.write(data)
+
+ def tell(self) -> int:
+ return self.file.tell()
- def close(self) ->None:
+ def close(self) -> None:
"""
Close the target file along with all the plugins.
"""
- pass
+ self.head_plugin.close()
+
+ def writable(self) -> bool:
+ return True
+
+ def _load_plugins(self, plugins: List[Any]) -> List[Any]:
+ plugins = [load_object(plugin) for plugin in plugins]
+ return plugins
+
+ def _get_head_plugin(self) -> Any:
+ prev = self.file
+ for plugin in self.plugins[::-1]:
+ prev = plugin(prev, self.feed_options)
+ return prev
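
Plugins are chained back to front by _get_head_plugin, so the first plugin in the list touches the data first and the target file sits at the end of the chain. A small sketch over an in-memory buffer (plugin paths are resolved through load_object):

from io import BytesIO

from scrapy.extensions.postprocessing import PostProcessingManager

buffer = BytesIO()
manager = PostProcessingManager(
    ["scrapy.extensions.postprocessing.GzipPlugin"], buffer, feed_options={}
)
manager.write(b'{"name": "example"}\n')
manager.close()  # closes the gzip stream, flushing the compressed bytes

print(len(buffer.getvalue()), "gzip-compressed bytes")
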
diff --git a/scrapy/extensions/spiderstate.py b/scrapy/extensions/spiderstate.py
index 903837a0c..929a3be70 100644
--- a/scrapy/extensions/spiderstate.py
+++ b/scrapy/extensions/spiderstate.py
@@ -1,5 +1,6 @@
import pickle
from pathlib import Path
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.job import job_dir
@@ -10,3 +11,30 @@ class SpiderState:
def __init__(self, jobdir=None):
self.jobdir = jobdir
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ jobdir = job_dir(crawler.settings)
+ if not jobdir:
+ raise NotConfigured
+
+ obj = cls(jobdir)
+ crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
+ crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
+ return obj
+
+ def spider_closed(self, spider):
+ if self.jobdir:
+ with Path(self.statefn).open("wb") as f:
+ pickle.dump(spider.state, f, protocol=4)
+
+ def spider_opened(self, spider):
+ if self.jobdir and Path(self.statefn).exists():
+ with Path(self.statefn).open("rb") as f:
+ spider.state = pickle.load(f)
+ else:
+ spider.state = {}
+
+ @property
+ def statefn(self) -> str:
+ return str(Path(self.jobdir, "spider.state"))
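
With this extension active (i.e. a JOBDIR configured), spiders can keep resumable data in spider.state, which is pickled on close and reloaded on the next run; a hedged usage sketch:

import scrapy

class ResumableSpider(scrapy.Spider):
    # Run with e.g.: scrapy crawl resumable -s JOBDIR=crawls/resumable-1
    name = "resumable"
    start_urls = ["https://example.com"]

    def parse(self, response):
        # self.state is restored by SpiderState.spider_opened when resuming
        self.state["pages_seen"] = self.state.get("pages_seen", 0) + 1
        yield {"url": response.url, "pages_seen": self.state["pages_seen"]}
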
diff --git a/scrapy/extensions/statsmailer.py b/scrapy/extensions/statsmailer.py
index b0e2395d9..58610c25e 100644
--- a/scrapy/extensions/statsmailer.py
+++ b/scrapy/extensions/statsmailer.py
@@ -3,14 +3,32 @@ StatsMailer extension sends an email when a spider finishes scraping.
Use STATSMAILER_RCPTS setting to enable and give the recipient mail address
"""
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.mail import MailSender
class StatsMailer:
-
def __init__(self, stats, recipients, mail):
self.stats = stats
self.recipients = recipients
self.mail = mail
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ recipients = crawler.settings.getlist("STATSMAILER_RCPTS")
+ if not recipients:
+ raise NotConfigured
+ mail = MailSender.from_settings(crawler.settings)
+ o = cls(crawler.stats, recipients, mail)
+ crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
+ return o
+
+ def spider_closed(self, spider):
+ spider_stats = self.stats.get_stats(spider)
+ body = "Global stats\n\n"
+ body += "\n".join(f"{k:<50} : {v}" for k, v in self.stats.get_stats().items())
+ body += f"\n\n{spider.name} stats\n\n"
+ body += "\n".join(f"{k:<50} : {v}" for k, v in spider_stats.items())
+ return self.mail.send(self.recipients, f"Scrapy stats for: {spider.name}", body)
diff --git a/scrapy/extensions/telnet.py b/scrapy/extensions/telnet.py
index d866ecf30..c92b7f5fe 100644
--- a/scrapy/extensions/telnet.py
+++ b/scrapy/extensions/telnet.py
@@ -3,50 +3,113 @@ Scrapy Telnet Console extension
See documentation in docs/topics/telnetconsole.rst
"""
+
import binascii
import logging
import os
import pprint
import traceback
+
from twisted.internet import protocol
+
try:
from twisted.conch import manhole, telnet
from twisted.conch.insults import insults
+
TWISTED_CONCH_AVAILABLE = True
except (ImportError, SyntaxError):
_TWISTED_CONCH_TRACEBACK = traceback.format_exc()
TWISTED_CONCH_AVAILABLE = False
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.decorators import defers
from scrapy.utils.engine import print_engine_status
from scrapy.utils.reactor import listen_tcp
from scrapy.utils.trackref import print_live_refs
+
logger = logging.getLogger(__name__)
+
+# signal to update telnet variables
+# args: telnet_vars
update_telnet_vars = object()
class TelnetConsole(protocol.ServerFactory):
-
def __init__(self, crawler):
- if not crawler.settings.getbool('TELNETCONSOLE_ENABLED'):
+ if not crawler.settings.getbool("TELNETCONSOLE_ENABLED"):
raise NotConfigured
if not TWISTED_CONCH_AVAILABLE:
raise NotConfigured(
- """TELNETCONSOLE_ENABLED setting is True but required twisted modules failed to import:
-"""
- + _TWISTED_CONCH_TRACEBACK)
+ "TELNETCONSOLE_ENABLED setting is True but required twisted "
+ "modules failed to import:\n" + _TWISTED_CONCH_TRACEBACK
+ )
self.crawler = crawler
self.noisy = False
- self.portrange = [int(x) for x in crawler.settings.getlist(
- 'TELNETCONSOLE_PORT')]
- self.host = crawler.settings['TELNETCONSOLE_HOST']
- self.username = crawler.settings['TELNETCONSOLE_USERNAME']
- self.password = crawler.settings['TELNETCONSOLE_PASSWORD']
+ self.portrange = [
+ int(x) for x in crawler.settings.getlist("TELNETCONSOLE_PORT")
+ ]
+ self.host = crawler.settings["TELNETCONSOLE_HOST"]
+ self.username = crawler.settings["TELNETCONSOLE_USERNAME"]
+ self.password = crawler.settings["TELNETCONSOLE_PASSWORD"]
+
if not self.password:
- self.password = binascii.hexlify(os.urandom(8)).decode('utf8')
- logger.info('Telnet Password: %s', self.password)
- self.crawler.signals.connect(self.start_listening, signals.
- engine_started)
- self.crawler.signals.connect(self.stop_listening, signals.
- engine_stopped)
+ self.password = binascii.hexlify(os.urandom(8)).decode("utf8")
+ logger.info("Telnet Password: %s", self.password)
+
+ self.crawler.signals.connect(self.start_listening, signals.engine_started)
+ self.crawler.signals.connect(self.stop_listening, signals.engine_stopped)
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler)
+
+ def start_listening(self):
+ self.port = listen_tcp(self.portrange, self.host, self)
+ h = self.port.getHost()
+ logger.info(
+ "Telnet console listening on %(host)s:%(port)d",
+ {"host": h.host, "port": h.port},
+ extra={"crawler": self.crawler},
+ )
+
+ def stop_listening(self):
+ self.port.stopListening()
+
+ def protocol(self):
+ class Portal:
+ """An implementation of IPortal"""
+
+ @defers
+ def login(self_, credentials, mind, *interfaces):
+ if not (
+ credentials.username == self.username.encode("utf8")
+ and credentials.checkPassword(self.password.encode("utf8"))
+ ):
+ raise ValueError("Invalid credentials")
+
+ protocol = telnet.TelnetBootstrapProtocol(
+ insults.ServerProtocol, manhole.Manhole, self._get_telnet_vars()
+ )
+ return (interfaces[0], protocol, lambda: None)
+
+ return telnet.TelnetTransport(telnet.AuthenticatingTelnetProtocol, Portal())
+
+ def _get_telnet_vars(self):
+ # Note: if you add entries here also update topics/telnetconsole.rst
+ telnet_vars = {
+ "engine": self.crawler.engine,
+ "spider": self.crawler.engine.spider,
+ "slot": self.crawler.engine.slot,
+ "crawler": self.crawler,
+ "extensions": self.crawler.extensions,
+ "stats": self.crawler.stats,
+ "settings": self.crawler.settings,
+ "est": lambda: print_engine_status(self.crawler.engine),
+ "p": pprint.pprint,
+ "prefs": print_live_refs,
+ "help": "This is Scrapy telnet console. For more info see: "
+ "https://docs.scrapy.org/en/latest/topics/telnetconsole.html",
+ }
+ self.crawler.signals.send_catch_log(update_telnet_vars, telnet_vars=telnet_vars)
+ return telnet_vars
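
Other components can extend the telnet namespace by listening for the update_telnet_vars signal emitted in _get_telnet_vars; a sketch (the extension name is hypothetical):

from scrapy.extensions.telnet import update_telnet_vars

class TelnetVarsExtension:
    @classmethod
    def from_crawler(cls, crawler):
        ext = cls()
        crawler.signals.connect(ext.add_telnet_vars, signal=update_telnet_vars)
        return ext

    def add_telnet_vars(self, telnet_vars):
        # telnet_vars is the dict built by TelnetConsole._get_telnet_vars;
        # anything added here becomes usable inside the telnet session
        telnet_vars["answer"] = 42
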
diff --git a/scrapy/extensions/throttle.py b/scrapy/extensions/throttle.py
index 4920a7cc7..396800775 100644
--- a/scrapy/extensions/throttle.py
+++ b/scrapy/extensions/throttle.py
@@ -1,23 +1,101 @@
import logging
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
+
logger = logging.getLogger(__name__)
class AutoThrottle:
-
def __init__(self, crawler):
self.crawler = crawler
- if not crawler.settings.getbool('AUTOTHROTTLE_ENABLED'):
+ if not crawler.settings.getbool("AUTOTHROTTLE_ENABLED"):
raise NotConfigured
- self.debug = crawler.settings.getbool('AUTOTHROTTLE_DEBUG')
+
+ self.debug = crawler.settings.getbool("AUTOTHROTTLE_DEBUG")
self.target_concurrency = crawler.settings.getfloat(
- 'AUTOTHROTTLE_TARGET_CONCURRENCY')
- crawler.signals.connect(self._spider_opened, signal=signals.
- spider_opened)
- crawler.signals.connect(self._response_downloaded, signal=signals.
- response_downloaded)
+ "AUTOTHROTTLE_TARGET_CONCURRENCY"
+ )
+ crawler.signals.connect(self._spider_opened, signal=signals.spider_opened)
+ crawler.signals.connect(
+ self._response_downloaded, signal=signals.response_downloaded
+ )
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler)
+
+ def _spider_opened(self, spider):
+ self.mindelay = self._min_delay(spider)
+ self.maxdelay = self._max_delay(spider)
+ spider.download_delay = self._start_delay(spider)
+
+ def _min_delay(self, spider):
+ s = self.crawler.settings
+ return getattr(spider, "download_delay", s.getfloat("DOWNLOAD_DELAY"))
+
+ def _max_delay(self, spider):
+ return self.crawler.settings.getfloat("AUTOTHROTTLE_MAX_DELAY")
+
+ def _start_delay(self, spider):
+ return max(
+ self.mindelay, self.crawler.settings.getfloat("AUTOTHROTTLE_START_DELAY")
+ )
+
+ def _response_downloaded(self, response, request, spider):
+ key, slot = self._get_slot(request, spider)
+ latency = request.meta.get("download_latency")
+ if latency is None or slot is None:
+ return
+
+ olddelay = slot.delay
+ self._adjust_delay(slot, latency, response)
+ if self.debug:
+ diff = slot.delay - olddelay
+ size = len(response.body)
+ conc = len(slot.transferring)
+ logger.info(
+ "slot: %(slot)s | conc:%(concurrency)2d | "
+ "delay:%(delay)5d ms (%(delaydiff)+d) | "
+ "latency:%(latency)5d ms | size:%(size)6d bytes",
+ {
+ "slot": key,
+ "concurrency": conc,
+ "delay": slot.delay * 1000,
+ "delaydiff": diff * 1000,
+ "latency": latency * 1000,
+ "size": size,
+ },
+ extra={"spider": spider},
+ )
+
+ def _get_slot(self, request, spider):
+ key = request.meta.get("download_slot")
+ return key, self.crawler.engine.downloader.slots.get(key)
def _adjust_delay(self, slot, latency, response):
"""Define delay adjustment policy"""
- pass
+
+        # If a server needs `latency` seconds to respond, then
+        # we should send a request every `latency/N` seconds
+        # to have N requests processed in parallel
+ target_delay = latency / self.target_concurrency
+
+ # Adjust the delay to make it closer to target_delay
+ new_delay = (slot.delay + target_delay) / 2.0
+
+ # If target delay is bigger than old delay, then use it instead of mean.
+ # It works better with problematic sites.
+ new_delay = max(target_delay, new_delay)
+
+ # Make sure self.mindelay <= new_delay <= self.max_delay
+ new_delay = min(max(self.mindelay, new_delay), self.maxdelay)
+
+        # Don't adjust the delay if the response status != 200 and the new delay
+        # is smaller than the old one, as error pages (and redirections) are
+        # usually small and so tend to reduce latency, provoking a positive
+        # feedback loop that keeps lowering the delay when it should be raised.
+ if response.status != 200 and new_delay <= slot.delay:
+ return
+
+ slot.delay = new_delay
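
As a pure-function restatement of the delay policy implemented in _adjust_delay above (useful for sanity-checking the numbers):

def next_delay(current_delay, latency, target_concurrency, min_delay, max_delay):
    # one request every latency/N seconds keeps roughly N requests in flight
    target_delay = latency / target_concurrency
    new_delay = (current_delay + target_delay) / 2.0
    new_delay = max(target_delay, new_delay)  # react quickly when a site slows down
    return min(max(min_delay, new_delay), max_delay)

# 2 s latency with AUTOTHROTTLE_TARGET_CONCURRENCY = 4 pulls a 0.1 s delay up to 0.5 s
print(next_delay(0.1, 2.0, 4.0, 0.0, 60.0))
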
diff --git a/scrapy/http/common.py b/scrapy/http/common.py
index e69de29bb..bc8861574 100644
--- a/scrapy/http/common.py
+++ b/scrapy/http/common.py
@@ -0,0 +1,7 @@
+def obsolete_setter(setter, attrname):
+ def newsetter(self, value):
+ c = self.__class__.__name__
+ msg = f"{c}.{attrname} is not modifiable, use {c}.replace() instead"
+ raise AttributeError(msg)
+
+ return newsetter
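
obsolete_setter is the helper behind read-only request/response attributes; a minimal sketch of the property pattern it is meant for (the Thing class is made up):

from scrapy.http.common import obsolete_setter

class Thing:
    def __init__(self, url):
        self._url = url

    def _get_url(self):
        return self._url

    def _set_url(self, value):
        self._url = value

    url = property(_get_url, obsolete_setter(_set_url, "url"))

thing = Thing("https://example.com")
try:
    thing.url = "https://other.example"
except AttributeError as exc:
    print(exc)  # Thing.url is not modifiable, use Thing.replace() instead
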
diff --git a/scrapy/http/cookies.py b/scrapy/http/cookies.py
index 25e4927de..15f25f69d 100644
--- a/scrapy/http/cookies.py
+++ b/scrapy/http/cookies.py
@@ -2,13 +2,16 @@ import re
import time
from http.cookiejar import CookieJar as _CookieJar
from http.cookiejar import DefaultCookiePolicy
+
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.python import to_unicode
-IPV4_RE = re.compile('\\.\\d+$', re.ASCII)
+# Defined in the http.cookiejar module, but undocumented:
+# https://github.com/python/cpython/blob/v3.9.0/Lib/http/cookiejar.py#L527
+IPV4_RE = re.compile(r"\.\d+$", re.ASCII)
-class CookieJar:
+class CookieJar:
def __init__(self, policy=None, check_expired_frequency=10000):
self.policy = policy or DefaultCookiePolicy()
self.jar = _CookieJar(self.policy)
@@ -16,12 +19,73 @@ class CookieJar:
self.check_expired_frequency = check_expired_frequency
self.processed = 0
+ def extract_cookies(self, response, request):
+ wreq = WrappedRequest(request)
+ wrsp = WrappedResponse(response)
+ return self.jar.extract_cookies(wrsp, wreq)
+
+ def add_cookie_header(self, request):
+ wreq = WrappedRequest(request)
+ self.policy._now = self.jar._now = int(time.time())
+
+        # the cookiejar implementation iterates through all domains;
+        # instead, we restrict the lookup to potential matches for the request domain
+ req_host = urlparse_cached(request).hostname
+ if not req_host:
+ return
+
+ if not IPV4_RE.search(req_host):
+ hosts = potential_domain_matches(req_host)
+ if "." not in req_host:
+ hosts += [req_host + ".local"]
+ else:
+ hosts = [req_host]
+
+ cookies = []
+ for host in hosts:
+ if host in self.jar._cookies:
+ cookies += self.jar._cookies_for_domain(host, wreq)
+
+ attrs = self.jar._cookie_attrs(cookies)
+ if attrs:
+ if not wreq.has_header("Cookie"):
+ wreq.add_unredirected_header("Cookie", "; ".join(attrs))
+
+ self.processed += 1
+ if self.processed % self.check_expired_frequency == 0:
+            # This is still quite inefficient for a large number of cookies
+ self.jar.clear_expired_cookies()
+
+ @property
+ def _cookies(self):
+ return self.jar._cookies
+
+ def clear_session_cookies(self, *args, **kwargs):
+ return self.jar.clear_session_cookies(*args, **kwargs)
+
+ def clear(self, domain=None, path=None, name=None):
+ return self.jar.clear(domain, path, name)
+
def __iter__(self):
return iter(self.jar)
def __len__(self):
return len(self.jar)
+ def set_policy(self, pol):
+ return self.jar.set_policy(pol)
+
+ def make_cookies(self, response, request):
+ wreq = WrappedRequest(request)
+ wrsp = WrappedResponse(response)
+ return self.jar.make_cookies(wrsp, wreq)
+
+ def set_cookie(self, cookie):
+ self.jar.set_cookie(cookie)
+
+ def set_cookie_if_ok(self, cookie, request):
+ self.jar.set_cookie_if_ok(cookie, WrappedRequest(request))
+
def potential_domain_matches(domain):
"""Potential domain matches for a cookie
@@ -30,11 +94,24 @@ def potential_domain_matches(domain):
['www.example.com', 'example.com', '.www.example.com', '.example.com']
"""
- pass
+ matches = [domain]
+ try:
+ start = domain.index(".") + 1
+ end = domain.rindex(".")
+ while start < end:
+ matches.append(domain[start:])
+ start = domain.index(".", start) + 1
+ except ValueError:
+ pass
+ return matches + ["." + d for d in matches]
class _DummyLock:
- pass
+ def acquire(self):
+ pass
+
+ def release(self):
+ pass
class WrappedRequest:
@@ -46,6 +123,15 @@ class WrappedRequest:
def __init__(self, request):
self.request = request
+ def get_full_url(self):
+ return self.request.url
+
+ def get_host(self):
+ return urlparse_cached(self.request).netloc
+
+ def get_type(self):
+ return urlparse_cached(self.request).scheme
+
def is_unverifiable(self):
"""Unverifiable should indicate whether the request is unverifiable, as defined by RFC 2965.
@@ -54,10 +140,56 @@ class WrappedRequest:
HTML document, and the user had no option to approve the automatic
fetching of the image, this should be true.
"""
- pass
+ return self.request.meta.get("is_unverifiable", False)
+ @property
+ def full_url(self):
+ return self.get_full_url()
-class WrappedResponse:
+ @property
+ def host(self):
+ return self.get_host()
+
+ @property
+ def type(self):
+ return self.get_type()
+
+ @property
+ def unverifiable(self):
+ return self.is_unverifiable()
+
+ @property
+ def origin_req_host(self):
+ return urlparse_cached(self.request).hostname
+
+ def has_header(self, name):
+ return name in self.request.headers
+ def get_header(self, name, default=None):
+ value = self.request.headers.get(name, default)
+ return to_unicode(value, errors="replace") if value is not None else None
+
+ def header_items(self):
+ return [
+ (
+ to_unicode(k, errors="replace"),
+ [to_unicode(x, errors="replace") for x in v],
+ )
+ for k, v in self.request.headers.items()
+ ]
+
+ def add_unredirected_header(self, name, value):
+ self.request.headers.appendlist(name, value)
+
+
+class WrappedResponse:
def __init__(self, response):
self.response = response
+
+ def info(self):
+ return self
+
+ def get_all(self, name, default=None):
+ return [
+ to_unicode(v, errors="replace") for v in self.response.headers.getlist(name)
+ ]
diff --git a/scrapy/http/headers.py b/scrapy/http/headers.py
index 730065335..822597c84 100644
--- a/scrapy/http/headers.py
+++ b/scrapy/http/headers.py
@@ -1,5 +1,7 @@
from collections.abc import Mapping
+
from w3lib.http import headers_dict_to_raw
+
from scrapy.utils.datatypes import CaseInsensitiveDict, CaselessDict
from scrapy.utils.python import to_unicode
@@ -7,17 +9,40 @@ from scrapy.utils.python import to_unicode
class Headers(CaselessDict):
"""Case insensitive http headers dictionary"""
- def __init__(self, seq=None, encoding='utf-8'):
+ def __init__(self, seq=None, encoding="utf-8"):
self.encoding = encoding
super().__init__(seq)
+ def update(self, seq):
+ seq = seq.items() if isinstance(seq, Mapping) else seq
+ iseq = {}
+ for k, v in seq:
+ iseq.setdefault(self.normkey(k), []).extend(self.normvalue(v))
+ super().update(iseq)
+
def normkey(self, key):
"""Normalize key to bytes"""
- pass
+ return self._tobytes(key.title())
def normvalue(self, value):
"""Normalize values to bytes"""
- pass
+ if value is None:
+ value = []
+ elif isinstance(value, (str, bytes)):
+ value = [value]
+ elif not hasattr(value, "__iter__"):
+ value = [value]
+
+ return [self._tobytes(x) for x in value]
+
+ def _tobytes(self, x):
+ if isinstance(x, bytes):
+ return x
+ if isinstance(x, str):
+ return x.encode(self.encoding)
+ if isinstance(x, int):
+ return str(x).encode(self.encoding)
+ raise TypeError(f"Unsupported value type: {type(x)}")
def __getitem__(self, key):
try:
@@ -25,12 +50,53 @@ class Headers(CaselessDict):
except IndexError:
return None
+ def get(self, key, def_val=None):
+ try:
+ return super().get(key, def_val)[-1]
+ except IndexError:
+ return None
+
+ def getlist(self, key, def_val=None):
+ try:
+ return super().__getitem__(key)
+ except KeyError:
+ if def_val is not None:
+ return self.normvalue(def_val)
+ return []
+
+ def setlist(self, key, list_):
+ self[key] = list_
+
+ def setlistdefault(self, key, default_list=()):
+ return self.setdefault(key, default_list)
+
+ def appendlist(self, key, value):
+ lst = self.getlist(key)
+ lst.extend(self.normvalue(value))
+ self[key] = lst
+
+ def items(self):
+ return ((k, self.getlist(k)) for k in self.keys())
+
+ def values(self):
+ return [self[k] for k in self.keys()]
+
+ def to_string(self):
+ return headers_dict_to_raw(self)
+
def to_unicode_dict(self):
"""Return headers as a CaselessDict with unicode keys
and unicode values. Multiple values are joined with ','.
"""
- pass
+ return CaseInsensitiveDict(
+ (
+ to_unicode(key, encoding=self.encoding),
+ to_unicode(b",".join(value), encoding=self.encoding),
+ )
+ for key, value in self.items()
+ )
def __copy__(self):
return self.__class__(self)
+
copy = __copy__
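
The restored Headers helpers normalize keys to title-cased bytes and store every value as a list of bytes, which a quick usage sketch makes concrete:

from scrapy.http import Headers

headers = Headers({"Content-Type": "text/html"})
headers.appendlist("Set-Cookie", "a=1")
headers.appendlist("set-cookie", "b=2")

assert headers["content-type"] == b"text/html"        # lookups are case-insensitive
assert headers.getlist("Set-Cookie") == [b"a=1", b"b=2"]
assert headers.to_unicode_dict()["set-cookie"] == "a=1,b=2"
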
diff --git a/scrapy/http/request/form.py b/scrapy/http/request/form.py
index 351539ea3..2d1f33edd 100644
--- a/scrapy/http/request/form.py
+++ b/scrapy/http/request/form.py
@@ -4,59 +4,255 @@ This module implements the FormRequest class which is a more convenient class
See documentation in docs/topics/request-response.rst
"""
+
from typing import Iterable, List, Optional, Tuple, Type, TypeVar, Union, cast
from urllib.parse import urlencode, urljoin, urlsplit, urlunsplit
-from lxml.html import FormElement
-from lxml.html import InputElement
-from lxml.html import MultipleSelectOptions
-from lxml.html import SelectElement
-from lxml.html import TextareaElement
+
+from lxml.html import FormElement # nosec
+from lxml.html import InputElement # nosec
+from lxml.html import MultipleSelectOptions # nosec
+from lxml.html import SelectElement # nosec
+from lxml.html import TextareaElement # nosec
from w3lib.html import strip_html5_whitespace
+
from scrapy.http.request import Request
from scrapy.http.response.text import TextResponse
from scrapy.utils.python import is_listlike, to_bytes
-FormRequestTypeVar = TypeVar('FormRequestTypeVar', bound='FormRequest')
+
+FormRequestTypeVar = TypeVar("FormRequestTypeVar", bound="FormRequest")
+
FormdataKVType = Tuple[str, Union[str, Iterable[str]]]
FormdataType = Optional[Union[dict, List[FormdataKVType]]]
class FormRequest(Request):
- valid_form_methods = ['GET', 'POST']
+ valid_form_methods = ["GET", "POST"]
+
+ def __init__(self, *args, formdata: FormdataType = None, **kwargs) -> None:
+ if formdata and kwargs.get("method") is None:
+ kwargs["method"] = "POST"
- def __init__(self, *args, formdata: FormdataType=None, **kwargs) ->None:
- if formdata and kwargs.get('method') is None:
- kwargs['method'] = 'POST'
super().__init__(*args, **kwargs)
+
if formdata:
- items = formdata.items() if isinstance(formdata, dict
- ) else formdata
+ items = formdata.items() if isinstance(formdata, dict) else formdata
form_query_str = _urlencode(items, self.encoding)
- if self.method == 'POST':
- self.headers.setdefault(b'Content-Type',
- b'application/x-www-form-urlencoded')
+ if self.method == "POST":
+ self.headers.setdefault(
+ b"Content-Type", b"application/x-www-form-urlencoded"
+ )
self._set_body(form_query_str)
else:
- self._set_url(urlunsplit(urlsplit(self.url)._replace(query=
- form_query_str)))
+ self._set_url(
+ urlunsplit(urlsplit(self.url)._replace(query=form_query_str))
+ )
+
+ @classmethod
+ def from_response(
+ cls: Type[FormRequestTypeVar],
+ response: TextResponse,
+ formname: Optional[str] = None,
+ formid: Optional[str] = None,
+ formnumber: int = 0,
+ formdata: FormdataType = None,
+ clickdata: Optional[dict] = None,
+ dont_click: bool = False,
+ formxpath: Optional[str] = None,
+ formcss: Optional[str] = None,
+ **kwargs,
+ ) -> FormRequestTypeVar:
+ kwargs.setdefault("encoding", response.encoding)
+
+ if formcss is not None:
+ from parsel.csstranslator import HTMLTranslator
+
+ formxpath = HTMLTranslator().css_to_xpath(formcss)
+
+ form = _get_form(response, formname, formid, formnumber, formxpath)
+ formdata = _get_inputs(form, formdata, dont_click, clickdata)
+ url = _get_form_url(form, kwargs.pop("url", None))
+
+ method = kwargs.pop("method", form.method)
+ if method is not None:
+ method = method.upper()
+ if method not in cls.valid_form_methods:
+ method = "GET"
+
+ return cls(url=url, method=method, formdata=formdata, **kwargs)
+
+
+def _get_form_url(form: FormElement, url: Optional[str]) -> str:
+ assert form.base_url is not None # typing
+ if url is None:
+ action = form.get("action")
+ if action is None:
+ return form.base_url
+ return urljoin(form.base_url, strip_html5_whitespace(action))
+ return urljoin(form.base_url, url)
+
+def _urlencode(seq: Iterable[FormdataKVType], enc: str) -> str:
+ values = [
+ (to_bytes(k, enc), to_bytes(v, enc))
+ for k, vs in seq
+ for v in (cast(Iterable[str], vs) if is_listlike(vs) else [cast(str, vs)])
+ ]
+ return urlencode(values, doseq=True)
-def _get_form(response: TextResponse, formname: Optional[str], formid:
- Optional[str], formnumber: int, formxpath: Optional[str]) ->FormElement:
+
+def _get_form(
+ response: TextResponse,
+ formname: Optional[str],
+ formid: Optional[str],
+ formnumber: int,
+ formxpath: Optional[str],
+) -> FormElement:
"""Find the wanted form element within the given response."""
- pass
+ root = response.selector.root
+ forms = root.xpath("//form")
+ if not forms:
+ raise ValueError(f"No <form> element found in {response}")
+
+ if formname is not None:
+ f = root.xpath(f'//form[@name="{formname}"]')
+ if f:
+ return f[0]
+
+ if formid is not None:
+ f = root.xpath(f'//form[@id="{formid}"]')
+ if f:
+ return f[0]
+
+ # Get form element from xpath, if not found, go up
+ if formxpath is not None:
+ nodes = root.xpath(formxpath)
+ if nodes:
+ el = nodes[0]
+ while True:
+ if el.tag == "form":
+ return el
+ el = el.getparent()
+ if el is None:
+ break
+ raise ValueError(f"No <form> element found with {formxpath}")
+ # If we get here, it means that either formname was None or invalid
+ try:
+ form = forms[formnumber]
+ except IndexError:
+ raise IndexError(f"Form number {formnumber} not found in {response}")
+ else:
+ return form
-def _get_inputs(form: FormElement, formdata: FormdataType, dont_click: bool,
- clickdata: Optional[dict]) ->List[FormdataKVType]:
+
+def _get_inputs(
+ form: FormElement,
+ formdata: FormdataType,
+ dont_click: bool,
+ clickdata: Optional[dict],
+) -> List[FormdataKVType]:
"""Return a list of key-value pairs for the inputs found in the given form."""
- pass
+ try:
+ formdata_keys = dict(formdata or ()).keys()
+ except (ValueError, TypeError):
+ raise ValueError("formdata should be a dict or iterable of tuples")
+
+ if not formdata:
+ formdata = []
+ inputs = form.xpath(
+ "descendant::textarea"
+ "|descendant::select"
+ "|descendant::input[not(@type) or @type["
+ ' not(re:test(., "^(?:submit|image|reset)$", "i"))'
+ " and (../@checked or"
+ ' not(re:test(., "^(?:checkbox|radio)$", "i")))]]',
+ namespaces={"re": "http://exslt.org/regular-expressions"},
+ )
+ values: List[FormdataKVType] = [
+ (k, "" if v is None else v)
+ for k, v in (_value(e) for e in inputs)
+ if k and k not in formdata_keys
+ ]
+
+ if not dont_click:
+ clickable = _get_clickable(clickdata, form)
+        if clickable and clickable[0] not in formdata and clickable[0] is not None:
+ values.append(clickable)
+
+ if isinstance(formdata, dict):
+ formdata = formdata.items() # type: ignore[assignment]
+ values.extend((k, v) for k, v in formdata if v is not None)
+ return values
-def _get_clickable(clickdata: Optional[dict], form: FormElement) ->Optional[
- Tuple[str, str]]:
+
+def _value(
+ ele: Union[InputElement, SelectElement, TextareaElement]
+) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
+ n = ele.name
+ v = ele.value
+ if ele.tag == "select":
+ return _select_value(cast(SelectElement, ele), n, v)
+ return n, v
+
+
+def _select_value(
+ ele: SelectElement, n: Optional[str], v: Union[None, str, MultipleSelectOptions]
+) -> Tuple[Optional[str], Union[None, str, MultipleSelectOptions]]:
+ multiple = ele.multiple
+ if v is None and not multiple:
+ # Match browser behaviour on simple select tag without options selected
+ # And for select tags without options
+ o = ele.value_options
+ return (n, o[0]) if o else (None, None)
+ return n, v
+
+
+def _get_clickable(
+ clickdata: Optional[dict], form: FormElement
+) -> Optional[Tuple[str, str]]:
"""
Returns the clickable element specified in clickdata,
if the latter is given. If not, it returns the first
clickable element found
"""
- pass
+ clickables = list(
+ form.xpath(
+ 'descendant::input[re:test(@type, "^(submit|image)$", "i")]'
+ '|descendant::button[not(@type) or re:test(@type, "^submit$", "i")]',
+ namespaces={"re": "http://exslt.org/regular-expressions"},
+ )
+ )
+ if not clickables:
+ return None
+
+ # If we don't have clickdata, we just use the first clickable element
+ if clickdata is None:
+ el = clickables[0]
+ return (el.get("name"), el.get("value") or "")
+
+ # If clickdata is given, we compare it to the clickable elements to find a
+ # match. We first look to see if the number is specified in clickdata,
+ # because that uniquely identifies the element
+ nr = clickdata.get("nr", None)
+ if nr is not None:
+ try:
+ el = list(form.inputs)[nr]
+ except IndexError:
+ pass
+ else:
+ return (el.get("name"), el.get("value") or "")
+
+ # We didn't find it, so now we build an XPath expression out of the other
+ # arguments, because they can be used as such
+ xpath = ".//*" + "".join(f'[@{k}="{v}"]' for k, v in clickdata.items())
+ el = form.xpath(xpath)
+ if len(el) == 1:
+ return (el[0].get("name"), el[0].get("value") or "")
+ if len(el) > 1:
+ raise ValueError(
+ f"Multiple elements found ({el!r}) matching the "
+ f"criteria in clickdata: {clickdata!r}"
+ )
+ else:
+ raise ValueError(f"No clickable element matching clickdata: {clickdata!r}")
diff --git a/scrapy/http/request/json_request.py b/scrapy/http/request/json_request.py
index 7a8404484..510c903db 100644
--- a/scrapy/http/request/json_request.py
+++ b/scrapy/http/request/json_request.py
@@ -4,40 +4,60 @@ This module implements the JsonRequest class which is a more convenient class
See documentation in docs/topics/request-response.rst
"""
+
import copy
import json
import warnings
from typing import Optional, Tuple
+
from scrapy.http.request import Request
from scrapy.utils.deprecate import create_deprecated_class
class JsonRequest(Request):
- attributes: Tuple[str, ...] = Request.attributes + ('dumps_kwargs',)
+ attributes: Tuple[str, ...] = Request.attributes + ("dumps_kwargs",)
- def __init__(self, *args, dumps_kwargs: Optional[dict]=None, **kwargs
- ) ->None:
- dumps_kwargs = copy.deepcopy(dumps_kwargs
- ) if dumps_kwargs is not None else {}
- dumps_kwargs.setdefault('sort_keys', True)
+ def __init__(self, *args, dumps_kwargs: Optional[dict] = None, **kwargs) -> None:
+ dumps_kwargs = copy.deepcopy(dumps_kwargs) if dumps_kwargs is not None else {}
+ dumps_kwargs.setdefault("sort_keys", True)
self._dumps_kwargs = dumps_kwargs
- body_passed = kwargs.get('body', None) is not None
- data = kwargs.pop('data', None)
+
+ body_passed = kwargs.get("body", None) is not None
+ data = kwargs.pop("data", None)
data_passed = data is not None
+
if body_passed and data_passed:
- warnings.warn('Both body and data passed. data will be ignored')
+ warnings.warn("Both body and data passed. data will be ignored")
elif not body_passed and data_passed:
- kwargs['body'] = self._dumps(data)
- if 'method' not in kwargs:
- kwargs['method'] = 'POST'
+ kwargs["body"] = self._dumps(data)
+ if "method" not in kwargs:
+ kwargs["method"] = "POST"
+
super().__init__(*args, **kwargs)
- self.headers.setdefault('Content-Type', 'application/json')
- self.headers.setdefault('Accept',
- 'application/json, text/javascript, */*; q=0.01')
+ self.headers.setdefault("Content-Type", "application/json")
+ self.headers.setdefault(
+ "Accept", "application/json, text/javascript, */*; q=0.01"
+ )
+
+ @property
+ def dumps_kwargs(self) -> dict:
+ return self._dumps_kwargs
+
+ def replace(self, *args, **kwargs) -> Request:
+ body_passed = kwargs.get("body", None) is not None
+ data = kwargs.pop("data", None)
+ data_passed = data is not None
+
+ if body_passed and data_passed:
+ warnings.warn("Both body and data passed. data will be ignored")
+ elif not body_passed and data_passed:
+ kwargs["body"] = self._dumps(data)
+
+ return super().replace(*args, **kwargs)
- def _dumps(self, data: dict) ->str:
+ def _dumps(self, data: dict) -> str:
"""Convert to JSON"""
- pass
+ return json.dumps(data, **self._dumps_kwargs)
-JSONRequest = create_deprecated_class('JSONRequest', JsonRequest)
+JSONRequest = create_deprecated_class("JSONRequest", JsonRequest)
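Note: with the replace() override restored above, data passed to JsonRequest (or to replace()) is serialized with json.dumps(sort_keys=True) into the body, the method defaults to POST, and JSON headers are set. A small sketch (URL hypothetical):

    from scrapy.http import JsonRequest

    req = JsonRequest(url="https://example.com/api", data={"query": "scrapy", "page": 1})
    assert req.method == "POST"
    assert req.headers["Content-Type"] == b"application/json"

    # replace() re-serializes the new data into the body
    req2 = req.replace(data={"query": "scrapy", "page": 2})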
diff --git a/scrapy/http/request/rpc.py b/scrapy/http/request/rpc.py
index 2ed828664..59767de7a 100644
--- a/scrapy/http/request/rpc.py
+++ b/scrapy/http/request/rpc.py
@@ -6,22 +6,32 @@ See documentation in docs/topics/request-response.rst
"""
import xmlrpc.client as xmlrpclib
from typing import Optional
+
import defusedxml.xmlrpc
+
from scrapy.http.request import Request
from scrapy.utils.python import get_func_args
+
defusedxml.xmlrpc.monkey_patch()
+
DUMPS_ARGS = get_func_args(xmlrpclib.dumps)
class XmlRpcRequest(Request):
-
- def __init__(self, *args, encoding: Optional[str]=None, **kwargs):
- if 'body' not in kwargs and 'params' in kwargs:
+ def __init__(self, *args, encoding: Optional[str] = None, **kwargs):
+ if "body" not in kwargs and "params" in kwargs:
kw = dict((k, kwargs.pop(k)) for k in DUMPS_ARGS if k in kwargs)
- kwargs['body'] = xmlrpclib.dumps(**kw)
- kwargs.setdefault('method', 'POST')
- kwargs.setdefault('dont_filter', True)
+ kwargs["body"] = xmlrpclib.dumps(**kw)
+
+ # spec defines that requests must use POST method
+ kwargs.setdefault("method", "POST")
+
+        # XML-RPC clients typically query the same URL multiple times
+ kwargs.setdefault("dont_filter", True)
+
+ # restore encoding
if encoding is not None:
- kwargs['encoding'] = encoding
+ kwargs["encoding"] = encoding
+
super().__init__(*args, **kwargs)
- self.headers.setdefault('Content-Type', 'text/xml')
+ self.headers.setdefault("Content-Type", "text/xml")
diff --git a/scrapy/http/response/html.py b/scrapy/http/response/html.py
index d55895aa3..7eed052c2 100644
--- a/scrapy/http/response/html.py
+++ b/scrapy/http/response/html.py
@@ -4,6 +4,7 @@ discovering through HTML encoding declarations to the TextResponse class.
See documentation in docs/topics/request-response.rst
"""
+
from scrapy.http.response.text import TextResponse
diff --git a/scrapy/http/response/text.py b/scrapy/http/response/text.py
index 4b8b63972..47d7bc10f 100644
--- a/scrapy/http/response/text.py
+++ b/scrapy/http/response/text.py
@@ -5,55 +5,176 @@ discovering (through HTTP headers) to base Response class.
See documentation in docs/topics/request-response.rst
"""
from __future__ import annotations
+
import json
from contextlib import suppress
from typing import TYPE_CHECKING, Any, Generator, Optional, Tuple
from urllib.parse import urljoin
+
import parsel
-from w3lib.encoding import html_body_declared_encoding, html_to_unicode, http_content_type_encoding, read_bom, resolve_encoding
+from w3lib.encoding import (
+ html_body_declared_encoding,
+ html_to_unicode,
+ http_content_type_encoding,
+ read_bom,
+ resolve_encoding,
+)
from w3lib.html import strip_html5_whitespace
+
from scrapy.http import Request
from scrapy.http.response import Response
from scrapy.utils.python import memoizemethod_noargs, to_unicode
from scrapy.utils.response import get_base_url
+
if TYPE_CHECKING:
from scrapy.selector import Selector
+
_NONE = object()
class TextResponse(Response):
- _DEFAULT_ENCODING = 'ascii'
+ _DEFAULT_ENCODING = "ascii"
_cached_decoded_json = _NONE
- attributes: Tuple[str, ...] = Response.attributes + ('encoding',)
+
+ attributes: Tuple[str, ...] = Response.attributes + ("encoding",)
def __init__(self, *args: Any, **kwargs: Any):
- self._encoding = kwargs.pop('encoding', None)
+ self._encoding = kwargs.pop("encoding", None)
self._cached_benc: Optional[str] = None
self._cached_ubody: Optional[str] = None
self._cached_selector: Optional[Selector] = None
super().__init__(*args, **kwargs)
+ def _set_url(self, url):
+ if isinstance(url, str):
+ self._url = to_unicode(url, self.encoding)
+ else:
+ super()._set_url(url)
+
+ def _set_body(self, body):
+ self._body = b"" # used by encoding detection
+ if isinstance(body, str):
+ if self._encoding is None:
+ raise TypeError(
+ "Cannot convert unicode body - "
+ f"{type(self).__name__} has no encoding"
+ )
+ self._body = body.encode(self._encoding)
+ else:
+ super()._set_body(body)
+
+ @property
+ def encoding(self):
+ return self._declared_encoding() or self._body_inferred_encoding()
+
+ def _declared_encoding(self):
+ return (
+ self._encoding
+ or self._bom_encoding()
+ or self._headers_encoding()
+ or self._body_declared_encoding()
+ )
+
def json(self):
"""
.. versionadded:: 2.2
Deserialize a JSON document to a Python object.
"""
- pass
+ if self._cached_decoded_json is _NONE:
+ self._cached_decoded_json = json.loads(self.body)
+ return self._cached_decoded_json
@property
- def text(self) ->str:
+ def text(self) -> str:
"""Body as unicode"""
- pass
+ # access self.encoding before _cached_ubody to make sure
+ # _body_inferred_encoding is called
+ benc = self.encoding
+ if self._cached_ubody is None:
+ charset = f"charset={benc}"
+ self._cached_ubody = html_to_unicode(charset, self.body)[1]
+ return self._cached_ubody
def urljoin(self, url):
"""Join this Response's url with a possible relative url to form an
absolute interpretation of the latter."""
- pass
+ return urljoin(get_base_url(self), url)
+
+ @memoizemethod_noargs
+ def _headers_encoding(self):
+ content_type = self.headers.get(b"Content-Type", b"")
+ return http_content_type_encoding(to_unicode(content_type, encoding="latin-1"))
+
+ def _body_inferred_encoding(self):
+ if self._cached_benc is None:
+ content_type = to_unicode(
+ self.headers.get(b"Content-Type", b""), encoding="latin-1"
+ )
+ benc, ubody = html_to_unicode(
+ content_type,
+ self.body,
+ auto_detect_fun=self._auto_detect_fun,
+ default_encoding=self._DEFAULT_ENCODING,
+ )
+ self._cached_benc = benc
+ self._cached_ubody = ubody
+ return self._cached_benc
- def follow(self, url, callback=None, method='GET', headers=None, body=
- None, cookies=None, meta=None, encoding=None, priority=0,
- dont_filter=False, errback=None, cb_kwargs=None, flags=None) ->Request:
+ def _auto_detect_fun(self, text):
+ for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"):
+ try:
+ text.decode(enc)
+ except UnicodeError:
+ continue
+ return resolve_encoding(enc)
+
+ @memoizemethod_noargs
+ def _body_declared_encoding(self):
+ return html_body_declared_encoding(self.body)
+
+ @memoizemethod_noargs
+ def _bom_encoding(self):
+ return read_bom(self.body)[0]
+
+ @property
+ def selector(self):
+ from scrapy.selector import Selector
+
+ if self._cached_selector is None:
+ self._cached_selector = Selector(self)
+ return self._cached_selector
+
+ def jmespath(self, query, **kwargs):
+ if not hasattr(self.selector, "jmespath"): # type: ignore[attr-defined]
+ raise AttributeError(
+ "Please install parsel >= 1.8.1 to get jmespath support"
+ )
+
+ return self.selector.jmespath(query, **kwargs) # type: ignore[attr-defined]
+
+ def xpath(self, query, **kwargs):
+ return self.selector.xpath(query, **kwargs)
+
+ def css(self, query):
+ return self.selector.css(query)
+
+ def follow(
+ self,
+ url,
+ callback=None,
+ method="GET",
+ headers=None,
+ body=None,
+ cookies=None,
+ meta=None,
+ encoding=None,
+ priority=0,
+ dont_filter=False,
+ errback=None,
+ cb_kwargs=None,
+ flags=None,
+ ) -> Request:
"""
Return a :class:`~.Request` instance to follow a link ``url``.
It accepts the same arguments as ``Request.__init__`` method,
@@ -70,12 +191,45 @@ class TextResponse(Response):
See :ref:`response-follow-example` for usage examples.
"""
- pass
+ if isinstance(url, parsel.Selector):
+ url = _url_from_selector(url)
+ elif isinstance(url, parsel.SelectorList):
+ raise ValueError("SelectorList is not supported")
+ encoding = self.encoding if encoding is None else encoding
+ return super().follow(
+ url=url,
+ callback=callback,
+ method=method,
+ headers=headers,
+ body=body,
+ cookies=cookies,
+ meta=meta,
+ encoding=encoding,
+ priority=priority,
+ dont_filter=dont_filter,
+ errback=errback,
+ cb_kwargs=cb_kwargs,
+ flags=flags,
+ )
- def follow_all(self, urls=None, callback=None, method='GET', headers=
- None, body=None, cookies=None, meta=None, encoding=None, priority=0,
- dont_filter=False, errback=None, cb_kwargs=None, flags=None, css=
- None, xpath=None) ->Generator[Request, None, None]:
+ def follow_all(
+ self,
+ urls=None,
+ callback=None,
+ method="GET",
+ headers=None,
+ body=None,
+ cookies=None,
+ meta=None,
+ encoding=None,
+ priority=0,
+ dont_filter=False,
+ errback=None,
+ cb_kwargs=None,
+ flags=None,
+ css=None,
+ xpath=None,
+ ) -> Generator[Request, None, None]:
"""
A generator that produces :class:`~.Request` instances to follow all
links in ``urls``. It accepts the same arguments as the :class:`~.Request`'s
@@ -99,10 +253,57 @@ class TextResponse(Response):
selectors from which links cannot be obtained (for instance, anchor tags without an
``href`` attribute)
"""
- pass
+ arguments = [x for x in (urls, css, xpath) if x is not None]
+ if len(arguments) != 1:
+ raise ValueError(
+ "Please supply exactly one of the following arguments: urls, css, xpath"
+ )
+ if not urls:
+ if css:
+ urls = self.css(css)
+ if xpath:
+ urls = self.xpath(xpath)
+ if isinstance(urls, parsel.SelectorList):
+ selectors = urls
+ urls = []
+ for sel in selectors:
+ with suppress(_InvalidSelector):
+ urls.append(_url_from_selector(sel))
+ return super().follow_all(
+ urls=urls,
+ callback=callback,
+ method=method,
+ headers=headers,
+ body=body,
+ cookies=cookies,
+ meta=meta,
+ encoding=encoding,
+ priority=priority,
+ dont_filter=dont_filter,
+ errback=errback,
+ cb_kwargs=cb_kwargs,
+ flags=flags,
+ )
class _InvalidSelector(ValueError):
"""
Raised when a URL cannot be obtained from a Selector
"""
+
+
+def _url_from_selector(sel):
+ # type: (parsel.Selector) -> str
+ if isinstance(sel.root, str):
+ # e.g. ::attr(href) result
+ return strip_html5_whitespace(sel.root)
+ if not hasattr(sel.root, "tag"):
+ raise _InvalidSelector(f"Unsupported selector: {sel}")
+ if sel.root.tag not in ("a", "link"):
+ raise _InvalidSelector(
+ "Only <a> and <link> elements are supported; " f"got <{sel.root.tag}>"
+ )
+ href = sel.root.get("href")
+ if href is None:
+ raise _InvalidSelector(f"<{sel.root.tag}> element has no href attribute: {sel}")
+ return strip_html5_whitespace(href)
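Note: the restored follow()/follow_all() accept not only URL strings but also Selectors (an <a> or <link> element, or an ::attr(href) result), resolving the href relative to the response and reusing its encoding. A typical spider (names and URLs hypothetical):

    import scrapy

    class ProductsSpider(scrapy.Spider):
        name = "products"                       # hypothetical
        start_urls = ["https://example.com/"]   # hypothetical

        def parse(self, response):
            # a Selector for an <a> element can be passed directly; its href is extracted
            for a in response.css("ul.pager a"):
                yield response.follow(a, callback=self.parse)

            # follow_all with css= yields one Request per matching link;
            # selectors without an href attribute are silently skipped
            yield from response.follow_all(css="a.product", callback=self.parse_item)

        def parse_item(self, response):
            yield {"url": response.url, "title": response.css("title::text").get()}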
diff --git a/scrapy/http/response/xml.py b/scrapy/http/response/xml.py
index 9ff465ec5..abf474a2f 100644
--- a/scrapy/http/response/xml.py
+++ b/scrapy/http/response/xml.py
@@ -4,6 +4,7 @@ discovering through XML encoding declarations to the TextResponse class.
See documentation in docs/topics/request-response.rst
"""
+
from scrapy.http.response.text import TextResponse
diff --git a/scrapy/interfaces.py b/scrapy/interfaces.py
index 151522c8c..9a2c5f170 100644
--- a/scrapy/interfaces.py
+++ b/scrapy/interfaces.py
@@ -2,21 +2,16 @@ from zope.interface import Interface
class ISpiderLoader(Interface):
-
def from_settings(settings):
"""Return an instance of the class for the given settings"""
- pass
def load(spider_name):
"""Return the Spider class for the given spider name. If the spider
name is not found, it must raise a KeyError."""
- pass
def list():
"""Return a list with the names of all spiders available in the
project"""
- pass
def find_by_request(request):
"""Return the list of spiders names that can handle the given request"""
- pass
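Note: ISpiderLoader only declares the contract; the default scrapy.spiderloader.SpiderLoader implements it. A minimal custom loader sketch (module path and spider class are hypothetical), selected through the SPIDER_LOADER_CLASS setting:

    from zope.interface import implementer
    from scrapy.interfaces import ISpiderLoader

    @implementer(ISpiderLoader)
    class SingleSpiderLoader:
        def __init__(self, spider_cls):
            self._spider_cls = spider_cls

        @classmethod
        def from_settings(cls, settings):
            from myproject.spiders import ExampleSpider  # hypothetical
            return cls(ExampleSpider)

        def load(self, spider_name):
            if spider_name != self._spider_cls.name:
                raise KeyError(f"Spider not found: {spider_name}")
            return self._spider_cls

        def list(self):
            return [self._spider_cls.name]

        def find_by_request(self, request):
            return [self._spider_cls.name]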
diff --git a/scrapy/item.py b/scrapy/item.py
index aae295969..d3eb90b7b 100644
--- a/scrapy/item.py
+++ b/scrapy/item.py
@@ -3,11 +3,13 @@ Scrapy Item
See documentation in docs/topics/item.rst
"""
+
from abc import ABCMeta
from collections.abc import MutableMapping
from copy import deepcopy
from pprint import pformat
from typing import Dict
+
from scrapy.utils.trackref import object_ref
@@ -22,11 +24,11 @@ class ItemMeta(ABCMeta):
"""
def __new__(mcs, class_name, bases, attrs):
- classcell = attrs.pop('__classcell__', None)
- new_bases = tuple(base._class for base in bases if hasattr(base,
- '_class'))
- _class = super().__new__(mcs, 'x_' + class_name, new_bases, attrs)
- fields = getattr(_class, 'fields', {})
+ classcell = attrs.pop("__classcell__", None)
+ new_bases = tuple(base._class for base in bases if hasattr(base, "_class"))
+ _class = super().__new__(mcs, "x_" + class_name, new_bases, attrs)
+
+ fields = getattr(_class, "fields", {})
new_attrs = {}
for n in dir(_class):
v = getattr(_class, n)
@@ -34,10 +36,11 @@ class ItemMeta(ABCMeta):
fields[n] = v
elif n in attrs:
new_attrs[n] = attrs[n]
- new_attrs['fields'] = fields
- new_attrs['_class'] = _class
+
+ new_attrs["fields"] = fields
+ new_attrs["_class"] = _class
if classcell is not None:
- new_attrs['__classcell__'] = classcell
+ new_attrs["__classcell__"] = classcell
return super().__new__(mcs, class_name, bases, new_attrs)
@@ -63,11 +66,12 @@ class Item(MutableMapping, object_ref, metaclass=ItemMeta):
Unlike instances of :class:`dict`, instances of :class:`Item` may be
:ref:`tracked <topics-leaks-trackrefs>` to debug memory leaks.
"""
+
fields: Dict[str, Field]
def __init__(self, *args, **kwargs):
self._values = {}
- if args or kwargs:
+ if args or kwargs: # avoid creating dict for most common case
for k, v in dict(*args, **kwargs).items():
self[k] = v
@@ -78,21 +82,19 @@ class Item(MutableMapping, object_ref, metaclass=ItemMeta):
if key in self.fields:
self._values[key] = value
else:
- raise KeyError(
- f'{self.__class__.__name__} does not support field: {key}')
+ raise KeyError(f"{self.__class__.__name__} does not support field: {key}")
def __delitem__(self, key):
del self._values[key]
def __getattr__(self, name):
if name in self.fields:
- raise AttributeError(f'Use item[{name!r}] to get field value')
+ raise AttributeError(f"Use item[{name!r}] to get field value")
raise AttributeError(name)
def __setattr__(self, name, value):
- if not name.startswith('_'):
- raise AttributeError(
- f'Use item[{name!r}] = {value!r} to set field value')
+ if not name.startswith("_"):
+ raise AttributeError(f"Use item[{name!r}] = {value!r} to set field value")
super().__setattr__(name, value)
def __len__(self):
@@ -100,11 +102,18 @@ class Item(MutableMapping, object_ref, metaclass=ItemMeta):
def __iter__(self):
return iter(self._values)
+
__hash__ = object_ref.__hash__
+ def keys(self):
+ return self._values.keys()
+
def __repr__(self):
return pformat(dict(self))
+ def copy(self):
+ return self.__class__(self)
+
def deepcopy(self):
"""Return a :func:`~copy.deepcopy` of this item."""
- pass
+ return deepcopy(self)
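Note: Items declare their allowed keys through Field descriptors collected by ItemMeta; assigning to an undeclared key raises KeyError, and attribute access is deliberately blocked in favour of item[...]. For example:

    import scrapy

    class Product(scrapy.Item):
        name = scrapy.Field()
        price = scrapy.Field()

    item = Product(name="Widget", price=9.99)
    item["price"] = 10.49        # fields are accessed like dict keys
    clone = item.copy()          # shallow copy via self.__class__(self)
    snapshot = item.deepcopy()   # fully independent copy
    # item["colour"] = "red"     # would raise KeyError: undeclared field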
diff --git a/scrapy/link.py b/scrapy/link.py
index 2bc6f207e..0868ae5ef 100644
--- a/scrapy/link.py
+++ b/scrapy/link.py
@@ -24,29 +24,37 @@ class Link:
:param nofollow: an indication of the presence or absence of a nofollow value in the ``rel`` attribute
of the anchor tag.
"""
- __slots__ = ['url', 'text', 'fragment', 'nofollow']
- def __init__(self, url: str, text: str='', fragment: str='', nofollow:
- bool=False):
+ __slots__ = ["url", "text", "fragment", "nofollow"]
+
+ def __init__(
+ self, url: str, text: str = "", fragment: str = "", nofollow: bool = False
+ ):
if not isinstance(url, str):
got = url.__class__.__name__
- raise TypeError(f'Link urls must be str objects, got {got}')
+ raise TypeError(f"Link urls must be str objects, got {got}")
self.url: str = url
self.text: str = text
self.fragment: str = fragment
self.nofollow: bool = nofollow
- def __eq__(self, other: Any) ->bool:
+ def __eq__(self, other: Any) -> bool:
if not isinstance(other, Link):
raise NotImplementedError
- return (self.url == other.url and self.text == other.text and self.
- fragment == other.fragment and self.nofollow == other.nofollow)
+ return (
+ self.url == other.url
+ and self.text == other.text
+ and self.fragment == other.fragment
+ and self.nofollow == other.nofollow
+ )
- def __hash__(self) ->int:
- return hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(
- self.nofollow)
+ def __hash__(self) -> int:
+ return (
+ hash(self.url) ^ hash(self.text) ^ hash(self.fragment) ^ hash(self.nofollow)
+ )
- def __repr__(self) ->str:
+ def __repr__(self) -> str:
return (
- f'Link(url={self.url!r}, text={self.text!r}, fragment={self.fragment!r}, nofollow={self.nofollow!r})'
- )
+ f"Link(url={self.url!r}, text={self.text!r}, "
+ f"fragment={self.fragment!r}, nofollow={self.nofollow!r})"
+ )
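Note: Link instances are plain value objects; equality and hashing use all four slots, which is what lets link extractors deduplicate them. For instance:

    from scrapy.link import Link

    a = Link("https://example.com/page", text="Next", fragment="top", nofollow=True)
    b = Link("https://example.com/page", text="Next", fragment="top", nofollow=True)
    assert a == b and len({a, b}) == 1   # same value, same hash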
diff --git a/scrapy/linkextractors/lxmlhtml.py b/scrapy/linkextractors/lxmlhtml.py
index 4c22c2389..de032fdd8 100644
--- a/scrapy/linkextractors/lxmlhtml.py
+++ b/scrapy/linkextractors/lxmlhtml.py
@@ -5,69 +5,226 @@ import logging
import operator
from functools import partial
from urllib.parse import urljoin, urlparse
-from lxml import etree
+
+from lxml import etree # nosec
from parsel.csstranslator import HTMLTranslator
from w3lib.html import strip_html5_whitespace
from w3lib.url import canonicalize_url, safe_url_string
+
from scrapy.link import Link
-from scrapy.linkextractors import IGNORED_EXTENSIONS, _is_valid_url, _matches, _re_type, re
+from scrapy.linkextractors import (
+ IGNORED_EXTENSIONS,
+ _is_valid_url,
+ _matches,
+ _re_type,
+ re,
+)
from scrapy.utils.misc import arg_to_iter, rel_has_nofollow
from scrapy.utils.python import unique as unique_list
from scrapy.utils.response import get_base_url
from scrapy.utils.url import url_has_any_extension, url_is_from_any_domain
+
logger = logging.getLogger(__name__)
-XHTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'
-_collect_string_content = etree.XPath('string()')
+# from lxml/src/lxml/html/__init__.py
+XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
+
+_collect_string_content = etree.XPath("string()")
+
+
+def _nons(tag):
+ if isinstance(tag, str):
+ if tag[0] == "{" and tag[1 : len(XHTML_NAMESPACE) + 1] == XHTML_NAMESPACE:
+ return tag.split("}")[-1]
+ return tag
-class LxmlParserLinkExtractor:
- def __init__(self, tag='a', attr='href', process=None, unique=False,
- strip=True, canonicalized=False):
+def _identity(x):
+ return x
+
+
+def _canonicalize_link_url(link):
+ return canonicalize_url(link.url, keep_fragments=True)
+
+
+class LxmlParserLinkExtractor:
+ def __init__(
+ self,
+ tag="a",
+ attr="href",
+ process=None,
+ unique=False,
+ strip=True,
+ canonicalized=False,
+ ):
self.scan_tag = tag if callable(tag) else partial(operator.eq, tag)
self.scan_attr = attr if callable(attr) else partial(operator.eq, attr)
self.process_attr = process if callable(process) else _identity
self.unique = unique
self.strip = strip
- self.link_key = operator.attrgetter('url'
- ) if canonicalized else _canonicalize_link_url
+ self.link_key = (
+ operator.attrgetter("url") if canonicalized else _canonicalize_link_url
+ )
+
+ def _iter_links(self, document):
+ for el in document.iter(etree.Element):
+ if not self.scan_tag(_nons(el.tag)):
+ continue
+ attribs = el.attrib
+ for attrib in attribs:
+ if not self.scan_attr(attrib):
+ continue
+ yield (el, attrib, attribs[attrib])
+
+ def _extract_links(self, selector, response_url, response_encoding, base_url):
+ links = []
+ # hacky way to get the underlying lxml parsed document
+ for el, attr, attr_val in self._iter_links(selector.root):
+ # pseudo lxml.html.HtmlElement.make_links_absolute(base_url)
+ try:
+ if self.strip:
+ attr_val = strip_html5_whitespace(attr_val)
+ attr_val = urljoin(base_url, attr_val)
+ except ValueError:
+ continue # skipping bogus links
+ else:
+ url = self.process_attr(attr_val)
+ if url is None:
+ continue
+ try:
+ url = safe_url_string(url, encoding=response_encoding)
+ except ValueError:
+ logger.debug(f"Skipping extraction of link with bad URL {url!r}")
+ continue
+
+ # to fix relative links after process_value
+ url = urljoin(response_url, url)
+ link = Link(
+ url,
+ _collect_string_content(el) or "",
+ nofollow=rel_has_nofollow(el.get("rel")),
+ )
+ links.append(link)
+ return self._deduplicate_if_needed(links)
+
+ def extract_links(self, response):
+ base_url = get_base_url(response)
+ return self._extract_links(
+ response.selector, response.url, response.encoding, base_url
+ )
def _process_links(self, links):
"""Normalize and filter extracted links
The subclass should override it if necessary
"""
- pass
+ return self._deduplicate_if_needed(links)
+
+ def _deduplicate_if_needed(self, links):
+ if self.unique:
+ return unique_list(links, key=self.link_key)
+ return links
class LxmlLinkExtractor:
_csstranslator = HTMLTranslator()
- def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
- restrict_xpaths=(), tags=('a', 'area'), attrs=('href',),
- canonicalize=False, unique=True, process_value=None,
- deny_extensions=None, restrict_css=(), strip=True, restrict_text=None):
+ def __init__(
+ self,
+ allow=(),
+ deny=(),
+ allow_domains=(),
+ deny_domains=(),
+ restrict_xpaths=(),
+ tags=("a", "area"),
+ attrs=("href",),
+ canonicalize=False,
+ unique=True,
+ process_value=None,
+ deny_extensions=None,
+ restrict_css=(),
+ strip=True,
+ restrict_text=None,
+ ):
tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
- self.link_extractor = LxmlParserLinkExtractor(tag=partial(operator.
- contains, tags), attr=partial(operator.contains, attrs), unique
- =unique, process=process_value, strip=strip, canonicalized=not
- canonicalize)
- self.allow_res = [(x if isinstance(x, _re_type) else re.compile(x)) for
- x in arg_to_iter(allow)]
- self.deny_res = [(x if isinstance(x, _re_type) else re.compile(x)) for
- x in arg_to_iter(deny)]
+ self.link_extractor = LxmlParserLinkExtractor(
+ tag=partial(operator.contains, tags),
+ attr=partial(operator.contains, attrs),
+ unique=unique,
+ process=process_value,
+ strip=strip,
+ canonicalized=not canonicalize,
+ )
+ self.allow_res = [
+ x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
+ ]
+ self.deny_res = [
+ x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)
+ ]
+
self.allow_domains = set(arg_to_iter(allow_domains))
self.deny_domains = set(arg_to_iter(deny_domains))
+
self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
- self.restrict_xpaths += tuple(map(self._csstranslator.css_to_xpath,
- arg_to_iter(restrict_css)))
+ self.restrict_xpaths += tuple(
+ map(self._csstranslator.css_to_xpath, arg_to_iter(restrict_css))
+ )
+
if deny_extensions is None:
deny_extensions = IGNORED_EXTENSIONS
self.canonicalize = canonicalize
- self.deny_extensions = {('.' + e) for e in arg_to_iter(deny_extensions)
- }
- self.restrict_text = [(x if isinstance(x, _re_type) else re.compile
- (x)) for x in arg_to_iter(restrict_text)]
+ self.deny_extensions = {"." + e for e in arg_to_iter(deny_extensions)}
+ self.restrict_text = [
+ x if isinstance(x, _re_type) else re.compile(x)
+ for x in arg_to_iter(restrict_text)
+ ]
+
+ def _link_allowed(self, link):
+ if not _is_valid_url(link.url):
+ return False
+ if self.allow_res and not _matches(link.url, self.allow_res):
+ return False
+ if self.deny_res and _matches(link.url, self.deny_res):
+ return False
+ parsed_url = urlparse(link.url)
+ if self.allow_domains and not url_is_from_any_domain(
+ parsed_url, self.allow_domains
+ ):
+ return False
+ if self.deny_domains and url_is_from_any_domain(parsed_url, self.deny_domains):
+ return False
+ if self.deny_extensions and url_has_any_extension(
+ parsed_url, self.deny_extensions
+ ):
+ return False
+ if self.restrict_text and not _matches(link.text, self.restrict_text):
+ return False
+ return True
+
+ def matches(self, url):
+ if self.allow_domains and not url_is_from_any_domain(url, self.allow_domains):
+ return False
+ if self.deny_domains and url_is_from_any_domain(url, self.deny_domains):
+ return False
+
+ allowed = (
+ (regex.search(url) for regex in self.allow_res)
+ if self.allow_res
+ else [True]
+ )
+ denied = (regex.search(url) for regex in self.deny_res) if self.deny_res else []
+ return any(allowed) and not any(denied)
+
+ def _process_links(self, links):
+ links = [x for x in links if self._link_allowed(x)]
+ if self.canonicalize:
+ for link in links:
+ link.url = canonicalize_url(link.url)
+ links = self.link_extractor._process_links(links)
+ return links
+
+ def _extract_links(self, *args, **kwargs):
+ return self.link_extractor._extract_links(*args, **kwargs)
def extract_links(self, response):
"""Returns a list of :class:`~scrapy.link.Link` objects from the
@@ -79,4 +236,17 @@ class LxmlLinkExtractor:
Duplicate links are omitted if the ``unique`` attribute is set to ``True``,
otherwise they are returned.
"""
- pass
+ base_url = get_base_url(response)
+ if self.restrict_xpaths:
+ docs = [
+ subdoc for x in self.restrict_xpaths for subdoc in response.xpath(x)
+ ]
+ else:
+ docs = [response.selector]
+ all_links = []
+ for doc in docs:
+ links = self._extract_links(doc, response.url, response.encoding, base_url)
+ all_links.extend(self._process_links(links))
+ if self.link_extractor.unique:
+ return unique_list(all_links, key=self.link_extractor.link_key)
+ return all_links
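Note: LxmlLinkExtractor (exposed publicly as scrapy.linkextractors.LinkExtractor) combines the parser above with the allow/deny/domain/extension filters in _link_allowed(). A sketch with hypothetical patterns and URLs:

    import scrapy
    from scrapy.linkextractors import LinkExtractor

    class CatalogueSpider(scrapy.Spider):
        name = "catalogue"                      # hypothetical
        start_urls = ["https://example.com/"]   # hypothetical

        link_extractor = LinkExtractor(
            allow=r"/catalogue/",            # URL must match at least one allow pattern
            deny=r"/accounts/login",         # ...and none of the deny patterns
            restrict_css="div.pagination",   # translated to XPath via HTMLTranslator
            unique=True,                     # deduplicate on the canonicalized URL
        )

        def parse(self, response):
            for link in self.link_extractor.extract_links(response):
                yield response.follow(link.url, callback=self.parse)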
diff --git a/scrapy/loader/common.py b/scrapy/loader/common.py
index f37e2fc91..3e8644e0c 100644
--- a/scrapy/loader/common.py
+++ b/scrapy/loader/common.py
@@ -1,6 +1,9 @@
"""Common functions used in Item Loaders code"""
+
import warnings
+
from itemloaders import common
+
from scrapy.utils.deprecate import ScrapyDeprecationWarning
@@ -8,4 +11,11 @@ def wrap_loader_context(function, context):
"""Wrap functions that receive loader_context to contain the context
"pre-loaded" and expose a interface that receives only one argument
"""
- pass
+ warnings.warn(
+ "scrapy.loader.common.wrap_loader_context has moved to a new library."
+ "Please update your reference to itemloaders.common.wrap_loader_context",
+ ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+
+ return common.wrap_loader_context(function, context)
diff --git a/scrapy/loader/processors.py b/scrapy/loader/processors.py
index 1b5404f6e..b82c6d5c7 100644
--- a/scrapy/loader/processors.py
+++ b/scrapy/loader/processors.py
@@ -4,10 +4,17 @@ This module provides some commonly used processors for Item Loaders.
See documentation in docs/topics/loaders.rst
"""
from itemloaders import processors
+
from scrapy.utils.deprecate import create_deprecated_class
-MapCompose = create_deprecated_class('MapCompose', processors.MapCompose)
-Compose = create_deprecated_class('Compose', processors.Compose)
-TakeFirst = create_deprecated_class('TakeFirst', processors.TakeFirst)
-Identity = create_deprecated_class('Identity', processors.Identity)
-SelectJmes = create_deprecated_class('SelectJmes', processors.SelectJmes)
-Join = create_deprecated_class('Join', processors.Join)
+
+MapCompose = create_deprecated_class("MapCompose", processors.MapCompose)
+
+Compose = create_deprecated_class("Compose", processors.Compose)
+
+TakeFirst = create_deprecated_class("TakeFirst", processors.TakeFirst)
+
+Identity = create_deprecated_class("Identity", processors.Identity)
+
+SelectJmes = create_deprecated_class("SelectJmes", processors.SelectJmes)
+
+Join = create_deprecated_class("Join", processors.Join)
diff --git a/scrapy/logformatter.py b/scrapy/logformatter.py
index e59eb2a97..d720b2f38 100644
--- a/scrapy/logformatter.py
+++ b/scrapy/logformatter.py
@@ -1,23 +1,29 @@
from __future__ import annotations
+
import logging
import os
from typing import TYPE_CHECKING, Any, Dict, Optional, Union
+
from twisted.python.failure import Failure
+
from scrapy import Request, Spider
from scrapy.http import Response
from scrapy.utils.request import referer_str
+
if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
from typing_extensions import Self
+
from scrapy.crawler import Crawler
-SCRAPEDMSG = 'Scraped from %(src)s' + os.linesep + '%(item)s'
-DROPPEDMSG = 'Dropped: %(exception)s' + os.linesep + '%(item)s'
-CRAWLEDMSG = (
- 'Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s'
- )
-ITEMERRORMSG = 'Error processing %(item)s'
-SPIDERERRORMSG = 'Spider error processing %(request)s (referer: %(referer)s)'
-DOWNLOADERRORMSG_SHORT = 'Error downloading %(request)s'
-DOWNLOADERRORMSG_LONG = 'Error downloading %(request)s: %(errmsg)s'
+
+
+SCRAPEDMSG = "Scraped from %(src)s" + os.linesep + "%(item)s"
+DROPPEDMSG = "Dropped: %(exception)s" + os.linesep + "%(item)s"
+CRAWLEDMSG = "Crawled (%(status)s) %(request)s%(request_flags)s (referer: %(referer)s)%(response_flags)s"
+ITEMERRORMSG = "Error processing %(item)s"
+SPIDERERRORMSG = "Spider error processing %(request)s (referer: %(referer)s)"
+DOWNLOADERRORMSG_SHORT = "Error downloading %(request)s"
+DOWNLOADERRORMSG_LONG = "Error downloading %(request)s: %(errmsg)s"
class LogFormatter:
@@ -58,43 +64,115 @@ class LogFormatter:
}
"""
- def crawled(self, request: Request, response: Response, spider: Spider
- ) ->dict:
+ def crawled(self, request: Request, response: Response, spider: Spider) -> dict:
"""Logs a message when the crawler finds a webpage."""
- pass
-
- def scraped(self, item: Any, response: Union[Response, Failure], spider:
- Spider) ->dict:
+ request_flags = f" {str(request.flags)}" if request.flags else ""
+ response_flags = f" {str(response.flags)}" if response.flags else ""
+ return {
+ "level": logging.DEBUG,
+ "msg": CRAWLEDMSG,
+ "args": {
+ "status": response.status,
+ "request": request,
+ "request_flags": request_flags,
+ "referer": referer_str(request),
+ "response_flags": response_flags,
+ # backward compatibility with Scrapy logformatter below 1.4 version
+ "flags": response_flags,
+ },
+ }
+
+ def scraped(
+ self, item: Any, response: Union[Response, Failure], spider: Spider
+ ) -> dict:
"""Logs a message when an item is scraped by a spider."""
- pass
-
- def dropped(self, item: Any, exception: BaseException, response:
- Response, spider: Spider) ->dict:
+ src: Any
+ if isinstance(response, Failure):
+ src = response.getErrorMessage()
+ else:
+ src = response
+ return {
+ "level": logging.DEBUG,
+ "msg": SCRAPEDMSG,
+ "args": {
+ "src": src,
+ "item": item,
+ },
+ }
+
+ def dropped(
+ self, item: Any, exception: BaseException, response: Response, spider: Spider
+ ) -> dict:
"""Logs a message when an item is dropped while it is passing through the item pipeline."""
- pass
-
- def item_error(self, item: Any, exception: BaseException, response:
- Response, spider: Spider) ->dict:
+ return {
+ "level": logging.WARNING,
+ "msg": DROPPEDMSG,
+ "args": {
+ "exception": exception,
+ "item": item,
+ },
+ }
+
+ def item_error(
+ self, item: Any, exception: BaseException, response: Response, spider: Spider
+ ) -> dict:
"""Logs a message when an item causes an error while it is passing
through the item pipeline.
.. versionadded:: 2.0
"""
- pass
-
- def spider_error(self, failure: Failure, request: Request, response:
- Union[Response, Failure], spider: Spider) ->dict:
+ return {
+ "level": logging.ERROR,
+ "msg": ITEMERRORMSG,
+ "args": {
+ "item": item,
+ },
+ }
+
+ def spider_error(
+ self,
+ failure: Failure,
+ request: Request,
+ response: Union[Response, Failure],
+ spider: Spider,
+ ) -> dict:
"""Logs an error message from a spider.
.. versionadded:: 2.0
"""
- pass
-
- def download_error(self, failure: Failure, request: Request, spider:
- Spider, errmsg: Optional[str]=None) ->dict:
+ return {
+ "level": logging.ERROR,
+ "msg": SPIDERERRORMSG,
+ "args": {
+ "request": request,
+ "referer": referer_str(request),
+ },
+ }
+
+ def download_error(
+ self,
+ failure: Failure,
+ request: Request,
+ spider: Spider,
+ errmsg: Optional[str] = None,
+ ) -> dict:
"""Logs a download error message from a spider (typically coming from
the engine).
.. versionadded:: 2.0
"""
- pass
+ args: Dict[str, Any] = {"request": request}
+ if errmsg:
+ msg = DOWNLOADERRORMSG_LONG
+ args["errmsg"] = errmsg
+ else:
+ msg = DOWNLOADERRORMSG_SHORT
+ return {
+ "level": logging.ERROR,
+ "msg": msg,
+ "args": args,
+ }
+
+ @classmethod
+ def from_crawler(cls, crawler: Crawler) -> Self:
+ return cls()
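Note: each LogFormatter method returns a dict of level/msg/args that Scrapy hands to the logging machinery, so projects can subclass it and point the LOG_FORMATTER setting at the subclass. A common tweak, mirroring the dropped() shape above (class and module names hypothetical):

    import logging
    from scrapy import logformatter

    class PoliteLogFormatter(logformatter.LogFormatter):
        def dropped(self, item, exception, response, spider):
            # demote dropped-item messages from WARNING to INFO
            return {
                "level": logging.INFO,
                "msg": logformatter.DROPPEDMSG,
                "args": {"exception": exception, "item": item},
            }

    # settings.py:
    # LOG_FORMATTER = "myproject.logformatters.PoliteLogFormatter"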
diff --git a/scrapy/mail.py b/scrapy/mail.py
index b149b42bb..237327451 100644
--- a/scrapy/mail.py
+++ b/scrapy/mail.py
@@ -11,20 +11,40 @@ from email.mime.nonmultipart import MIMENonMultipart
from email.mime.text import MIMEText
from email.utils import formatdate
from io import BytesIO
+
from twisted import version as twisted_version
from twisted.internet import defer, ssl
from twisted.python.versions import Version
+
from scrapy.utils.misc import arg_to_iter
from scrapy.utils.python import to_bytes
+
logger = logging.getLogger(__name__)
-COMMASPACE = ', '
-class MailSender:
+# Defined in the email.utils module, but undocumented:
+# https://github.com/python/cpython/blob/v3.9.0/Lib/email/utils.py#L42
+COMMASPACE = ", "
+
+
+def _to_bytes_or_none(text):
+ if text is None:
+ return None
+ return to_bytes(text)
+
- def __init__(self, smtphost='localhost', mailfrom='scrapy@localhost',
- smtpuser=None, smtppass=None, smtpport=25, smtptls=False, smtpssl=
- False, debug=False):
+class MailSender:
+ def __init__(
+ self,
+ smtphost="localhost",
+ mailfrom="scrapy@localhost",
+ smtpuser=None,
+ smtppass=None,
+ smtpport=25,
+ smtptls=False,
+ smtpssl=False,
+ debug=False,
+ ):
self.smtphost = smtphost
self.smtpport = smtpport
self.smtpuser = _to_bytes_or_none(smtpuser)
@@ -33,3 +53,156 @@ class MailSender:
self.smtpssl = smtpssl
self.mailfrom = mailfrom
self.debug = debug
+
+ @classmethod
+ def from_settings(cls, settings):
+ return cls(
+ smtphost=settings["MAIL_HOST"],
+ mailfrom=settings["MAIL_FROM"],
+ smtpuser=settings["MAIL_USER"],
+ smtppass=settings["MAIL_PASS"],
+ smtpport=settings.getint("MAIL_PORT"),
+ smtptls=settings.getbool("MAIL_TLS"),
+ smtpssl=settings.getbool("MAIL_SSL"),
+ )
+
+ def send(
+ self,
+ to,
+ subject,
+ body,
+ cc=None,
+ attachs=(),
+ mimetype="text/plain",
+ charset=None,
+ _callback=None,
+ ):
+ from twisted.internet import reactor
+
+ if attachs:
+ msg = MIMEMultipart()
+ else:
+ msg = MIMENonMultipart(*mimetype.split("/", 1))
+
+ to = list(arg_to_iter(to))
+ cc = list(arg_to_iter(cc))
+
+ msg["From"] = self.mailfrom
+ msg["To"] = COMMASPACE.join(to)
+ msg["Date"] = formatdate(localtime=True)
+ msg["Subject"] = subject
+ rcpts = to[:]
+ if cc:
+ rcpts.extend(cc)
+ msg["Cc"] = COMMASPACE.join(cc)
+
+ if attachs:
+ if charset:
+ msg.set_charset(charset)
+ msg.attach(MIMEText(body, "plain", charset or "us-ascii"))
+ for attach_name, mimetype, f in attachs:
+ part = MIMEBase(*mimetype.split("/"))
+ part.set_payload(f.read())
+ Encoders.encode_base64(part)
+ part.add_header(
+ "Content-Disposition", "attachment", filename=attach_name
+ )
+ msg.attach(part)
+ else:
+ msg.set_payload(body, charset)
+
+ if _callback:
+ _callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)
+
+ if self.debug:
+ logger.debug(
+ "Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s "
+ 'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
+ {
+ "mailto": to,
+ "mailcc": cc,
+ "mailsubject": subject,
+ "mailattachs": len(attachs),
+ },
+ )
+ return
+
+ dfd = self._sendmail(rcpts, msg.as_string().encode(charset or "utf-8"))
+ dfd.addCallbacks(
+ callback=self._sent_ok,
+ errback=self._sent_failed,
+ callbackArgs=[to, cc, subject, len(attachs)],
+ errbackArgs=[to, cc, subject, len(attachs)],
+ )
+ reactor.addSystemEventTrigger("before", "shutdown", lambda: dfd)
+ return dfd
+
+ def _sent_ok(self, result, to, cc, subject, nattachs):
+ logger.info(
+ "Mail sent OK: To=%(mailto)s Cc=%(mailcc)s "
+ 'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
+ {
+ "mailto": to,
+ "mailcc": cc,
+ "mailsubject": subject,
+ "mailattachs": nattachs,
+ },
+ )
+
+ def _sent_failed(self, failure, to, cc, subject, nattachs):
+ errstr = str(failure.value)
+ logger.error(
+ "Unable to send mail: To=%(mailto)s Cc=%(mailcc)s "
+ 'Subject="%(mailsubject)s" Attachs=%(mailattachs)d'
+ "- %(mailerr)s",
+ {
+ "mailto": to,
+ "mailcc": cc,
+ "mailsubject": subject,
+ "mailattachs": nattachs,
+ "mailerr": errstr,
+ },
+ )
+ return failure
+
+ def _sendmail(self, to_addrs, msg):
+ from twisted.internet import reactor
+
+ msg = BytesIO(msg)
+ d = defer.Deferred()
+
+ factory = self._create_sender_factory(to_addrs, msg, d)
+
+ if self.smtpssl:
+ reactor.connectSSL(
+ self.smtphost, self.smtpport, factory, ssl.ClientContextFactory()
+ )
+ else:
+ reactor.connectTCP(self.smtphost, self.smtpport, factory)
+
+ return d
+
+ def _create_sender_factory(self, to_addrs, msg, d):
+ from twisted.mail.smtp import ESMTPSenderFactory
+
+ factory_keywords = {
+ "heloFallback": True,
+ "requireAuthentication": False,
+ "requireTransportSecurity": self.smtptls,
+ }
+
+ # Newer versions of twisted require the hostname to use STARTTLS
+ if twisted_version >= Version("twisted", 21, 2, 0):
+ factory_keywords["hostname"] = self.smtphost
+
+ factory = ESMTPSenderFactory(
+ self.smtpuser,
+ self.smtppass,
+ self.mailfrom,
+ to_addrs,
+ msg,
+ d,
+ **factory_keywords
+ )
+ factory.noisy = False
+ return factory
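Note: MailSender.from_settings() reads the MAIL_* settings; send() builds the MIME message and returns a Deferred (or None in debug mode, where the mail is only logged). It is typically called from an extension or pipeline while the crawl, and hence the Twisted reactor, is running. A sketch with hypothetical host and addresses:

    from scrapy.mail import MailSender

    mailer = MailSender(smtphost="smtp.example.com", mailfrom="crawler@example.com")
    mailer.send(
        to=["ops@example.com"],
        cc=["team@example.com"],
        subject="Crawl finished",
        body="All feeds exported.",
    )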
diff --git a/scrapy/middleware.py b/scrapy/middleware.py
index 38f8c46ff..090588130 100644
--- a/scrapy/middleware.py
+++ b/scrapy/middleware.py
@@ -1,27 +1,110 @@
from __future__ import annotations
+
import logging
import pprint
from collections import defaultdict, deque
-from typing import TYPE_CHECKING, Any, Callable, Deque, Dict, Iterable, List, Optional, Tuple, Union, cast
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Deque,
+ Dict,
+ Iterable,
+ List,
+ Optional,
+ Tuple,
+ Union,
+ cast,
+)
+
from twisted.internet.defer import Deferred
+
from scrapy import Spider
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
from scrapy.utils.defer import process_chain, process_parallel
from scrapy.utils.misc import create_instance, load_object
+
if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
from typing_extensions import Self
+
from scrapy.crawler import Crawler
+
+
logger = logging.getLogger(__name__)
class MiddlewareManager:
"""Base class for implementing middleware managers"""
- component_name = 'foo middleware'
- def __init__(self, *middlewares: Any) ->None:
+ component_name = "foo middleware"
+
+ def __init__(self, *middlewares: Any) -> None:
self.middlewares = middlewares
- self.methods: Dict[str, Deque[Union[None, Callable, Tuple[Callable,
- Callable]]]] = defaultdict(deque)
+ # Only process_spider_output and process_spider_exception can be None.
+ # Only process_spider_output can be a tuple, and only until _async compatibility methods are removed.
+ self.methods: Dict[
+ str, Deque[Union[None, Callable, Tuple[Callable, Callable]]]
+ ] = defaultdict(deque)
for mw in middlewares:
self._add_middleware(mw)
+
+ @classmethod
+ def _get_mwlist_from_settings(cls, settings: Settings) -> List[Any]:
+ raise NotImplementedError
+
+ @classmethod
+ def from_settings(
+ cls, settings: Settings, crawler: Optional[Crawler] = None
+ ) -> Self:
+ mwlist = cls._get_mwlist_from_settings(settings)
+ middlewares = []
+ enabled = []
+ for clspath in mwlist:
+ try:
+ mwcls = load_object(clspath)
+ mw = create_instance(mwcls, settings, crawler)
+ middlewares.append(mw)
+ enabled.append(clspath)
+ except NotConfigured as e:
+ if e.args:
+ logger.warning(
+ "Disabled %(clspath)s: %(eargs)s",
+ {"clspath": clspath, "eargs": e.args[0]},
+ extra={"crawler": crawler},
+ )
+
+ logger.info(
+ "Enabled %(componentname)ss:\n%(enabledlist)s",
+ {
+ "componentname": cls.component_name,
+ "enabledlist": pprint.pformat(enabled),
+ },
+ extra={"crawler": crawler},
+ )
+ return cls(*middlewares)
+
+ @classmethod
+ def from_crawler(cls, crawler: Crawler) -> Self:
+ return cls.from_settings(crawler.settings, crawler)
+
+ def _add_middleware(self, mw: Any) -> None:
+ if hasattr(mw, "open_spider"):
+ self.methods["open_spider"].append(mw.open_spider)
+ if hasattr(mw, "close_spider"):
+ self.methods["close_spider"].appendleft(mw.close_spider)
+
+ def _process_parallel(self, methodname: str, obj: Any, *args: Any) -> Deferred:
+ methods = cast(Iterable[Callable], self.methods[methodname])
+ return process_parallel(methods, obj, *args)
+
+ def _process_chain(self, methodname: str, obj: Any, *args: Any) -> Deferred:
+ methods = cast(Iterable[Callable], self.methods[methodname])
+ return process_chain(methods, obj, *args)
+
+ def open_spider(self, spider: Spider) -> Deferred:
+ return self._process_parallel("open_spider", spider)
+
+ def close_spider(self, spider: Spider) -> Deferred:
+ return self._process_parallel("close_spider", spider)
diff --git a/scrapy/pipelines/files.py b/scrapy/pipelines/files.py
index 3c976dd8f..5c09ab37e 100644
--- a/scrapy/pipelines/files.py
+++ b/scrapy/pipelines/files.py
@@ -18,8 +18,10 @@ from os import PathLike
from pathlib import Path
from typing import DefaultDict, Optional, Set, Union
from urllib.parse import urlparse
+
from itemadapter import ItemAdapter
from twisted.internet import defer, threads
+
from scrapy.exceptions import IgnoreRequest, NotConfigured
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
@@ -32,23 +34,57 @@ from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import md5sum
from scrapy.utils.python import to_bytes
from scrapy.utils.request import referer_str
+
logger = logging.getLogger(__name__)
+def _to_string(path: Union[str, PathLike]) -> str:
+ return str(path) # convert a Path object to string
+
+
class FileException(Exception):
"""General media error exception"""
class FSFilesStore:
-
def __init__(self, basedir: Union[str, PathLike]):
basedir = _to_string(basedir)
- if '://' in basedir:
- basedir = basedir.split('://', 1)[1]
+ if "://" in basedir:
+ basedir = basedir.split("://", 1)[1]
self.basedir = basedir
self._mkdir(Path(self.basedir))
self.created_directories: DefaultDict[str, Set[str]] = defaultdict(set)
+ def persist_file(
+ self, path: Union[str, PathLike], buf, info, meta=None, headers=None
+ ):
+ absolute_path = self._get_filesystem_path(path)
+ self._mkdir(absolute_path.parent, info)
+ absolute_path.write_bytes(buf.getvalue())
+
+ def stat_file(self, path: Union[str, PathLike], info):
+ absolute_path = self._get_filesystem_path(path)
+ try:
+ last_modified = absolute_path.stat().st_mtime
+ except os.error:
+ return {}
+
+ with absolute_path.open("rb") as f:
+ checksum = md5sum(f)
+
+ return {"last_modified": last_modified, "checksum": checksum}
+
+ def _get_filesystem_path(self, path: Union[str, PathLike]) -> Path:
+ path_comps = _to_string(path).split("/")
+ return Path(self.basedir, *path_comps)
+
+ def _mkdir(self, dirname: Path, domain: Optional[str] = None):
+ seen = self.created_directories[domain] if domain else set()
+ if str(dirname) not in seen:
+ if not dirname.exists():
+ dirname.mkdir(parents=True)
+ seen.add(str(dirname))
+
class S3FilesStore:
AWS_ACCESS_KEY_ID = None
@@ -58,53 +94,170 @@ class S3FilesStore:
AWS_REGION_NAME = None
AWS_USE_SSL = None
AWS_VERIFY = None
- POLICY = 'private'
- HEADERS = {'Cache-Control': 'max-age=172800'}
+
+ POLICY = "private" # Overridden from settings.FILES_STORE_S3_ACL in FilesPipeline.from_settings
+ HEADERS = {
+ "Cache-Control": "max-age=172800",
+ }
def __init__(self, uri):
if not is_botocore_available():
- raise NotConfigured('missing botocore library')
+ raise NotConfigured("missing botocore library")
import botocore.session
+
session = botocore.session.get_session()
- self.s3_client = session.create_client('s3', aws_access_key_id=self
- .AWS_ACCESS_KEY_ID, aws_secret_access_key=self.
- AWS_SECRET_ACCESS_KEY, aws_session_token=self.AWS_SESSION_TOKEN,
- endpoint_url=self.AWS_ENDPOINT_URL, region_name=self.
- AWS_REGION_NAME, use_ssl=self.AWS_USE_SSL, verify=self.AWS_VERIFY)
- if not uri.startswith('s3://'):
+ self.s3_client = session.create_client(
+ "s3",
+ aws_access_key_id=self.AWS_ACCESS_KEY_ID,
+ aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
+ aws_session_token=self.AWS_SESSION_TOKEN,
+ endpoint_url=self.AWS_ENDPOINT_URL,
+ region_name=self.AWS_REGION_NAME,
+ use_ssl=self.AWS_USE_SSL,
+ verify=self.AWS_VERIFY,
+ )
+ if not uri.startswith("s3://"):
raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
- self.bucket, self.prefix = uri[5:].split('/', 1)
+ self.bucket, self.prefix = uri[5:].split("/", 1)
+
+ def stat_file(self, path, info):
+ def _onsuccess(boto_key):
+ checksum = boto_key["ETag"].strip('"')
+ last_modified = boto_key["LastModified"]
+ modified_stamp = time.mktime(last_modified.timetuple())
+ return {"checksum": checksum, "last_modified": modified_stamp}
+
+ return self._get_boto_key(path).addCallback(_onsuccess)
+
+ def _get_boto_key(self, path):
+ key_name = f"{self.prefix}{path}"
+ return threads.deferToThread(
+ self.s3_client.head_object, Bucket=self.bucket, Key=key_name
+ )
def persist_file(self, path, buf, info, meta=None, headers=None):
"""Upload file to S3 storage"""
- pass
+ key_name = f"{self.prefix}{path}"
+ buf.seek(0)
+ extra = self._headers_to_botocore_kwargs(self.HEADERS)
+ if headers:
+ extra.update(self._headers_to_botocore_kwargs(headers))
+ return threads.deferToThread(
+ self.s3_client.put_object,
+ Bucket=self.bucket,
+ Key=key_name,
+ Body=buf,
+ Metadata={k: str(v) for k, v in (meta or {}).items()},
+ ACL=self.POLICY,
+ **extra,
+ )
def _headers_to_botocore_kwargs(self, headers):
"""Convert headers to botocore keyword arguments."""
- pass
+ # This is required while we need to support both boto and botocore.
+ mapping = CaseInsensitiveDict(
+ {
+ "Content-Type": "ContentType",
+ "Cache-Control": "CacheControl",
+ "Content-Disposition": "ContentDisposition",
+ "Content-Encoding": "ContentEncoding",
+ "Content-Language": "ContentLanguage",
+ "Content-Length": "ContentLength",
+ "Content-MD5": "ContentMD5",
+ "Expires": "Expires",
+ "X-Amz-Grant-Full-Control": "GrantFullControl",
+ "X-Amz-Grant-Read": "GrantRead",
+ "X-Amz-Grant-Read-ACP": "GrantReadACP",
+ "X-Amz-Grant-Write-ACP": "GrantWriteACP",
+ "X-Amz-Object-Lock-Legal-Hold": "ObjectLockLegalHoldStatus",
+ "X-Amz-Object-Lock-Mode": "ObjectLockMode",
+ "X-Amz-Object-Lock-Retain-Until-Date": "ObjectLockRetainUntilDate",
+ "X-Amz-Request-Payer": "RequestPayer",
+ "X-Amz-Server-Side-Encryption": "ServerSideEncryption",
+ "X-Amz-Server-Side-Encryption-Aws-Kms-Key-Id": "SSEKMSKeyId",
+ "X-Amz-Server-Side-Encryption-Context": "SSEKMSEncryptionContext",
+ "X-Amz-Server-Side-Encryption-Customer-Algorithm": "SSECustomerAlgorithm",
+ "X-Amz-Server-Side-Encryption-Customer-Key": "SSECustomerKey",
+ "X-Amz-Server-Side-Encryption-Customer-Key-Md5": "SSECustomerKeyMD5",
+ "X-Amz-Storage-Class": "StorageClass",
+ "X-Amz-Tagging": "Tagging",
+ "X-Amz-Website-Redirect-Location": "WebsiteRedirectLocation",
+ }
+ )
+ extra = {}
+ for key, value in headers.items():
+ try:
+ kwarg = mapping[key]
+ except KeyError:
+ raise TypeError(f'Header "{key}" is not supported by botocore')
+ else:
+ extra[kwarg] = value
+ return extra
class GCSFilesStore:
GCS_PROJECT_ID = None
- CACHE_CONTROL = 'max-age=172800'
+
+ CACHE_CONTROL = "max-age=172800"
+
+ # The bucket's default object ACL will be applied to the object.
+ # Overridden from settings.FILES_STORE_GCS_ACL in FilesPipeline.from_settings.
POLICY = None
def __init__(self, uri):
from google.cloud import storage
+
client = storage.Client(project=self.GCS_PROJECT_ID)
- bucket, prefix = uri[5:].split('/', 1)
+ bucket, prefix = uri[5:].split("/", 1)
self.bucket = client.bucket(bucket)
self.prefix = prefix
- permissions = self.bucket.test_iam_permissions([
- 'storage.objects.get', 'storage.objects.create'])
- if 'storage.objects.get' not in permissions:
+ permissions = self.bucket.test_iam_permissions(
+ ["storage.objects.get", "storage.objects.create"]
+ )
+ if "storage.objects.get" not in permissions:
logger.warning(
- "No 'storage.objects.get' permission for GSC bucket %(bucket)s. Checking if files are up to date will be impossible. Files will be downloaded every time."
- , {'bucket': bucket})
- if 'storage.objects.create' not in permissions:
+ "No 'storage.objects.get' permission for GSC bucket %(bucket)s. "
+ "Checking if files are up to date will be impossible. Files will be downloaded every time.",
+ {"bucket": bucket},
+ )
+ if "storage.objects.create" not in permissions:
logger.error(
- "No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!"
- , {'bucket': bucket})
+ "No 'storage.objects.create' permission for GSC bucket %(bucket)s. Saving files will be impossible!",
+ {"bucket": bucket},
+ )
+
+ def stat_file(self, path, info):
+ def _onsuccess(blob):
+ if blob:
+ checksum = base64.b64decode(blob.md5_hash).hex()
+ last_modified = time.mktime(blob.updated.timetuple())
+ return {"checksum": checksum, "last_modified": last_modified}
+ return {}
+
+ blob_path = self._get_blob_path(path)
+ return threads.deferToThread(self.bucket.get_blob, blob_path).addCallback(
+ _onsuccess
+ )
+
+ def _get_content_type(self, headers):
+ if headers and "Content-Type" in headers:
+ return headers["Content-Type"]
+ return "application/octet-stream"
+
+ def _get_blob_path(self, path):
+ return self.prefix + path
+
+ def persist_file(self, path, buf, info, meta=None, headers=None):
+ blob_path = self._get_blob_path(path)
+ blob = self.bucket.blob(blob_path)
+ blob.cache_control = self.CACHE_CONTROL
+ blob.metadata = {k: str(v) for k, v in (meta or {}).items()}
+ return threads.deferToThread(
+ blob.upload_from_string,
+ data=buf.getvalue(),
+ content_type=self._get_content_type(headers),
+ predefined_acl=self.POLICY,
+ )
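
The GCSFilesStore above is wired up entirely through settings. A minimal sketch, with illustrative bucket, project and ACL values; `FilesPipeline.from_settings` (shown further down) copies `GCS_PROJECT_ID` and `FILES_STORE_GCS_ACL` onto the store class before the URI is parsed:

ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
FILES_STORE = "gs://my-bucket/files/"  # uri[5:] splits into bucket="my-bucket", prefix="files/"
GCS_PROJECT_ID = "my-gcp-project"      # becomes GCSFilesStore.GCS_PROJECT_ID
FILES_STORE_GCS_ACL = "publicRead"     # becomes GCSFilesStore.POLICY (passed as predefined_acl)
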
class FTPFilesStore:
@@ -113,7 +266,7 @@ class FTPFilesStore:
USE_ACTIVE_MODE = None
def __init__(self, uri):
- if not uri.startswith('ftp://'):
+ if not uri.startswith("ftp://"):
raise ValueError(f"Incorrect URI scheme in {uri}, expected 'ftp'")
u = urlparse(uri)
self.port = u.port
@@ -121,7 +274,39 @@ class FTPFilesStore:
self.port = int(u.port or 21)
self.username = u.username or self.FTP_USERNAME
self.password = u.password or self.FTP_PASSWORD
- self.basedir = u.path.rstrip('/')
+ self.basedir = u.path.rstrip("/")
+
+ def persist_file(self, path, buf, info, meta=None, headers=None):
+ path = f"{self.basedir}/{path}"
+ return threads.deferToThread(
+ ftp_store_file,
+ path=path,
+ file=buf,
+ host=self.host,
+ port=self.port,
+ username=self.username,
+ password=self.password,
+ use_active_mode=self.USE_ACTIVE_MODE,
+ )
+
+ def stat_file(self, path, info):
+ def _stat_file(path):
+ try:
+ ftp = FTP()
+ ftp.connect(self.host, self.port)
+ ftp.login(self.username, self.password)
+ if self.USE_ACTIVE_MODE:
+ ftp.set_pasv(False)
+ file_path = f"{self.basedir}/{path}"
+ last_modified = float(ftp.voidcmd(f"MDTM {file_path}")[4:].strip())
+ m = hashlib.md5()
+ ftp.retrbinary(f"RETR {file_path}", m.update)
+ return {"last_modified": last_modified, "checksum": m.hexdigest()}
+ # The file doesn't exist
+ except Exception:
+ return {}
+
+ return threads.deferToThread(_stat_file, path)
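
Likewise for the FTP store, a minimal settings sketch (host and path are illustrative); credentials fall back to the class attributes when the URI carries none, and the port defaults to 21:

ITEM_PIPELINES = {"scrapy.pipelines.files.FilesPipeline": 1}
FILES_STORE = "ftp://ftp.example.com/scrapy/files"  # host, port and basedir come from the URI
FTP_USER = "anonymous"              # copied onto FTPFilesStore.FTP_USERNAME by from_settings
FTP_PASSWORD = "guest"              # copied onto FTPFilesStore.FTP_PASSWORD
FEED_STORAGE_FTP_ACTIVE = False     # copied onto FTPFilesStore.USE_ACTIVE_MODE
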
class FilesPipeline(MediaPipeline):
@@ -142,30 +327,226 @@ class FilesPipeline(MediaPipeline):
refresh it in case of change.
"""
- MEDIA_NAME = 'file'
+
+ MEDIA_NAME = "file"
EXPIRES = 90
- STORE_SCHEMES = {'': FSFilesStore, 'file': FSFilesStore, 's3':
- S3FilesStore, 'gs': GCSFilesStore, 'ftp': FTPFilesStore}
- DEFAULT_FILES_URLS_FIELD = 'file_urls'
- DEFAULT_FILES_RESULT_FIELD = 'files'
+ STORE_SCHEMES = {
+ "": FSFilesStore,
+ "file": FSFilesStore,
+ "s3": S3FilesStore,
+ "gs": GCSFilesStore,
+ "ftp": FTPFilesStore,
+ }
+ DEFAULT_FILES_URLS_FIELD = "file_urls"
+ DEFAULT_FILES_RESULT_FIELD = "files"
def __init__(self, store_uri, download_func=None, settings=None):
store_uri = _to_string(store_uri)
if not store_uri:
raise NotConfigured
+
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
- cls_name = 'FilesPipeline'
+ cls_name = "FilesPipeline"
self.store = self._get_store(store_uri)
- resolve = functools.partial(self._key_for_pipe, base_class_name=
- cls_name, settings=settings)
- self.expires = settings.getint(resolve('FILES_EXPIRES'), self.EXPIRES)
- if not hasattr(self, 'FILES_URLS_FIELD'):
+ resolve = functools.partial(
+ self._key_for_pipe, base_class_name=cls_name, settings=settings
+ )
+ self.expires = settings.getint(resolve("FILES_EXPIRES"), self.EXPIRES)
+ if not hasattr(self, "FILES_URLS_FIELD"):
self.FILES_URLS_FIELD = self.DEFAULT_FILES_URLS_FIELD
- if not hasattr(self, 'FILES_RESULT_FIELD'):
+ if not hasattr(self, "FILES_RESULT_FIELD"):
self.FILES_RESULT_FIELD = self.DEFAULT_FILES_RESULT_FIELD
- self.files_urls_field = settings.get(resolve('FILES_URLS_FIELD'),
- self.FILES_URLS_FIELD)
- self.files_result_field = settings.get(resolve('FILES_RESULT_FIELD'
- ), self.FILES_RESULT_FIELD)
+ self.files_urls_field = settings.get(
+ resolve("FILES_URLS_FIELD"), self.FILES_URLS_FIELD
+ )
+ self.files_result_field = settings.get(
+ resolve("FILES_RESULT_FIELD"), self.FILES_RESULT_FIELD
+ )
+
super().__init__(download_func=download_func, settings=settings)
+
+ @classmethod
+ def from_settings(cls, settings):
+ s3store = cls.STORE_SCHEMES["s3"]
+ s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
+ s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
+ s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
+ s3store.AWS_ENDPOINT_URL = settings["AWS_ENDPOINT_URL"]
+ s3store.AWS_REGION_NAME = settings["AWS_REGION_NAME"]
+ s3store.AWS_USE_SSL = settings["AWS_USE_SSL"]
+ s3store.AWS_VERIFY = settings["AWS_VERIFY"]
+ s3store.POLICY = settings["FILES_STORE_S3_ACL"]
+
+ gcs_store = cls.STORE_SCHEMES["gs"]
+ gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
+ gcs_store.POLICY = settings["FILES_STORE_GCS_ACL"] or None
+
+ ftp_store = cls.STORE_SCHEMES["ftp"]
+ ftp_store.FTP_USERNAME = settings["FTP_USER"]
+ ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
+ ftp_store.USE_ACTIVE_MODE = settings.getbool("FEED_STORAGE_FTP_ACTIVE")
+
+ store_uri = settings["FILES_STORE"]
+ return cls(store_uri, settings=settings)
+
+ def _get_store(self, uri: str):
+ if Path(uri).is_absolute(): # to support win32 paths like: C:\\some\dir
+ scheme = "file"
+ else:
+ scheme = urlparse(uri).scheme
+ store_cls = self.STORE_SCHEMES[scheme]
+ return store_cls(uri)
+
+ def media_to_download(self, request, info, *, item=None):
+ def _onsuccess(result):
+ if not result:
+ return # returning None forces download
+
+ last_modified = result.get("last_modified", None)
+ if not last_modified:
+ return # returning None forces download
+
+ age_seconds = time.time() - last_modified
+ age_days = age_seconds / 60 / 60 / 24
+ if age_days > self.expires:
+ return # returning None forces download
+
+ referer = referer_str(request)
+ logger.debug(
+ "File (uptodate): Downloaded %(medianame)s from %(request)s "
+ "referred in <%(referer)s>",
+ {"medianame": self.MEDIA_NAME, "request": request, "referer": referer},
+ extra={"spider": info.spider},
+ )
+ self.inc_stats(info.spider, "uptodate")
+
+ checksum = result.get("checksum", None)
+ return {
+ "url": request.url,
+ "path": path,
+ "checksum": checksum,
+ "status": "uptodate",
+ }
+
+ path = self.file_path(request, info=info, item=item)
+ dfd = defer.maybeDeferred(self.store.stat_file, path, info)
+ dfd.addCallbacks(_onsuccess, lambda _: None)
+ dfd.addErrback(
+ lambda f: logger.error(
+ self.__class__.__name__ + ".store.stat_file",
+ exc_info=failure_to_exc_info(f),
+ extra={"spider": info.spider},
+ )
+ )
+ return dfd
+
+ def media_failed(self, failure, request, info):
+ if not isinstance(failure.value, IgnoreRequest):
+ referer = referer_str(request)
+ logger.warning(
+ "File (unknown-error): Error downloading %(medianame)s from "
+ "%(request)s referred in <%(referer)s>: %(exception)s",
+ {
+ "medianame": self.MEDIA_NAME,
+ "request": request,
+ "referer": referer,
+ "exception": failure.value,
+ },
+ extra={"spider": info.spider},
+ )
+
+ raise FileException
+
+ def media_downloaded(self, response, request, info, *, item=None):
+ referer = referer_str(request)
+
+ if response.status != 200:
+ logger.warning(
+ "File (code: %(status)s): Error downloading file from "
+ "%(request)s referred in <%(referer)s>",
+ {"status": response.status, "request": request, "referer": referer},
+ extra={"spider": info.spider},
+ )
+ raise FileException("download-error")
+
+ if not response.body:
+ logger.warning(
+ "File (empty-content): Empty file from %(request)s referred "
+ "in <%(referer)s>: no-content",
+ {"request": request, "referer": referer},
+ extra={"spider": info.spider},
+ )
+ raise FileException("empty-content")
+
+ status = "cached" if "cached" in response.flags else "downloaded"
+ logger.debug(
+ "File (%(status)s): Downloaded file from %(request)s referred in "
+ "<%(referer)s>",
+ {"status": status, "request": request, "referer": referer},
+ extra={"spider": info.spider},
+ )
+ self.inc_stats(info.spider, status)
+
+ try:
+ path = self.file_path(request, response=response, info=info, item=item)
+ checksum = self.file_downloaded(response, request, info, item=item)
+ except FileException as exc:
+ logger.warning(
+ "File (error): Error processing file from %(request)s "
+ "referred in <%(referer)s>: %(errormsg)s",
+ {"request": request, "referer": referer, "errormsg": str(exc)},
+ extra={"spider": info.spider},
+ exc_info=True,
+ )
+ raise
+ except Exception as exc:
+ logger.error(
+ "File (unknown-error): Error processing file from %(request)s "
+ "referred in <%(referer)s>",
+ {"request": request, "referer": referer},
+ exc_info=True,
+ extra={"spider": info.spider},
+ )
+ raise FileException(str(exc))
+
+ return {
+ "url": request.url,
+ "path": path,
+ "checksum": checksum,
+ "status": status,
+ }
+
+ def inc_stats(self, spider, status):
+ spider.crawler.stats.inc_value("file_count", spider=spider)
+ spider.crawler.stats.inc_value(f"file_status_count/{status}", spider=spider)
+
+ # Overridable Interface
+ def get_media_requests(self, item, info):
+ urls = ItemAdapter(item).get(self.files_urls_field, [])
+ return [Request(u, callback=NO_CALLBACK) for u in urls]
+
+ def file_downloaded(self, response, request, info, *, item=None):
+ path = self.file_path(request, response=response, info=info, item=item)
+ buf = BytesIO(response.body)
+ checksum = md5sum(buf)
+ buf.seek(0)
+ self.store.persist_file(path, buf, info)
+ return checksum
+
+ def item_completed(self, results, item, info):
+ with suppress(KeyError):
+ ItemAdapter(item)[self.files_result_field] = [x for ok, x in results if ok]
+ return item
+
+ def file_path(self, request, response=None, info=None, *, item=None):
+ media_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+ media_ext = Path(request.url).suffix
+ # Handles empty and wild extensions by trying to guess the
+ # mime type, then the extension, defaulting to an empty string otherwise
+ if media_ext not in mimetypes.types_map:
+ media_ext = ""
+ media_type = mimetypes.guess_type(request.url)[0]
+ if media_type:
+ media_ext = mimetypes.guess_extension(media_type)
+ return f"full/{media_guid}{media_ext}"
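
The default `file_path` above buckets every download under `full/<sha1-of-url><guessed-extension>`. A minimal sketch of a subclass that keeps the original file name instead (the class name is hypothetical; it is enabled the usual way through `ITEM_PIPELINES` and `FILES_STORE`):

from pathlib import PurePosixPath
from urllib.parse import urlparse

from scrapy.pipelines.files import FilesPipeline


class NamedFilesPipeline(FilesPipeline):
    """Illustrative subclass: store files as full/<original-name> instead of full/<sha1><ext>."""

    def file_path(self, request, response=None, info=None, *, item=None):
        # e.g. https://example.com/reports/2024.pdf -> full/2024.pdf
        return "full/" + PurePosixPath(urlparse(request.url).path).name
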
diff --git a/scrapy/pipelines/images.py b/scrapy/pipelines/images.py
index 4c93a08c0..9d18144ee 100644
--- a/scrapy/pipelines/images.py
+++ b/scrapy/pipelines/images.py
@@ -8,11 +8,15 @@ import hashlib
import warnings
from contextlib import suppress
from io import BytesIO
+
from itemadapter import ItemAdapter
+
from scrapy.exceptions import DropItem, NotConfigured, ScrapyDeprecationWarning
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FileException, FilesPipeline
+
+# TODO: from scrapy.pipelines.media import MediaPipeline
from scrapy.settings import Settings
from scrapy.utils.misc import md5sum
from scrapy.utils.python import get_func_args, to_bytes
@@ -22,8 +26,11 @@ class NoimagesDrop(DropItem):
"""Product with no images exception"""
def __init__(self, *args, **kwargs):
- warnings.warn('The NoimagesDrop class is deprecated', category=
- ScrapyDeprecationWarning, stacklevel=2)
+ warnings.warn(
+ "The NoimagesDrop class is deprecated",
+ category=ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
super().__init__(*args, **kwargs)
@@ -33,39 +40,192 @@ class ImageException(FileException):
class ImagesPipeline(FilesPipeline):
"""Abstract pipeline that implement the image thumbnail generation logic"""
- MEDIA_NAME = 'image'
+
+ MEDIA_NAME = "image"
+
+ # Uppercase attributes kept for backward compatibility with code that subclasses
+ # ImagesPipeline. They may be overridden by settings.
MIN_WIDTH = 0
MIN_HEIGHT = 0
EXPIRES = 90
THUMBS = {}
- DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
- DEFAULT_IMAGES_RESULT_FIELD = 'images'
+ DEFAULT_IMAGES_URLS_FIELD = "image_urls"
+ DEFAULT_IMAGES_RESULT_FIELD = "images"
def __init__(self, store_uri, download_func=None, settings=None):
try:
from PIL import Image
+
self._Image = Image
except ImportError:
raise NotConfigured(
- 'ImagesPipeline requires installing Pillow 4.0.0 or later')
- super().__init__(store_uri, settings=settings, download_func=
- download_func)
+ "ImagesPipeline requires installing Pillow 4.0.0 or later"
+ )
+
+ super().__init__(store_uri, settings=settings, download_func=download_func)
+
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
- resolve = functools.partial(self._key_for_pipe, base_class_name=
- 'ImagesPipeline', settings=settings)
- self.expires = settings.getint(resolve('IMAGES_EXPIRES'), self.EXPIRES)
- if not hasattr(self, 'IMAGES_RESULT_FIELD'):
+
+ resolve = functools.partial(
+ self._key_for_pipe,
+ base_class_name="ImagesPipeline",
+ settings=settings,
+ )
+ self.expires = settings.getint(resolve("IMAGES_EXPIRES"), self.EXPIRES)
+
+ if not hasattr(self, "IMAGES_RESULT_FIELD"):
self.IMAGES_RESULT_FIELD = self.DEFAULT_IMAGES_RESULT_FIELD
- if not hasattr(self, 'IMAGES_URLS_FIELD'):
+ if not hasattr(self, "IMAGES_URLS_FIELD"):
self.IMAGES_URLS_FIELD = self.DEFAULT_IMAGES_URLS_FIELD
- self.images_urls_field = settings.get(resolve('IMAGES_URLS_FIELD'),
- self.IMAGES_URLS_FIELD)
- self.images_result_field = settings.get(resolve(
- 'IMAGES_RESULT_FIELD'), self.IMAGES_RESULT_FIELD)
- self.min_width = settings.getint(resolve('IMAGES_MIN_WIDTH'), self.
- MIN_WIDTH)
- self.min_height = settings.getint(resolve('IMAGES_MIN_HEIGHT'),
- self.MIN_HEIGHT)
- self.thumbs = settings.get(resolve('IMAGES_THUMBS'), self.THUMBS)
+
+ self.images_urls_field = settings.get(
+ resolve("IMAGES_URLS_FIELD"), self.IMAGES_URLS_FIELD
+ )
+ self.images_result_field = settings.get(
+ resolve("IMAGES_RESULT_FIELD"), self.IMAGES_RESULT_FIELD
+ )
+ self.min_width = settings.getint(resolve("IMAGES_MIN_WIDTH"), self.MIN_WIDTH)
+ self.min_height = settings.getint(resolve("IMAGES_MIN_HEIGHT"), self.MIN_HEIGHT)
+ self.thumbs = settings.get(resolve("IMAGES_THUMBS"), self.THUMBS)
+
self._deprecated_convert_image = None
+
+ @classmethod
+ def from_settings(cls, settings):
+ s3store = cls.STORE_SCHEMES["s3"]
+ s3store.AWS_ACCESS_KEY_ID = settings["AWS_ACCESS_KEY_ID"]
+ s3store.AWS_SECRET_ACCESS_KEY = settings["AWS_SECRET_ACCESS_KEY"]
+ s3store.AWS_SESSION_TOKEN = settings["AWS_SESSION_TOKEN"]
+ s3store.AWS_ENDPOINT_URL = settings["AWS_ENDPOINT_URL"]
+ s3store.AWS_REGION_NAME = settings["AWS_REGION_NAME"]
+ s3store.AWS_USE_SSL = settings["AWS_USE_SSL"]
+ s3store.AWS_VERIFY = settings["AWS_VERIFY"]
+ s3store.POLICY = settings["IMAGES_STORE_S3_ACL"]
+
+ gcs_store = cls.STORE_SCHEMES["gs"]
+ gcs_store.GCS_PROJECT_ID = settings["GCS_PROJECT_ID"]
+ gcs_store.POLICY = settings["IMAGES_STORE_GCS_ACL"] or None
+
+ ftp_store = cls.STORE_SCHEMES["ftp"]
+ ftp_store.FTP_USERNAME = settings["FTP_USER"]
+ ftp_store.FTP_PASSWORD = settings["FTP_PASSWORD"]
+ ftp_store.USE_ACTIVE_MODE = settings.getbool("FEED_STORAGE_FTP_ACTIVE")
+
+ store_uri = settings["IMAGES_STORE"]
+ return cls(store_uri, settings=settings)
+
+ def file_downloaded(self, response, request, info, *, item=None):
+ return self.image_downloaded(response, request, info, item=item)
+
+ def image_downloaded(self, response, request, info, *, item=None):
+ checksum = None
+ for path, image, buf in self.get_images(response, request, info, item=item):
+ if checksum is None:
+ buf.seek(0)
+ checksum = md5sum(buf)
+ width, height = image.size
+ self.store.persist_file(
+ path,
+ buf,
+ info,
+ meta={"width": width, "height": height},
+ headers={"Content-Type": "image/jpeg"},
+ )
+ return checksum
+
+ def get_images(self, response, request, info, *, item=None):
+ path = self.file_path(request, response=response, info=info, item=item)
+ orig_image = self._Image.open(BytesIO(response.body))
+
+ width, height = orig_image.size
+ if width < self.min_width or height < self.min_height:
+ raise ImageException(
+ "Image too small "
+ f"({width}x{height} < "
+ f"{self.min_width}x{self.min_height})"
+ )
+
+ if self._deprecated_convert_image is None:
+ self._deprecated_convert_image = "response_body" not in get_func_args(
+ self.convert_image
+ )
+ if self._deprecated_convert_image:
+ warnings.warn(
+ f"{self.__class__.__name__}.convert_image() method overridden in a deprecated way, "
+ "overridden method does not accept response_body argument.",
+ category=ScrapyDeprecationWarning,
+ )
+
+ if self._deprecated_convert_image:
+ image, buf = self.convert_image(orig_image)
+ else:
+ image, buf = self.convert_image(
+ orig_image, response_body=BytesIO(response.body)
+ )
+ yield path, image, buf
+
+ for thumb_id, size in self.thumbs.items():
+ thumb_path = self.thumb_path(
+ request, thumb_id, response=response, info=info, item=item
+ )
+ if self._deprecated_convert_image:
+ thumb_image, thumb_buf = self.convert_image(image, size)
+ else:
+ thumb_image, thumb_buf = self.convert_image(image, size, buf)
+ yield thumb_path, thumb_image, thumb_buf
+
+ def convert_image(self, image, size=None, response_body=None):
+ if response_body is None:
+ warnings.warn(
+ f"{self.__class__.__name__}.convert_image() method called in a deprecated way, "
+ "method called without response_body argument.",
+ category=ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+
+ if image.format in ("PNG", "WEBP") and image.mode == "RGBA":
+ background = self._Image.new("RGBA", image.size, (255, 255, 255))
+ background.paste(image, image)
+ image = background.convert("RGB")
+ elif image.mode == "P":
+ image = image.convert("RGBA")
+ background = self._Image.new("RGBA", image.size, (255, 255, 255))
+ background.paste(image, image)
+ image = background.convert("RGB")
+ elif image.mode != "RGB":
+ image = image.convert("RGB")
+
+ if size:
+ image = image.copy()
+ try:
+ # Image.Resampling.LANCZOS was added in Pillow 9.1.0
+ # remove this try except block,
+ # when updating the minimum requirements for Pillow.
+ resampling_filter = self._Image.Resampling.LANCZOS
+ except AttributeError:
+ resampling_filter = self._Image.ANTIALIAS
+ image.thumbnail(size, resampling_filter)
+ elif response_body is not None and image.format == "JPEG":
+ return image, response_body
+
+ buf = BytesIO()
+ image.save(buf, "JPEG")
+ return image, buf
+
+ def get_media_requests(self, item, info):
+ urls = ItemAdapter(item).get(self.images_urls_field, [])
+ return [Request(u, callback=NO_CALLBACK) for u in urls]
+
+ def item_completed(self, results, item, info):
+ with suppress(KeyError):
+ ItemAdapter(item)[self.images_result_field] = [x for ok, x in results if ok]
+ return item
+
+ def file_path(self, request, response=None, info=None, *, item=None):
+ image_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+ return f"full/{image_guid}.jpg"
+
+ def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
+ thumb_guid = hashlib.sha1(to_bytes(request.url)).hexdigest()
+ return f"thumbs/{thumb_id}/{thumb_guid}.jpg"
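
A minimal settings sketch (store path, sizes and thumbnail names are illustrative) exercising the options read in `ImagesPipeline.__init__` above; each thumbnail entry ends up under `thumbs/<name>/<sha1>.jpg` per `thumb_path`:

ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
IMAGES_STORE = "/tmp/images"   # absolute path, so _get_store() selects FSFilesStore
IMAGES_EXPIRES = 30            # days before a stored image is considered stale and re-downloaded
IMAGES_MIN_WIDTH = 110         # smaller images raise ImageException ("Image too small")
IMAGES_MIN_HEIGHT = 110
IMAGES_THUMBS = {
    "small": (50, 50),
    "big": (270, 270),
}
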
diff --git a/scrapy/pipelines/media.py b/scrapy/pipelines/media.py
index f88dbcfe8..153047acf 100644
--- a/scrapy/pipelines/media.py
+++ b/scrapy/pipelines/media.py
@@ -3,8 +3,10 @@ import logging
from collections import defaultdict
from inspect import signature
from warnings import warn
+
from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure
+
from scrapy.http.request import NO_CALLBACK
from scrapy.settings import Settings
from scrapy.utils.datatypes import SequenceExclude
@@ -12,15 +14,18 @@ from scrapy.utils.defer import defer_result, mustbe_deferred
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.log import failure_to_exc_info
from scrapy.utils.misc import arg_to_iter
+
logger = logging.getLogger(__name__)
+def _DUMMY_CALLBACK(response):
+ return response
+
+
class MediaPipeline:
LOG_FAILED_RESULTS = True
-
class SpiderInfo:
-
def __init__(self, spider):
self.spider = spider
self.downloading = set()
@@ -30,15 +35,23 @@ class MediaPipeline:
def __init__(self, download_func=None, settings=None):
self.download_func = download_func
self._expects_item = {}
+
if isinstance(settings, dict) or settings is None:
settings = Settings(settings)
- resolve = functools.partial(self._key_for_pipe, base_class_name=
- 'MediaPipeline', settings=settings)
- self.allow_redirects = settings.getbool(resolve(
- 'MEDIA_ALLOW_REDIRECTS'), False)
+ resolve = functools.partial(
+ self._key_for_pipe, base_class_name="MediaPipeline", settings=settings
+ )
+ self.allow_redirects = settings.getbool(resolve("MEDIA_ALLOW_REDIRECTS"), False)
self._handle_statuses(self.allow_redirects)
+
+ # Check if deprecated methods are being used and make them compatible
self._make_compatible()
+ def _handle_statuses(self, allow_redirects):
+ self.handle_httpstatus_list = None
+ if allow_redirects:
+ self.handle_httpstatus_list = SequenceExclude(range(300, 400))
+
def _key_for_pipe(self, key, base_class_name=None, settings=None):
"""
>>> MediaPipeline()._key_for_pipe("IMAGES")
@@ -48,16 +61,186 @@ class MediaPipeline:
>>> MyPipe()._key_for_pipe("IMAGES", base_class_name="MediaPipeline")
'MYPIPE_IMAGES'
"""
- pass
+ class_name = self.__class__.__name__
+ formatted_key = f"{class_name.upper()}_{key}"
+ if (
+ not base_class_name
+ or class_name == base_class_name
+ or settings
+ and not settings.get(formatted_key)
+ ):
+ return key
+ return formatted_key
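
The branch above means a prefixed key is only used when the pipeline is a subclass and the prefixed setting actually exists. A sketch under hypothetical names (subclass, module and values are illustrative):

# settings.py
ITEM_PIPELINES = {"myproject.pipelines.BookFilesPipeline": 1}
FILES_EXPIRES = 90
BOOKFILESPIPELINE_FILES_EXPIRES = 7  # picked up because the class name differs and the key is set

# myproject/pipelines.py
from scrapy.pipelines.files import FilesPipeline

class BookFilesPipeline(FilesPipeline):
    pass

# In FilesPipeline.__init__, resolve("FILES_EXPIRES") calls
# _key_for_pipe("FILES_EXPIRES", base_class_name="FilesPipeline", settings=settings),
# which returns "BOOKFILESPIPELINE_FILES_EXPIRES" here, so self.expires ends up as 7.
# Without the prefixed setting, the plain FILES_EXPIRES value (90) would be used.
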
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ try:
+ pipe = cls.from_settings(crawler.settings)
+ except AttributeError:
+ pipe = cls()
+ pipe.crawler = crawler
+ pipe._fingerprinter = crawler.request_fingerprinter
+ return pipe
+
+ def open_spider(self, spider):
+ self.spiderinfo = self.SpiderInfo(spider)
+
+ def process_item(self, item, spider):
+ info = self.spiderinfo
+ requests = arg_to_iter(self.get_media_requests(item, info))
+ dlist = [self._process_request(r, info, item) for r in requests]
+ dfd = DeferredList(dlist, consumeErrors=True)
+ return dfd.addCallback(self.item_completed, item, info)
+
+ def _process_request(self, request, info, item):
+ fp = self._fingerprinter.fingerprint(request)
+ if not request.callback or request.callback is NO_CALLBACK:
+ cb = _DUMMY_CALLBACK
+ else:
+ cb = request.callback
+ eb = request.errback
+ request.callback = NO_CALLBACK
+ request.errback = None
+
+ # Return cached result if request was already seen
+ if fp in info.downloaded:
+ return defer_result(info.downloaded[fp]).addCallbacks(cb, eb)
+
+ # Otherwise, wait for result
+ wad = Deferred().addCallbacks(cb, eb)
+ info.waiting[fp].append(wad)
+
+ # Check if request is downloading right now to avoid doing it twice
+ if fp in info.downloading:
+ return wad
+
+ # Download request checking media_to_download hook output first
+ info.downloading.add(fp)
+ dfd = mustbe_deferred(self.media_to_download, request, info, item=item)
+ dfd.addCallback(self._check_media_to_download, request, info, item=item)
+ dfd.addErrback(self._log_exception)
+ dfd.addBoth(self._cache_result_and_execute_waiters, fp, info)
+ return dfd.addBoth(lambda _: wad) # it must return wad at last
+
+ def _log_exception(self, result):
+ logger.exception(result)
+ return result
def _make_compatible(self):
"""Make overridable methods of MediaPipeline and subclasses backwards compatible"""
- pass
+ methods = [
+ "file_path",
+ "thumb_path",
+ "media_to_download",
+ "media_downloaded",
+ "file_downloaded",
+ "image_downloaded",
+ "get_images",
+ ]
+
+ for method_name in methods:
+ method = getattr(self, method_name, None)
+ if callable(method):
+ setattr(self, method_name, self._compatible(method))
def _compatible(self, func):
"""Wrapper for overridable methods to allow backwards compatibility"""
- pass
+ self._check_signature(func)
+
+ @functools.wraps(func)
+ def wrapper(*args, **kwargs):
+ if self._expects_item[func.__name__]:
+ return func(*args, **kwargs)
+
+ kwargs.pop("item", None)
+ return func(*args, **kwargs)
+
+ return wrapper
+
+ def _check_signature(self, func):
+ sig = signature(func)
+ self._expects_item[func.__name__] = True
+ if "item" not in sig.parameters:
+ old_params = str(sig)[1:-1]
+ new_params = old_params + ", *, item=None"
+ warn(
+ f"{func.__name__}(self, {old_params}) is deprecated, "
+ f"please use {func.__name__}(self, {new_params})",
+ ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+ self._expects_item[func.__name__] = False
+
+ def _modify_media_request(self, request):
+ if self.handle_httpstatus_list:
+ request.meta["handle_httpstatus_list"] = self.handle_httpstatus_list
+ else:
+ request.meta["handle_httpstatus_all"] = True
+
+ def _check_media_to_download(self, result, request, info, item):
+ if result is not None:
+ return result
+ if self.download_func:
+ # this ugly code was left only to support tests. TODO: remove
+ dfd = mustbe_deferred(self.download_func, request, info.spider)
+ dfd.addCallbacks(
+ callback=self.media_downloaded,
+ callbackArgs=(request, info),
+ callbackKeywords={"item": item},
+ errback=self.media_failed,
+ errbackArgs=(request, info),
+ )
+ else:
+ self._modify_media_request(request)
+ dfd = self.crawler.engine.download(request)
+ dfd.addCallbacks(
+ callback=self.media_downloaded,
+ callbackArgs=(request, info),
+ callbackKeywords={"item": item},
+ errback=self.media_failed,
+ errbackArgs=(request, info),
+ )
+ return dfd
+
+ def _cache_result_and_execute_waiters(self, result, fp, info):
+ if isinstance(result, Failure):
+ # minimize cached information for failure
+ result.cleanFailure()
+ result.frames = []
+ result.stack = None
+
+ # This code fixes a memory leak by avoiding keeping references to
+ # the Request and Response objects on the Media Pipeline cache.
+ #
+ # What happens when the media_downloaded callback raises an
+ # exception, for example a FileException('download-error') when
+ # the Response status code is not 200 OK, is that the original
+ # StopIteration exception (which in turn contains the failed
+ # Response and by extension, the original Request) gets encapsulated
+ # within the FileException context.
+ #
+ # Originally, Scrapy was using twisted.internet.defer.returnValue
+ # inside functions decorated with twisted.internet.defer.inlineCallbacks,
+ # encapsulating the returned Response in a _DefGen_Return exception
+ # instead of a StopIteration.
+ #
+ # To avoid keeping references to the Response and therefore Request
+ # objects on the Media Pipeline cache, we should wipe the context of
+ # the encapsulated exception when it is a StopIteration instance
+ #
+ # This problem does not occur in Python 2.7 since we don't have
+ # Exception Chaining (https://www.python.org/dev/peps/pep-3134/).
+ context = getattr(result.value, "__context__", None)
+ if isinstance(context, StopIteration):
+ setattr(result.value, "__context__", None)
+
+ info.downloading.remove(fp)
+ info.downloaded[fp] = result # cache result
+ for wad in info.waiting.pop(fp):
+ defer_result(result).chainDeferred(wad)
+
+ # Overridable Interface
def media_to_download(self, request, info, *, item=None):
"""Check request before starting download"""
pass
@@ -68,15 +251,24 @@ class MediaPipeline:
def media_downloaded(self, response, request, info, *, item=None):
"""Handler for success downloads"""
- pass
+ return response
def media_failed(self, failure, request, info):
"""Handler for failed downloads"""
- pass
+ return failure
def item_completed(self, results, item, info):
"""Called per item when all media requests have been processed"""
- pass
+ if self.LOG_FAILED_RESULTS:
+ for ok, value in results:
+ if not ok:
+ logger.error(
+ "%(class)s found errors processing %(item)s",
+ {"class": self.__class__.__name__, "item": item},
+ exc_info=failure_to_exc_info(value),
+ extra={"spider": info.spider},
+ )
+ return item
def file_path(self, request, response=None, info=None, *, item=None):
"""Returns the path where downloaded media should be stored"""
diff --git a/scrapy/pqueues.py b/scrapy/pqueues.py
index c9f7c822d..62a9af477 100644
--- a/scrapy/pqueues.py
+++ b/scrapy/pqueues.py
@@ -1,6 +1,8 @@
import hashlib
import logging
+
from scrapy.utils.misc import create_instance
+
logger = logging.getLogger(__name__)
@@ -15,7 +17,11 @@ def _path_safe(text):
>>> _path_safe('some@symbol?').startswith('some_symbol_')
True
"""
- pass
+ pathable_slot = "".join([c if c.isalnum() or c in "-._" else "_" for c in text])
+ # as we replace some letters we can get collisions for different slots,
+ # so we add a unique part
+ unique_slot = hashlib.md5(text.encode("utf8")).hexdigest()
+ return "-".join([pathable_slot, unique_slot])
class ScrapyPriorityQueue:
@@ -44,6 +50,10 @@ class ScrapyPriorityQueue:
"""
+ @classmethod
+ def from_crawler(cls, crawler, downstream_queue_cls, key, startprios=()):
+ return cls(crawler, downstream_queue_cls, key, startprios)
+
def __init__(self, crawler, downstream_queue_cls, key, startprios=()):
self.crawler = crawler
self.downstream_queue_cls = downstream_queue_cls
@@ -52,6 +62,47 @@ class ScrapyPriorityQueue:
self.curprio = None
self.init_prios(startprios)
+ def init_prios(self, startprios):
+ if not startprios:
+ return
+
+ for priority in startprios:
+ self.queues[priority] = self.qfactory(priority)
+
+ self.curprio = min(startprios)
+
+ def qfactory(self, key):
+ return create_instance(
+ self.downstream_queue_cls,
+ None,
+ self.crawler,
+ self.key + "/" + str(key),
+ )
+
+ def priority(self, request):
+ return -request.priority
+
+ def push(self, request):
+ priority = self.priority(request)
+ if priority not in self.queues:
+ self.queues[priority] = self.qfactory(priority)
+ q = self.queues[priority]
+ q.push(request) # this may fail (eg. serialization error)
+ if self.curprio is None or priority < self.curprio:
+ self.curprio = priority
+
+ def pop(self):
+ if self.curprio is None:
+ return
+ q = self.queues[self.curprio]
+ m = q.pop()
+ if not q:
+ del self.queues[self.curprio]
+ q.close()
+ prios = [p for p, q in self.queues.items() if q]
+ self.curprio = min(prios) if prios else None
+ return m
+
def peek(self):
"""Returns the next object to be returned by :meth:`pop`,
but without removing it from the queue.
@@ -59,20 +110,37 @@ class ScrapyPriorityQueue:
Raises :exc:`NotImplementedError` if the underlying queue class does
not implement a ``peek`` method, which is optional for queues.
"""
- pass
+ if self.curprio is None:
+ return None
+ queue = self.queues[self.curprio]
+ return queue.peek()
+
+ def close(self):
+ active = []
+ for p, q in self.queues.items():
+ active.append(p)
+ q.close()
+ return active
def __len__(self):
return sum(len(x) for x in self.queues.values()) if self.queues else 0
class DownloaderInterface:
-
def __init__(self, crawler):
self.downloader = crawler.engine.downloader
+ def stats(self, possible_slots):
+ return [(self._active_downloads(slot), slot) for slot in possible_slots]
+
+ def get_slot_key(self, request):
+ return self.downloader._get_slot_key(request, None)
+
def _active_downloads(self, slot):
"""Return a number of requests in a Downloader for a given slot"""
- pass
+ if slot not in self.downloader.slots:
+ return 0
+ return len(self.downloader.slots[slot].active)
class DownloaderAwarePriorityQueue:
@@ -81,23 +149,64 @@ class DownloaderAwarePriorityQueue:
first.
"""
+ @classmethod
+ def from_crawler(cls, crawler, downstream_queue_cls, key, startprios=()):
+ return cls(crawler, downstream_queue_cls, key, startprios)
+
def __init__(self, crawler, downstream_queue_cls, key, slot_startprios=()):
- if crawler.settings.getint('CONCURRENT_REQUESTS_PER_IP') != 0:
+ if crawler.settings.getint("CONCURRENT_REQUESTS_PER_IP") != 0:
raise ValueError(
f'"{self.__class__}" does not support CONCURRENT_REQUESTS_PER_IP'
- )
+ )
+
if slot_startprios and not isinstance(slot_startprios, dict):
raise ValueError(
- f'DownloaderAwarePriorityQueue accepts ``slot_startprios`` as a dict; {slot_startprios.__class__!r} instance is passed. Most likely, it means the state iscreated by an incompatible priority queue. Only a crawl started with the same priority queue class can be resumed.'
- )
+ "DownloaderAwarePriorityQueue accepts "
+ "``slot_startprios`` as a dict; "
+ f"{slot_startprios.__class__!r} instance "
+ "is passed. Most likely, it means the state "
+ "was created by an incompatible priority queue. "
+ "Only a crawl started with the same priority "
+ "queue class can be resumed."
+ )
+
self._downloader_interface = DownloaderInterface(crawler)
self.downstream_queue_cls = downstream_queue_cls
self.key = key
self.crawler = crawler
- self.pqueues = {}
+
+ self.pqueues = {} # slot -> priority queue
for slot, startprios in (slot_startprios or {}).items():
self.pqueues[slot] = self.pqfactory(slot, startprios)
+ def pqfactory(self, slot, startprios=()):
+ return ScrapyPriorityQueue(
+ self.crawler,
+ self.downstream_queue_cls,
+ self.key + "/" + _path_safe(slot),
+ startprios,
+ )
+
+ def pop(self):
+ stats = self._downloader_interface.stats(self.pqueues)
+
+ if not stats:
+ return
+
+ slot = min(stats)[1]
+ queue = self.pqueues[slot]
+ request = queue.pop()
+ if len(queue) == 0:
+ del self.pqueues[slot]
+ return request
+
+ def push(self, request):
+ slot = self._downloader_interface.get_slot_key(request)
+ if slot not in self.pqueues:
+ self.pqueues[slot] = self.pqfactory(slot)
+ queue = self.pqueues[slot]
+ queue.push(request)
+
def peek(self):
"""Returns the next object to be returned by :meth:`pop`,
but without removing it from the queue.
@@ -105,11 +214,20 @@ class DownloaderAwarePriorityQueue:
Raises :exc:`NotImplementedError` if the underlying queue class does
not implement a ``peek`` method, which is optional for queues.
"""
- pass
+ stats = self._downloader_interface.stats(self.pqueues)
+ if not stats:
+ return None
+ slot = min(stats)[1]
+ queue = self.pqueues[slot]
+ return queue.peek()
+
+ def close(self):
+ active = {slot: queue.close() for slot, queue in self.pqueues.items()}
+ self.pqueues.clear()
+ return active
def __len__(self):
- return sum(len(x) for x in self.pqueues.values()
- ) if self.pqueues else 0
+ return sum(len(x) for x in self.pqueues.values()) if self.pqueues else 0
def __contains__(self, slot):
return slot in self.pqueues
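
Both queues are selected through the scheduler. A minimal settings sketch switching to the downloader-aware queue; the per-IP restriction mirrors the `ValueError` raised in its `__init__` above:

SCHEDULER_PRIORITY_QUEUE = "scrapy.pqueues.DownloaderAwarePriorityQueue"
CONCURRENT_REQUESTS_PER_IP = 0       # must stay 0; per-IP limits are rejected by this queue
CONCURRENT_REQUESTS_PER_DOMAIN = 8   # the queue pops from the slot with the fewest active downloads
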
diff --git a/scrapy/resolver.py b/scrapy/resolver.py
index c5fa8c6de..e2e8beff4 100644
--- a/scrapy/resolver.py
+++ b/scrapy/resolver.py
@@ -1,9 +1,18 @@
from typing import Any
+
from twisted.internet import defer
from twisted.internet.base import ThreadedResolver
-from twisted.internet.interfaces import IHostnameResolver, IHostResolution, IResolutionReceiver, IResolverSimple
+from twisted.internet.interfaces import (
+ IHostnameResolver,
+ IHostResolution,
+ IResolutionReceiver,
+ IResolverSimple,
+)
from zope.interface.declarations import implementer, provider
+
from scrapy.utils.datatypes import LocalCache
+
+# TODO: cache misses
dnscache: LocalCache[str, Any] = LocalCache(10000)
@@ -18,22 +27,64 @@ class CachingThreadedResolver(ThreadedResolver):
dnscache.limit = cache_size
self.timeout = timeout
+ @classmethod
+ def from_crawler(cls, crawler, reactor):
+ if crawler.settings.getbool("DNSCACHE_ENABLED"):
+ cache_size = crawler.settings.getint("DNSCACHE_SIZE")
+ else:
+ cache_size = 0
+ return cls(reactor, cache_size, crawler.settings.getfloat("DNS_TIMEOUT"))
+
+ def install_on_reactor(self):
+ self.reactor.installResolver(self)
+
+ def getHostByName(self, name: str, timeout=None):
+ if name in dnscache:
+ return defer.succeed(dnscache[name])
+ # in Twisted<=16.6, getHostByName() is always called with
+ # a default timeout of 60s (actually passed as (1, 3, 11, 45) tuple),
+ # so the input argument above is simply overridden
+ # to enforce Scrapy's DNS_TIMEOUT setting's value
+ timeout = (self.timeout,)
+ d = super().getHostByName(name, timeout)
+ if dnscache.limit:
+ d.addCallback(self._cache_result, name)
+ return d
+
+ def _cache_result(self, result, name):
+ dnscache[name] = result
+ return result
+
@implementer(IHostResolution)
class HostResolution:
-
def __init__(self, name):
self.name = name
+ def cancel(self):
+ raise NotImplementedError()
+
@provider(IResolutionReceiver)
class _CachingResolutionReceiver:
-
def __init__(self, resolutionReceiver, hostName):
self.resolutionReceiver = resolutionReceiver
self.hostName = hostName
self.addresses = []
+ def resolutionBegan(self, resolution):
+ self.resolutionReceiver.resolutionBegan(resolution)
+ self.resolution = resolution
+
+ def addressResolved(self, address):
+ self.resolutionReceiver.addressResolved(address)
+ self.addresses.append(address)
+
+ def resolutionComplete(self):
+ self.resolutionReceiver.resolutionComplete()
+ if self.addresses:
+ dnscache[self.hostName] = self.addresses
+
@implementer(IHostnameResolver)
class CachingHostnameResolver:
@@ -46,3 +97,39 @@ class CachingHostnameResolver:
self.reactor = reactor
self.original_resolver = reactor.nameResolver
dnscache.limit = cache_size
+
+ @classmethod
+ def from_crawler(cls, crawler, reactor):
+ if crawler.settings.getbool("DNSCACHE_ENABLED"):
+ cache_size = crawler.settings.getint("DNSCACHE_SIZE")
+ else:
+ cache_size = 0
+ return cls(reactor, cache_size)
+
+ def install_on_reactor(self):
+ self.reactor.installNameResolver(self)
+
+ def resolveHostName(
+ self,
+ resolutionReceiver,
+ hostName: str,
+ portNumber=0,
+ addressTypes=None,
+ transportSemantics="TCP",
+ ):
+ try:
+ addresses = dnscache[hostName]
+ except KeyError:
+ return self.original_resolver.resolveHostName(
+ _CachingResolutionReceiver(resolutionReceiver, hostName),
+ hostName,
+ portNumber,
+ addressTypes,
+ transportSemantics,
+ )
+ else:
+ resolutionReceiver.resolutionBegan(HostResolution(hostName))
+ for addr in addresses:
+ resolutionReceiver.addressResolved(addr)
+ resolutionReceiver.resolutionComplete()
+ return resolutionReceiver
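
Both resolvers are driven by the same DNS settings; the values below are the defaults that appear further down in `default_settings.py`:

DNS_RESOLVER = "scrapy.resolver.CachingThreadedResolver"  # CachingHostnameResolver is the other option
DNSCACHE_ENABLED = True   # from_crawler() only sizes the cache when this is True
DNSCACHE_SIZE = 10000     # becomes dnscache.limit
DNS_TIMEOUT = 60          # seconds; overrides Twisted's default timeout in getHostByName()
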
diff --git a/scrapy/responsetypes.py b/scrapy/responsetypes.py
index 31e10e18d..9e411d4aa 100644
--- a/scrapy/responsetypes.py
+++ b/scrapy/responsetypes.py
@@ -6,69 +6,134 @@ from io import StringIO
from mimetypes import MimeTypes
from pkgutil import get_data
from typing import Dict, Mapping, Optional, Type, Union
+
from scrapy.http import Response
from scrapy.utils.misc import load_object
from scrapy.utils.python import binary_is_text, to_bytes, to_unicode
class ResponseTypes:
- CLASSES = {'text/html': 'scrapy.http.HtmlResponse',
- 'application/atom+xml': 'scrapy.http.XmlResponse',
- 'application/rdf+xml': 'scrapy.http.XmlResponse',
- 'application/rss+xml': 'scrapy.http.XmlResponse',
- 'application/xhtml+xml': 'scrapy.http.HtmlResponse',
- 'application/vnd.wap.xhtml+xml': 'scrapy.http.HtmlResponse',
- 'application/xml': 'scrapy.http.XmlResponse', 'application/json':
- 'scrapy.http.TextResponse', 'application/x-json':
- 'scrapy.http.TextResponse', 'application/json-amazonui-streaming':
- 'scrapy.http.TextResponse', 'application/javascript':
- 'scrapy.http.TextResponse', 'application/x-javascript':
- 'scrapy.http.TextResponse', 'text/xml': 'scrapy.http.XmlResponse',
- 'text/*': 'scrapy.http.TextResponse'}
+ CLASSES = {
+ "text/html": "scrapy.http.HtmlResponse",
+ "application/atom+xml": "scrapy.http.XmlResponse",
+ "application/rdf+xml": "scrapy.http.XmlResponse",
+ "application/rss+xml": "scrapy.http.XmlResponse",
+ "application/xhtml+xml": "scrapy.http.HtmlResponse",
+ "application/vnd.wap.xhtml+xml": "scrapy.http.HtmlResponse",
+ "application/xml": "scrapy.http.XmlResponse",
+ "application/json": "scrapy.http.TextResponse",
+ "application/x-json": "scrapy.http.TextResponse",
+ "application/json-amazonui-streaming": "scrapy.http.TextResponse",
+ "application/javascript": "scrapy.http.TextResponse",
+ "application/x-javascript": "scrapy.http.TextResponse",
+ "text/xml": "scrapy.http.XmlResponse",
+ "text/*": "scrapy.http.TextResponse",
+ }
- def __init__(self) ->None:
+ def __init__(self) -> None:
self.classes: Dict[str, Type[Response]] = {}
self.mimetypes: MimeTypes = MimeTypes()
- mimedata = get_data('scrapy', 'mime.types')
+ mimedata = get_data("scrapy", "mime.types")
if not mimedata:
raise ValueError(
- 'The mime.types file is not found in the Scrapy installation')
- self.mimetypes.readfp(StringIO(mimedata.decode('utf8')))
+ "The mime.types file is not found in the Scrapy installation"
+ )
+ self.mimetypes.readfp(StringIO(mimedata.decode("utf8")))
for mimetype, cls in self.CLASSES.items():
self.classes[mimetype] = load_object(cls)
- def from_mimetype(self, mimetype: str) ->Type[Response]:
+ def from_mimetype(self, mimetype: str) -> Type[Response]:
"""Return the most appropriate Response class for the given mimetype"""
- pass
+ if mimetype is None:
+ return Response
+ if mimetype in self.classes:
+ return self.classes[mimetype]
+ basetype = f"{mimetype.split('/')[0]}/*"
+ return self.classes.get(basetype, Response)
- def from_content_type(self, content_type: Union[str, bytes],
- content_encoding: Optional[bytes]=None) ->Type[Response]:
+ def from_content_type(
+ self, content_type: Union[str, bytes], content_encoding: Optional[bytes] = None
+ ) -> Type[Response]:
"""Return the most appropriate Response class from an HTTP Content-Type
header"""
- pass
+ if content_encoding:
+ return Response
+ mimetype = (
+ to_unicode(content_type, encoding="latin-1").split(";")[0].strip().lower()
+ )
+ return self.from_mimetype(mimetype)
+
+ def from_content_disposition(
+ self, content_disposition: Union[str, bytes]
+ ) -> Type[Response]:
+ try:
+ filename = (
+ to_unicode(content_disposition, encoding="latin-1", errors="replace")
+ .split(";")[1]
+ .split("=")[1]
+ .strip("\"'")
+ )
+ return self.from_filename(filename)
+ except IndexError:
+ return Response
- def from_headers(self, headers: Mapping[bytes, bytes]) ->Type[Response]:
+ def from_headers(self, headers: Mapping[bytes, bytes]) -> Type[Response]:
"""Return the most appropriate Response class by looking at the HTTP
headers"""
- pass
+ cls = Response
+ if b"Content-Type" in headers:
+ cls = self.from_content_type(
+ content_type=headers[b"Content-Type"],
+ content_encoding=headers.get(b"Content-Encoding"),
+ )
+ if cls is Response and b"Content-Disposition" in headers:
+ cls = self.from_content_disposition(headers[b"Content-Disposition"])
+ return cls
- def from_filename(self, filename: str) ->Type[Response]:
+ def from_filename(self, filename: str) -> Type[Response]:
"""Return the most appropriate Response class from a file name"""
- pass
+ mimetype, encoding = self.mimetypes.guess_type(filename)
+ if mimetype and not encoding:
+ return self.from_mimetype(mimetype)
+ return Response
- def from_body(self, body: bytes) ->Type[Response]:
+ def from_body(self, body: bytes) -> Type[Response]:
"""Try to guess the appropriate response based on the body content.
This method is a bit magic and could be improved in the future, but
it's not meant to be used except for special cases where response types
cannot be guessed using more straightforward methods."""
- pass
+ chunk = body[:5000]
+ chunk = to_bytes(chunk)
+ if not binary_is_text(chunk):
+ return self.from_mimetype("application/octet-stream")
+ lowercase_chunk = chunk.lower()
+ if b"<html>" in lowercase_chunk:
+ return self.from_mimetype("text/html")
+ if b"<?xml" in lowercase_chunk:
+ return self.from_mimetype("text/xml")
+ if b"<!doctype html>" in lowercase_chunk:
+ return self.from_mimetype("text/html")
+ return self.from_mimetype("text")
- def from_args(self, headers: Optional[Mapping[bytes, bytes]]=None, url:
- Optional[str]=None, filename: Optional[str]=None, body: Optional[
- bytes]=None) ->Type[Response]:
+ def from_args(
+ self,
+ headers: Optional[Mapping[bytes, bytes]] = None,
+ url: Optional[str] = None,
+ filename: Optional[str] = None,
+ body: Optional[bytes] = None,
+ ) -> Type[Response]:
"""Guess the most appropriate Response class based on
the given arguments."""
- pass
+ cls = Response
+ if headers is not None:
+ cls = self.from_headers(headers)
+ if cls is Response and url is not None:
+ cls = self.from_filename(url)
+ if cls is Response and filename is not None:
+ cls = self.from_filename(filename)
+ if cls is Response and body is not None:
+ cls = self.from_body(body)
+ return cls
responsetypes = ResponseTypes()
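
A quick interactive sketch of the `responsetypes` singleton defined above; the results follow directly from the `CLASSES` map, the filename guess and the `from_body` heuristics:

from scrapy.responsetypes import responsetypes

responsetypes.from_args(headers={b"Content-Type": b"text/html; charset=utf-8"})
# -> HtmlResponse
responsetypes.from_filename("feed.xml")
# -> XmlResponse
responsetypes.from_body(b"<!DOCTYPE html><html><body>hi</body></html>")
# -> HtmlResponse
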
diff --git a/scrapy/robotstxt.py b/scrapy/robotstxt.py
index a5916df9a..ea943c364 100644
--- a/scrapy/robotstxt.py
+++ b/scrapy/robotstxt.py
@@ -1,12 +1,32 @@
import logging
import sys
from abc import ABCMeta, abstractmethod
+
from scrapy.utils.python import to_unicode
+
logger = logging.getLogger(__name__)
-class RobotParser(metaclass=ABCMeta):
+def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
+ try:
+ if to_native_str_type:
+ robotstxt_body = to_unicode(robotstxt_body)
+ else:
+ robotstxt_body = robotstxt_body.decode("utf-8", errors="ignore")
+ except UnicodeDecodeError:
+ # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
+ # Switch to 'allow all' state.
+ logger.warning(
+ "Failure while parsing robots.txt. File either contains garbage or "
+ "is in an encoding other than UTF-8, treating it as an empty file.",
+ exc_info=sys.exc_info(),
+ extra={"spider": spider},
+ )
+ robotstxt_body = ""
+ return robotstxt_body
+
+class RobotParser(metaclass=ABCMeta):
@classmethod
@abstractmethod
def from_crawler(cls, crawler, robotstxt_body):
@@ -35,38 +55,81 @@ class RobotParser(metaclass=ABCMeta):
class PythonRobotParser(RobotParser):
-
def __init__(self, robotstxt_body, spider):
from urllib.robotparser import RobotFileParser
+
self.spider = spider
- robotstxt_body = decode_robotstxt(robotstxt_body, spider,
- to_native_str_type=True)
+ robotstxt_body = decode_robotstxt(
+ robotstxt_body, spider, to_native_str_type=True
+ )
self.rp = RobotFileParser()
self.rp.parse(robotstxt_body.splitlines())
+ @classmethod
+ def from_crawler(cls, crawler, robotstxt_body):
+ spider = None if not crawler else crawler.spider
+ o = cls(robotstxt_body, spider)
+ return o
-class ReppyRobotParser(RobotParser):
+ def allowed(self, url, user_agent):
+ user_agent = to_unicode(user_agent)
+ url = to_unicode(url)
+ return self.rp.can_fetch(user_agent, url)
+
+class ReppyRobotParser(RobotParser):
def __init__(self, robotstxt_body, spider):
from reppy.robots import Robots
+
self.spider = spider
- self.rp = Robots.parse('', robotstxt_body)
+ self.rp = Robots.parse("", robotstxt_body)
+ @classmethod
+ def from_crawler(cls, crawler, robotstxt_body):
+ spider = None if not crawler else crawler.spider
+ o = cls(robotstxt_body, spider)
+ return o
+
+ def allowed(self, url, user_agent):
+ return self.rp.allowed(url, user_agent)
-class RerpRobotParser(RobotParser):
+class RerpRobotParser(RobotParser):
def __init__(self, robotstxt_body, spider):
from robotexclusionrulesparser import RobotExclusionRulesParser
+
self.spider = spider
self.rp = RobotExclusionRulesParser()
robotstxt_body = decode_robotstxt(robotstxt_body, spider)
self.rp.parse(robotstxt_body)
+ @classmethod
+ def from_crawler(cls, crawler, robotstxt_body):
+ spider = None if not crawler else crawler.spider
+ o = cls(robotstxt_body, spider)
+ return o
-class ProtegoRobotParser(RobotParser):
+ def allowed(self, url, user_agent):
+ user_agent = to_unicode(user_agent)
+ url = to_unicode(url)
+ return self.rp.is_allowed(user_agent, url)
+
+class ProtegoRobotParser(RobotParser):
def __init__(self, robotstxt_body, spider):
from protego import Protego
+
self.spider = spider
robotstxt_body = decode_robotstxt(robotstxt_body, spider)
self.rp = Protego.parse(robotstxt_body)
+
+ @classmethod
+ def from_crawler(cls, crawler, robotstxt_body):
+ spider = None if not crawler else crawler.spider
+ o = cls(robotstxt_body, spider)
+ return o
+
+ def allowed(self, url, user_agent):
+ user_agent = to_unicode(user_agent)
+ url = to_unicode(url)
+ return self.rp.can_fetch(url, user_agent)
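
A short standalone sketch of the Protego-backed parser above (the robots.txt body is illustrative; `from_crawler` accepts `crawler=None`):

from scrapy.robotstxt import ProtegoRobotParser

body = b"User-agent: *\nDisallow: /private/"
parser = ProtegoRobotParser.from_crawler(crawler=None, robotstxt_body=body)
parser.allowed("https://example.com/private/page", "mybot")  # False
parser.allowed("https://example.com/index.html", "mybot")    # True
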
diff --git a/scrapy/selector/unified.py b/scrapy/selector/unified.py
index 0c13d190a..5ad4724c0 100644
--- a/scrapy/selector/unified.py
+++ b/scrapy/selector/unified.py
@@ -2,15 +2,30 @@
XPath selectors based on lxml
"""
from typing import Any, Optional, Type, Union
+
from parsel import Selector as _ParselSelector
+
from scrapy.http import HtmlResponse, TextResponse, XmlResponse
from scrapy.utils.python import to_bytes
from scrapy.utils.response import get_base_url
from scrapy.utils.trackref import object_ref
-__all__ = ['Selector', 'SelectorList']
+
+__all__ = ["Selector", "SelectorList"]
+
_NOT_SET = object()
+def _st(response: Optional[TextResponse], st: Optional[str]) -> str:
+ if st is None:
+ return "xml" if isinstance(response, XmlResponse) else "html"
+ return st
+
+
+def _response_from_text(text: Union[str, bytes], st: Optional[str]) -> TextResponse:
+ rt: Type[TextResponse] = XmlResponse if st == "xml" else HtmlResponse
+ return rt(url="about:blank", encoding="utf-8", body=to_bytes(text, "utf-8"))
+
+
class SelectorList(_ParselSelector.selectorlist_cls, object_ref):
"""
The :class:`SelectorList` class is a subclass of the builtin ``list``
@@ -48,23 +63,36 @@ class Selector(_ParselSelector, object_ref):
Otherwise, if ``type`` is set, the selector type will be forced and no
detection will occur.
"""
- __slots__ = ['response']
+
+ __slots__ = ["response"]
selectorlist_cls = SelectorList
- def __init__(self, response: Optional[TextResponse]=None, text:
- Optional[str]=None, type: Optional[str]=None, root: Optional[Any]=
- _NOT_SET, **kwargs: Any):
+ def __init__(
+ self,
+ response: Optional[TextResponse] = None,
+ text: Optional[str] = None,
+ type: Optional[str] = None,
+ root: Optional[Any] = _NOT_SET,
+ **kwargs: Any,
+ ):
if response is not None and text is not None:
raise ValueError(
- f'{self.__class__.__name__}.__init__() received both response and text'
- )
+ f"{self.__class__.__name__}.__init__() received "
+ "both response and text"
+ )
+
st = _st(response, type)
+
if text is not None:
response = _response_from_text(text, st)
+
if response is not None:
text = response.text
- kwargs.setdefault('base_url', get_base_url(response))
+ kwargs.setdefault("base_url", get_base_url(response))
+
self.response = response
+
if root is not _NOT_SET:
- kwargs['root'] = root
+ kwargs["root"] = root
+
super().__init__(text=text, type=st, **kwargs)
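
Finally, a minimal usage sketch of the `Selector` defined above; passing both `response` and `text` triggers the `ValueError` from `__init__`:

from scrapy.selector import Selector

sel = Selector(text="<html><body><span>good</span></body></html>")
sel.css("span::text").get()        # 'good'
sel.xpath("//span/text()").get()   # 'good'
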
diff --git a/scrapy/settings/default_settings.py b/scrapy/settings/default_settings.py
index 417c6e729..6affd2125 100644
--- a/scrapy/settings/default_settings.py
+++ b/scrapy/settings/default_settings.py
@@ -12,234 +12,327 @@ Scrapy developers, if you add a setting here remember to:
(docs/topics/settings.rst)
"""
+
import sys
from importlib import import_module
from pathlib import Path
+
ADDONS = {}
+
AJAXCRAWL_ENABLED = False
+
ASYNCIO_EVENT_LOOP = None
+
AUTOTHROTTLE_ENABLED = False
AUTOTHROTTLE_DEBUG = False
AUTOTHROTTLE_MAX_DELAY = 60.0
AUTOTHROTTLE_START_DELAY = 5.0
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
-BOT_NAME = 'scrapybot'
+
+BOT_NAME = "scrapybot"
+
CLOSESPIDER_TIMEOUT = 0
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ITEMCOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0
-COMMANDS_MODULE = ''
+
+COMMANDS_MODULE = ""
+
COMPRESSION_ENABLED = True
+
CONCURRENT_ITEMS = 100
+
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0
+
COOKIES_ENABLED = True
COOKIES_DEBUG = False
-DEFAULT_ITEM_CLASS = 'scrapy.item.Item'
-DEFAULT_REQUEST_HEADERS = {'Accept':
- 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language': 'en'}
+
+DEFAULT_ITEM_CLASS = "scrapy.item.Item"
+
+DEFAULT_REQUEST_HEADERS = {
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ "Accept-Language": "en",
+}
+
DEPTH_LIMIT = 0
DEPTH_STATS_VERBOSE = False
DEPTH_PRIORITY = 0
+
DNSCACHE_ENABLED = True
DNSCACHE_SIZE = 10000
-DNS_RESOLVER = 'scrapy.resolver.CachingThreadedResolver'
+DNS_RESOLVER = "scrapy.resolver.CachingThreadedResolver"
DNS_TIMEOUT = 60
+
DOWNLOAD_DELAY = 0
+
DOWNLOAD_HANDLERS = {}
-DOWNLOAD_HANDLERS_BASE = {'data':
- 'scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler',
- 'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
- 'http': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
- 'https': 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler',
- 's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler', 'ftp':
- 'scrapy.core.downloader.handlers.ftp.FTPDownloadHandler'}
-DOWNLOAD_TIMEOUT = 180
-DOWNLOAD_MAXSIZE = 1024 * 1024 * 1024
-DOWNLOAD_WARNSIZE = 32 * 1024 * 1024
+DOWNLOAD_HANDLERS_BASE = {
+ "data": "scrapy.core.downloader.handlers.datauri.DataURIDownloadHandler",
+ "file": "scrapy.core.downloader.handlers.file.FileDownloadHandler",
+ "http": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
+ "https": "scrapy.core.downloader.handlers.http.HTTPDownloadHandler",
+ "s3": "scrapy.core.downloader.handlers.s3.S3DownloadHandler",
+ "ftp": "scrapy.core.downloader.handlers.ftp.FTPDownloadHandler",
+}
+
+DOWNLOAD_TIMEOUT = 180 # 3mins
+
+DOWNLOAD_MAXSIZE = 1024 * 1024 * 1024 # 1024m
+DOWNLOAD_WARNSIZE = 32 * 1024 * 1024 # 32m
+
DOWNLOAD_FAIL_ON_DATALOSS = True
-DOWNLOADER = 'scrapy.core.downloader.Downloader'
+
+DOWNLOADER = "scrapy.core.downloader.Downloader"
+
DOWNLOADER_HTTPCLIENTFACTORY = (
- 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory')
+ "scrapy.core.downloader.webclient.ScrapyHTTPClientFactory"
+)
DOWNLOADER_CLIENTCONTEXTFACTORY = (
- 'scrapy.core.downloader.contextfactory.ScrapyClientContextFactory')
-DOWNLOADER_CLIENT_TLS_CIPHERS = 'DEFAULT'
-DOWNLOADER_CLIENT_TLS_METHOD = 'TLS'
+ "scrapy.core.downloader.contextfactory.ScrapyClientContextFactory"
+)
+DOWNLOADER_CLIENT_TLS_CIPHERS = "DEFAULT"
+# Use highest TLS/SSL protocol version supported by the platform, also allowing negotiation:
+DOWNLOADER_CLIENT_TLS_METHOD = "TLS"
DOWNLOADER_CLIENT_TLS_VERBOSE_LOGGING = False
+
DOWNLOADER_MIDDLEWARES = {}
+
DOWNLOADER_MIDDLEWARES_BASE = {
- 'scrapy.downloadermiddlewares.offsite.OffsiteMiddleware': 50,
- 'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
- 'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
- 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware':
- 350,
- 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware':
- 400, 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
- 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
- 'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
- 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
- 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware':
- 590, 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
- 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
- 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
- 'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
- 'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900}
+ # Engine side
+ "scrapy.downloadermiddlewares.offsite.OffsiteMiddleware": 50,
+ "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": 100,
+ "scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware": 300,
+ "scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware": 350,
+ "scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware": 400,
+ "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": 500,
+ "scrapy.downloadermiddlewares.retry.RetryMiddleware": 550,
+ "scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware": 560,
+ "scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware": 580,
+ "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 590,
+ "scrapy.downloadermiddlewares.redirect.RedirectMiddleware": 600,
+ "scrapy.downloadermiddlewares.cookies.CookiesMiddleware": 700,
+ "scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware": 750,
+ "scrapy.downloadermiddlewares.stats.DownloaderStats": 850,
+ "scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware": 900,
+ # Downloader side
+}
+
DOWNLOADER_STATS = True
-DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
-EDITOR = 'vi'
-if sys.platform == 'win32':
- EDITOR = '%s -m idlelib.idle'
+
+DUPEFILTER_CLASS = "scrapy.dupefilters.RFPDupeFilter"
+
+EDITOR = "vi"
+if sys.platform == "win32":
+ EDITOR = "%s -m idlelib.idle"
+
EXTENSIONS = {}
-EXTENSIONS_BASE = {'scrapy.extensions.corestats.CoreStats': 0,
- 'scrapy.extensions.telnet.TelnetConsole': 0,
- 'scrapy.extensions.memusage.MemoryUsage': 0,
- 'scrapy.extensions.memdebug.MemoryDebugger': 0,
- 'scrapy.extensions.closespider.CloseSpider': 0,
- 'scrapy.extensions.feedexport.FeedExporter': 0,
- 'scrapy.extensions.logstats.LogStats': 0,
- 'scrapy.extensions.spiderstate.SpiderState': 0,
- 'scrapy.extensions.throttle.AutoThrottle': 0}
+
+EXTENSIONS_BASE = {
+ "scrapy.extensions.corestats.CoreStats": 0,
+ "scrapy.extensions.telnet.TelnetConsole": 0,
+ "scrapy.extensions.memusage.MemoryUsage": 0,
+ "scrapy.extensions.memdebug.MemoryDebugger": 0,
+ "scrapy.extensions.closespider.CloseSpider": 0,
+ "scrapy.extensions.feedexport.FeedExporter": 0,
+ "scrapy.extensions.logstats.LogStats": 0,
+ "scrapy.extensions.spiderstate.SpiderState": 0,
+ "scrapy.extensions.throttle.AutoThrottle": 0,
+}
+
FEED_TEMPDIR = None
FEEDS = {}
-FEED_URI_PARAMS = None
+FEED_URI_PARAMS = None # a function to extend uri arguments
FEED_STORE_EMPTY = True
FEED_EXPORT_ENCODING = None
FEED_EXPORT_FIELDS = None
FEED_STORAGES = {}
-FEED_STORAGES_BASE = {'': 'scrapy.extensions.feedexport.FileFeedStorage',
- 'file': 'scrapy.extensions.feedexport.FileFeedStorage', 'ftp':
- 'scrapy.extensions.feedexport.FTPFeedStorage', 'gs':
- 'scrapy.extensions.feedexport.GCSFeedStorage', 's3':
- 'scrapy.extensions.feedexport.S3FeedStorage', 'stdout':
- 'scrapy.extensions.feedexport.StdoutFeedStorage'}
+FEED_STORAGES_BASE = {
+ "": "scrapy.extensions.feedexport.FileFeedStorage",
+ "file": "scrapy.extensions.feedexport.FileFeedStorage",
+ "ftp": "scrapy.extensions.feedexport.FTPFeedStorage",
+ "gs": "scrapy.extensions.feedexport.GCSFeedStorage",
+ "s3": "scrapy.extensions.feedexport.S3FeedStorage",
+ "stdout": "scrapy.extensions.feedexport.StdoutFeedStorage",
+}
FEED_EXPORT_BATCH_ITEM_COUNT = 0
FEED_EXPORTERS = {}
-FEED_EXPORTERS_BASE = {'json': 'scrapy.exporters.JsonItemExporter',
- 'jsonlines': 'scrapy.exporters.JsonLinesItemExporter', 'jsonl':
- 'scrapy.exporters.JsonLinesItemExporter', 'jl':
- 'scrapy.exporters.JsonLinesItemExporter', 'csv':
- 'scrapy.exporters.CsvItemExporter', 'xml':
- 'scrapy.exporters.XmlItemExporter', 'marshal':
- 'scrapy.exporters.MarshalItemExporter', 'pickle':
- 'scrapy.exporters.PickleItemExporter'}
+FEED_EXPORTERS_BASE = {
+ "json": "scrapy.exporters.JsonItemExporter",
+ "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
+ "jsonl": "scrapy.exporters.JsonLinesItemExporter",
+ "jl": "scrapy.exporters.JsonLinesItemExporter",
+ "csv": "scrapy.exporters.CsvItemExporter",
+ "xml": "scrapy.exporters.XmlItemExporter",
+ "marshal": "scrapy.exporters.MarshalItemExporter",
+ "pickle": "scrapy.exporters.PickleItemExporter",
+}
FEED_EXPORT_INDENT = 0
+
FEED_STORAGE_FTP_ACTIVE = False
-FEED_STORAGE_GCS_ACL = ''
-FEED_STORAGE_S3_ACL = ''
-FILES_STORE_S3_ACL = 'private'
-FILES_STORE_GCS_ACL = ''
-FTP_USER = 'anonymous'
-FTP_PASSWORD = 'guest'
+FEED_STORAGE_GCS_ACL = ""
+FEED_STORAGE_S3_ACL = ""
+
+FILES_STORE_S3_ACL = "private"
+FILES_STORE_GCS_ACL = ""
+
+FTP_USER = "anonymous"
+FTP_PASSWORD = "guest"
FTP_PASSIVE_MODE = True
+
GCS_PROJECT_ID = None
+
HTTPCACHE_ENABLED = False
-HTTPCACHE_DIR = 'httpcache'
+HTTPCACHE_DIR = "httpcache"
HTTPCACHE_IGNORE_MISSING = False
-HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
HTTPCACHE_EXPIRATION_SECS = 0
HTTPCACHE_ALWAYS_STORE = False
HTTPCACHE_IGNORE_HTTP_CODES = []
-HTTPCACHE_IGNORE_SCHEMES = ['file']
+HTTPCACHE_IGNORE_SCHEMES = ["file"]
HTTPCACHE_IGNORE_RESPONSE_CACHE_CONTROLS = []
-HTTPCACHE_DBM_MODULE = 'dbm'
-HTTPCACHE_POLICY = 'scrapy.extensions.httpcache.DummyPolicy'
+HTTPCACHE_DBM_MODULE = "dbm"
+HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
HTTPCACHE_GZIP = False
+
HTTPPROXY_ENABLED = True
-HTTPPROXY_AUTH_ENCODING = 'latin-1'
-IMAGES_STORE_S3_ACL = 'private'
-IMAGES_STORE_GCS_ACL = ''
-ITEM_PROCESSOR = 'scrapy.pipelines.ItemPipelineManager'
+HTTPPROXY_AUTH_ENCODING = "latin-1"
+
+IMAGES_STORE_S3_ACL = "private"
+IMAGES_STORE_GCS_ACL = ""
+
+ITEM_PROCESSOR = "scrapy.pipelines.ItemPipelineManager"
+
ITEM_PIPELINES = {}
ITEM_PIPELINES_BASE = {}
+
JOBDIR = None
+
LOG_ENABLED = True
-LOG_ENCODING = 'utf-8'
-LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
-LOG_FORMAT = '%(asctime)s [%(name)s] %(levelname)s: %(message)s'
-LOG_DATEFORMAT = '%Y-%m-%d %H:%M:%S'
+LOG_ENCODING = "utf-8"
+LOG_FORMATTER = "scrapy.logformatter.LogFormatter"
+LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
+LOG_DATEFORMAT = "%Y-%m-%d %H:%M:%S"
LOG_STDOUT = False
-LOG_LEVEL = 'DEBUG'
+LOG_LEVEL = "DEBUG"
LOG_FILE = None
LOG_FILE_APPEND = True
LOG_SHORT_NAMES = False
+
SCHEDULER_DEBUG = False
+
LOGSTATS_INTERVAL = 60.0
-MAIL_HOST = 'localhost'
+
+MAIL_HOST = "localhost"
MAIL_PORT = 25
-MAIL_FROM = 'scrapy@localhost'
+MAIL_FROM = "scrapy@localhost"
MAIL_PASS = None
MAIL_USER = None
-MEMDEBUG_ENABLED = False
-MEMDEBUG_NOTIFY = []
+
+MEMDEBUG_ENABLED = False # enable memory debugging
+MEMDEBUG_NOTIFY = [] # send memory debugging report by mail at engine shutdown
+
MEMUSAGE_CHECK_INTERVAL_SECONDS = 60.0
MEMUSAGE_ENABLED = True
MEMUSAGE_LIMIT_MB = 0
MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_WARNING_MB = 0
+
METAREFRESH_ENABLED = True
-METAREFRESH_IGNORE_TAGS = ['noscript']
+METAREFRESH_IGNORE_TAGS = ["noscript"]
METAREFRESH_MAXDELAY = 100
-NEWSPIDER_MODULE = ''
+
+NEWSPIDER_MODULE = ""
+
PERIODIC_LOG_DELTA = None
PERIODIC_LOG_STATS = None
PERIODIC_LOG_TIMING_ENABLED = False
+
RANDOMIZE_DOWNLOAD_DELAY = True
+
REACTOR_THREADPOOL_MAXSIZE = 10
+
REDIRECT_ENABLED = True
-REDIRECT_MAX_TIMES = 20
+REDIRECT_MAX_TIMES = 20 # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2
+
REFERER_ENABLED = True
-REFERRER_POLICY = 'scrapy.spidermiddlewares.referer.DefaultReferrerPolicy'
-REQUEST_FINGERPRINTER_CLASS = 'scrapy.utils.request.RequestFingerprinter'
-REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.6'
+REFERRER_POLICY = "scrapy.spidermiddlewares.referer.DefaultReferrerPolicy"
+
+REQUEST_FINGERPRINTER_CLASS = "scrapy.utils.request.RequestFingerprinter"
+REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.6"
+
RETRY_ENABLED = True
-RETRY_TIMES = 2
+RETRY_TIMES = 2 # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]
RETRY_PRIORITY_ADJUST = -1
-RETRY_EXCEPTIONS = ['twisted.internet.defer.TimeoutError',
- 'twisted.internet.error.TimeoutError',
- 'twisted.internet.error.DNSLookupError',
- 'twisted.internet.error.ConnectionRefusedError',
- 'twisted.internet.error.ConnectionDone',
- 'twisted.internet.error.ConnectError',
- 'twisted.internet.error.ConnectionLost',
- 'twisted.internet.error.TCPTimedOutError',
- 'twisted.web.client.ResponseFailed', OSError,
- 'scrapy.core.downloader.handlers.http11.TunnelError']
+RETRY_EXCEPTIONS = [
+ "twisted.internet.defer.TimeoutError",
+ "twisted.internet.error.TimeoutError",
+ "twisted.internet.error.DNSLookupError",
+ "twisted.internet.error.ConnectionRefusedError",
+ "twisted.internet.error.ConnectionDone",
+ "twisted.internet.error.ConnectError",
+ "twisted.internet.error.ConnectionLost",
+ "twisted.internet.error.TCPTimedOutError",
+ "twisted.web.client.ResponseFailed",
+ # OSError is raised by the HttpCompression middleware when trying to
+ # decompress an empty response
+ OSError,
+ "scrapy.core.downloader.handlers.http11.TunnelError",
+]
+
ROBOTSTXT_OBEY = False
-ROBOTSTXT_PARSER = 'scrapy.robotstxt.ProtegoRobotParser'
+ROBOTSTXT_PARSER = "scrapy.robotstxt.ProtegoRobotParser"
ROBOTSTXT_USER_AGENT = None
-SCHEDULER = 'scrapy.core.scheduler.Scheduler'
-SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
-SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
-SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.ScrapyPriorityQueue'
+
+SCHEDULER = "scrapy.core.scheduler.Scheduler"
+SCHEDULER_DISK_QUEUE = "scrapy.squeues.PickleLifoDiskQueue"
+SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue"
+SCHEDULER_PRIORITY_QUEUE = "scrapy.pqueues.ScrapyPriorityQueue"
+
SCRAPER_SLOT_MAX_ACTIVE_SIZE = 5000000
-SPIDER_LOADER_CLASS = 'scrapy.spiderloader.SpiderLoader'
+
+SPIDER_LOADER_CLASS = "scrapy.spiderloader.SpiderLoader"
SPIDER_LOADER_WARN_ONLY = False
+
SPIDER_MIDDLEWARES = {}
+
SPIDER_MIDDLEWARES_BASE = {
- 'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware': 50,
- 'scrapy.spidermiddlewares.referer.RefererMiddleware': 700,
- 'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware': 800,
- 'scrapy.spidermiddlewares.depth.DepthMiddleware': 900}
+ # Engine side
+ "scrapy.spidermiddlewares.httperror.HttpErrorMiddleware": 50,
+ "scrapy.spidermiddlewares.referer.RefererMiddleware": 700,
+ "scrapy.spidermiddlewares.urllength.UrlLengthMiddleware": 800,
+ "scrapy.spidermiddlewares.depth.DepthMiddleware": 900,
+ # Spider side
+}
+
SPIDER_MODULES = []
-STATS_CLASS = 'scrapy.statscollectors.MemoryStatsCollector'
+
+STATS_CLASS = "scrapy.statscollectors.MemoryStatsCollector"
STATS_DUMP = True
+
STATSMAILER_RCPTS = []
-TEMPLATES_DIR = str((Path(__file__).parent / '..' / 'templates').resolve())
+
+TEMPLATES_DIR = str((Path(__file__).parent / ".." / "templates").resolve())
+
URLLENGTH_LIMIT = 2083
-USER_AGENT = (
- f"Scrapy/{import_module('scrapy').__version__} (+https://scrapy.org)")
+
+USER_AGENT = f'Scrapy/{import_module("scrapy").__version__} (+https://scrapy.org)'
+
TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
-TELNETCONSOLE_HOST = '127.0.0.1'
-TELNETCONSOLE_USERNAME = 'scrapy'
+TELNETCONSOLE_HOST = "127.0.0.1"
+TELNETCONSOLE_USERNAME = "scrapy"
TELNETCONSOLE_PASSWORD = None
+
TWISTED_REACTOR = None
+
SPIDER_CONTRACTS = {}
-SPIDER_CONTRACTS_BASE = {'scrapy.contracts.default.UrlContract': 1,
- 'scrapy.contracts.default.CallbackKeywordArgumentsContract': 1,
- 'scrapy.contracts.default.ReturnsContract': 2,
- 'scrapy.contracts.default.ScrapesContract': 3}
+SPIDER_CONTRACTS_BASE = {
+ "scrapy.contracts.default.UrlContract": 1,
+ "scrapy.contracts.default.CallbackKeywordArgumentsContract": 1,
+ "scrapy.contracts.default.ReturnsContract": 2,
+ "scrapy.contracts.default.ScrapesContract": 3,
+}
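
The `*_BASE` dictionaries reformatted above hold the built-in component orderings; at runtime Scrapy merges them with the corresponding user-facing settings. A minimal project-level override sketch, assuming a hypothetical `myproject.middlewares` module:

# settings.py sketch (the myproject.* paths are placeholders).
# Entries here are merged with SPIDER_MIDDLEWARES_BASE / DOWNLOADER_MIDDLEWARES_BASE;
# mapping a built-in component to None disables it.
SPIDER_MIDDLEWARES = {
    "myproject.middlewares.MySpiderMiddleware": 543,
}
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.MyDownloaderMiddleware": 543,
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,  # turn off retries
}
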
diff --git a/scrapy/shell.py b/scrapy/shell.py
index f8a94309b..bb3b1461c 100644
--- a/scrapy/shell.py
+++ b/scrapy/shell.py
@@ -5,10 +5,12 @@ See documentation in docs/topics/shell.rst
"""
import os
import signal
+
from itemadapter import is_item
from twisted.internet import defer, threads
from twisted.python import threadable
from w3lib.url import any_to_uri
+
from scrapy.crawler import Crawler
from scrapy.exceptions import IgnoreRequest
from scrapy.http import Request, Response
@@ -23,21 +25,159 @@ from scrapy.utils.response import open_in_browser
class Shell:
- relevant_classes = Crawler, Spider, Request, Response, Settings
+ relevant_classes = (Crawler, Spider, Request, Response, Settings)
def __init__(self, crawler, update_vars=None, code=None):
self.crawler = crawler
self.update_vars = update_vars or (lambda x: None)
- self.item_class = load_object(crawler.settings['DEFAULT_ITEM_CLASS'])
+ self.item_class = load_object(crawler.settings["DEFAULT_ITEM_CLASS"])
self.spider = None
self.inthread = not threadable.isInIOThread()
self.code = code
self.vars = {}
+ def start(self, url=None, request=None, response=None, spider=None, redirect=True):
+ # disable accidental Ctrl-C key press from shutting down the engine
+ signal.signal(signal.SIGINT, signal.SIG_IGN)
+ if url:
+ self.fetch(url, spider, redirect=redirect)
+ elif request:
+ self.fetch(request, spider)
+ elif response:
+ request = response.request
+ self.populate_vars(response, request, spider)
+ else:
+ self.populate_vars()
+ if self.code:
+ print(eval(self.code, globals(), self.vars))
+ else:
+ """
+ Detect interactive shell setting in scrapy.cfg
+ e.g.: ~/.config/scrapy.cfg or ~/.scrapy.cfg
+ [settings]
+ # shell can be one of ipython, bpython or python;
+ # to be used as the interactive python console, if available.
+ # (default is ipython, fallbacks in the order listed above)
+ shell = python
+ """
+ cfg = get_config()
+ section, option = "settings", "shell"
+ env = os.environ.get("SCRAPY_PYTHON_SHELL")
+ shells = []
+ if env:
+ shells += env.strip().lower().split(",")
+ elif cfg.has_option(section, option):
+ shells += [cfg.get(section, option).strip().lower()]
+ else: # try all by default
+ shells += DEFAULT_PYTHON_SHELLS.keys()
+ # always add standard shell as fallback
+ shells += ["python"]
+ start_python_console(
+ self.vars, shells=shells, banner=self.vars.pop("banner", "")
+ )
+
+ def _schedule(self, request, spider):
+ if is_asyncio_reactor_installed():
+ # set the asyncio event loop for the current thread
+ event_loop_path = self.crawler.settings["ASYNCIO_EVENT_LOOP"]
+ set_asyncio_event_loop(event_loop_path)
+ spider = self._open_spider(request, spider)
+ d = _request_deferred(request)
+ d.addCallback(lambda x: (x, spider))
+ self.crawler.engine.crawl(request)
+ return d
+
+ def _open_spider(self, request, spider):
+ if self.spider:
+ return self.spider
+
+ if spider is None:
+ spider = self.crawler.spider or self.crawler._create_spider()
+
+ self.crawler.spider = spider
+ self.crawler.engine.open_spider(spider, close_if_idle=False)
+ self.spider = spider
+ return spider
+
+ def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
+ from twisted.internet import reactor
+
+ if isinstance(request_or_url, Request):
+ request = request_or_url
+ else:
+ url = any_to_uri(request_or_url)
+ request = Request(url, dont_filter=True, **kwargs)
+ if redirect:
+ request.meta["handle_httpstatus_list"] = SequenceExclude(
+ range(300, 400)
+ )
+ else:
+ request.meta["handle_httpstatus_all"] = True
+ response = None
+ try:
+ response, spider = threads.blockingCallFromThread(
+ reactor, self._schedule, request, spider
+ )
+ except IgnoreRequest:
+ pass
+ self.populate_vars(response, request, spider)
+
+ def populate_vars(self, response=None, request=None, spider=None):
+ import scrapy
+
+ self.vars["scrapy"] = scrapy
+ self.vars["crawler"] = self.crawler
+ self.vars["item"] = self.item_class()
+ self.vars["settings"] = self.crawler.settings
+ self.vars["spider"] = spider
+ self.vars["request"] = request
+ self.vars["response"] = response
+ if self.inthread:
+ self.vars["fetch"] = self.fetch
+ self.vars["view"] = open_in_browser
+ self.vars["shelp"] = self.print_help
+ self.update_vars(self.vars)
+ if not self.code:
+ self.vars["banner"] = self.get_help()
+
+ def print_help(self):
+ print(self.get_help())
+
+ def get_help(self):
+ b = []
+ b.append("Available Scrapy objects:")
+ b.append(
+ " scrapy scrapy module (contains scrapy.Request, scrapy.Selector, etc)"
+ )
+ for k, v in sorted(self.vars.items()):
+ if self._is_relevant(v):
+ b.append(f" {k:<10} {v}")
+ b.append("Useful shortcuts:")
+ if self.inthread:
+ b.append(
+ " fetch(url[, redirect=True]) "
+ "Fetch URL and update local objects (by default, redirects are followed)"
+ )
+ b.append(
+ " fetch(req) "
+ "Fetch a scrapy.Request and update local objects "
+ )
+ b.append(" shelp() Shell help (print this help)")
+ b.append(" view(response) View response in a browser")
+
+ return "\n".join(f"[s] {line}" for line in b)
+
+ def _is_relevant(self, value):
+ return isinstance(value, self.relevant_classes) or is_item(value)
+
def inspect_response(response, spider):
"""Open a shell to inspect the given response"""
- pass
+ # Shell.start removes the SIGINT handler, so save it and re-add it after
+ # the shell has closed
+ sigint_handler = signal.getsignal(signal.SIGINT)
+ Shell(spider.crawler).start(response=response, spider=spider)
+ signal.signal(signal.SIGINT, sigint_handler)
def _request_deferred(request):
@@ -51,4 +191,18 @@ def _request_deferred(request):
WARNING: Do not call request.replace() until after the deferred is called.
"""
- pass
+ request_callback = request.callback
+ request_errback = request.errback
+
+ def _restore_callbacks(result):
+ request.callback = request_callback
+ request.errback = request_errback
+ return result
+
+ d = defer.Deferred()
+ d.addBoth(_restore_callbacks)
+ if request.callback:
+ d.addCallbacks(request.callback, request.errback)
+
+ request.callback, request.errback = d.callback, d.errback
+ return d
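
The restored `inspect_response` helper is normally called from inside a spider callback; a minimal usage sketch (spider name and URL are illustrative):

import scrapy
from scrapy.shell import inspect_response

class DebugSpider(scrapy.Spider):
    name = "debug"
    start_urls = ["https://example.com"]

    def parse(self, response):
        # Opens an interactive shell with response, request, spider, etc.
        # pre-populated by Shell.populate_vars(), then restores SIGINT.
        inspect_response(response, self)
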
diff --git a/scrapy/signalmanager.py b/scrapy/signalmanager.py
index e85b12c05..f6df191d8 100644
--- a/scrapy/signalmanager.py
+++ b/scrapy/signalmanager.py
@@ -1,15 +1,16 @@
from typing import Any, List, Tuple
+
from pydispatch import dispatcher
from twisted.internet.defer import Deferred
+
from scrapy.utils import signal as _signal
class SignalManager:
-
- def __init__(self, sender: Any=dispatcher.Anonymous):
+ def __init__(self, sender: Any = dispatcher.Anonymous):
self.sender: Any = sender
- def connect(self, receiver: Any, signal: Any, **kwargs: Any) ->None:
+ def connect(self, receiver: Any, signal: Any, **kwargs: Any) -> None:
"""
Connect a receiver function to a signal.
@@ -23,27 +24,29 @@ class SignalManager:
:param signal: the signal to connect to
:type signal: object
"""
- pass
+ kwargs.setdefault("sender", self.sender)
+ dispatcher.connect(receiver, signal, **kwargs)
- def disconnect(self, receiver: Any, signal: Any, **kwargs: Any) ->None:
+ def disconnect(self, receiver: Any, signal: Any, **kwargs: Any) -> None:
"""
Disconnect a receiver function from a signal. This has the
opposite effect of the :meth:`connect` method, and the arguments
are the same.
"""
- pass
+ kwargs.setdefault("sender", self.sender)
+ dispatcher.disconnect(receiver, signal, **kwargs)
- def send_catch_log(self, signal: Any, **kwargs: Any) ->List[Tuple[Any, Any]
- ]:
+ def send_catch_log(self, signal: Any, **kwargs: Any) -> List[Tuple[Any, Any]]:
"""
Send a signal, catch exceptions and log them.
The keyword arguments are passed to the signal handlers (connected
through the :meth:`connect` method).
"""
- pass
+ kwargs.setdefault("sender", self.sender)
+ return _signal.send_catch_log(signal, **kwargs)
- def send_catch_log_deferred(self, signal: Any, **kwargs: Any) ->Deferred:
+ def send_catch_log_deferred(self, signal: Any, **kwargs: Any) -> Deferred:
"""
Like :meth:`send_catch_log` but supports returning
:class:`~twisted.internet.defer.Deferred` objects from signal handlers.
@@ -54,13 +57,15 @@ class SignalManager:
The keyword arguments are passed to the signal handlers (connected
through the :meth:`connect` method).
"""
- pass
+ kwargs.setdefault("sender", self.sender)
+ return _signal.send_catch_log_deferred(signal, **kwargs)
- def disconnect_all(self, signal: Any, **kwargs: Any) ->None:
+ def disconnect_all(self, signal: Any, **kwargs: Any) -> None:
"""
Disconnect all receivers from the given signal.
:param signal: the signal to disconnect from
:type signal: object
"""
- pass
+ kwargs.setdefault("sender", self.sender)
+ _signal.disconnect_all(signal, **kwargs)
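
The SignalManager methods above simply forward to pydispatch / scrapy.utils.signal with the configured sender. A sketch of how an extension typically uses them through `crawler.signals` (class name and stat key are illustrative):

from scrapy import signals

class ItemCounter:
    def __init__(self, crawler):
        self.crawler = crawler
        crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def item_scraped(self, item, response, spider):
        # Runs for every scraped item; exceptions raised in receivers are
        # caught and logged by the send_catch_log* helpers on the sender side.
        self.crawler.stats.inc_value("itemcounter/items_scraped")
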
diff --git a/scrapy/signals.py b/scrapy/signals.py
index 0d08d829c..0090f1c8b 100644
--- a/scrapy/signals.py
+++ b/scrapy/signals.py
@@ -4,6 +4,7 @@ Scrapy signals
These signals are documented in docs/topics/signals.rst. Please don't add new
signals here without documenting them there.
"""
+
engine_started = object()
engine_stopped = object()
spider_opened = object()
@@ -23,8 +24,12 @@ item_dropped = object()
item_error = object()
feed_slot_closed = object()
feed_exporter_closed = object()
+
+# for backward compatibility
stats_spider_opened = spider_opened
stats_spider_closing = spider_closed
stats_spider_closed = spider_closed
+
item_passed = item_scraped
+
request_received = request_scheduled
diff --git a/scrapy/spiderloader.py b/scrapy/spiderloader.py
index 9d53190bb..d855c962c 100644
--- a/scrapy/spiderloader.py
+++ b/scrapy/spiderloader.py
@@ -1,16 +1,21 @@
from __future__ import annotations
+
import traceback
import warnings
from collections import defaultdict
from types import ModuleType
from typing import TYPE_CHECKING, DefaultDict, Dict, List, Tuple, Type
+
from zope.interface import implementer
+
from scrapy import Request, Spider
from scrapy.interfaces import ISpiderLoader
from scrapy.settings import BaseSettings
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
+
if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
from typing_extensions import Self
@@ -22,28 +27,77 @@ class SpiderLoader:
"""
def __init__(self, settings: BaseSettings):
- self.spider_modules: List[str] = settings.getlist('SPIDER_MODULES')
- self.warn_only: bool = settings.getbool('SPIDER_LOADER_WARN_ONLY')
+ self.spider_modules: List[str] = settings.getlist("SPIDER_MODULES")
+ self.warn_only: bool = settings.getbool("SPIDER_LOADER_WARN_ONLY")
self._spiders: Dict[str, Type[Spider]] = {}
- self._found: DefaultDict[str, List[Tuple[str, str]]] = defaultdict(list
- )
+ self._found: DefaultDict[str, List[Tuple[str, str]]] = defaultdict(list)
self._load_all_spiders()
- def load(self, spider_name: str) ->Type[Spider]:
+ def _check_name_duplicates(self) -> None:
+ dupes = []
+ for name, locations in self._found.items():
+ dupes.extend(
+ [
+ f" {cls} named {name!r} (in {mod})"
+ for mod, cls in locations
+ if len(locations) > 1
+ ]
+ )
+
+ if dupes:
+ dupes_string = "\n\n".join(dupes)
+ warnings.warn(
+ "There are several spiders with the same name:\n\n"
+ f"{dupes_string}\n\n This can cause unexpected behavior.",
+ category=UserWarning,
+ )
+
+ def _load_spiders(self, module: ModuleType) -> None:
+ for spcls in iter_spider_classes(module):
+ self._found[spcls.name].append((module.__name__, spcls.__name__))
+ self._spiders[spcls.name] = spcls
+
+ def _load_all_spiders(self) -> None:
+ for name in self.spider_modules:
+ try:
+ for module in walk_modules(name):
+ self._load_spiders(module)
+ except ImportError:
+ if self.warn_only:
+ warnings.warn(
+ f"\n{traceback.format_exc()}Could not load spiders "
+ f"from module '{name}'. "
+ "See above traceback for details.",
+ category=RuntimeWarning,
+ )
+ else:
+ raise
+ self._check_name_duplicates()
+
+ @classmethod
+ def from_settings(cls, settings: BaseSettings) -> Self:
+ return cls(settings)
+
+ def load(self, spider_name: str) -> Type[Spider]:
"""
Return the Spider class for the given spider name. If the spider
name is not found, raise a KeyError.
"""
- pass
+ try:
+ return self._spiders[spider_name]
+ except KeyError:
+ raise KeyError(f"Spider not found: {spider_name}")
- def find_by_request(self, request: Request) ->List[str]:
+ def find_by_request(self, request: Request) -> List[str]:
"""
Return the list of spider names that can handle the given request.
"""
- pass
+ return [
+ name for name, cls in self._spiders.items() if cls.handles_request(request)
+ ]
- def list(self) ->List[str]:
+ def list(self) -> List[str]:
"""
Return a list with the names of all spiders available in the project.
"""
- pass
+ return list(self._spiders.keys())
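
A short sketch of driving the SpiderLoader implemented above directly (the spider module name is a placeholder):

from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader

settings = Settings({"SPIDER_MODULES": ["myproject.spiders"]})
loader = SpiderLoader.from_settings(settings)

print(loader.list())                 # names of all spiders found in SPIDER_MODULES
spider_cls = loader.load("example")  # raises KeyError("Spider not found: example") if absent
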
diff --git a/scrapy/spidermiddlewares/depth.py b/scrapy/spidermiddlewares/depth.py
index 6b9fdb9ee..eadc7c6ab 100644
--- a/scrapy/spidermiddlewares/depth.py
+++ b/scrapy/spidermiddlewares/depth.py
@@ -3,15 +3,61 @@ Depth Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
+
import logging
+
from scrapy.http import Request
+
logger = logging.getLogger(__name__)
class DepthMiddleware:
-
def __init__(self, maxdepth, stats, verbose_stats=False, prio=1):
self.maxdepth = maxdepth
self.stats = stats
self.verbose_stats = verbose_stats
self.prio = prio
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ settings = crawler.settings
+ maxdepth = settings.getint("DEPTH_LIMIT")
+ verbose = settings.getbool("DEPTH_STATS_VERBOSE")
+ prio = settings.getint("DEPTH_PRIORITY")
+ return cls(maxdepth, crawler.stats, verbose, prio)
+
+ def process_spider_output(self, response, result, spider):
+ self._init_depth(response, spider)
+ return (r for r in result or () if self._filter(r, response, spider))
+
+ async def process_spider_output_async(self, response, result, spider):
+ self._init_depth(response, spider)
+ async for r in result or ():
+ if self._filter(r, response, spider):
+ yield r
+
+ def _init_depth(self, response, spider):
+ # base case (depth=0)
+ if "depth" not in response.meta:
+ response.meta["depth"] = 0
+ if self.verbose_stats:
+ self.stats.inc_value("request_depth_count/0", spider=spider)
+
+ def _filter(self, request, response, spider):
+ if not isinstance(request, Request):
+ return True
+ depth = response.meta["depth"] + 1
+ request.meta["depth"] = depth
+ if self.prio:
+ request.priority -= depth * self.prio
+ if self.maxdepth and depth > self.maxdepth:
+ logger.debug(
+ "Ignoring link (depth > %(maxdepth)d): %(requrl)s ",
+ {"maxdepth": self.maxdepth, "requrl": request.url},
+ extra={"spider": spider},
+ )
+ return False
+ if self.verbose_stats:
+ self.stats.inc_value(f"request_depth_count/{depth}", spider=spider)
+ self.stats.max_value("request_depth_max", depth, spider=spider)
+ return True
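
`DepthMiddleware.from_crawler` above reads three settings; an illustrative settings sketch:

DEPTH_LIMIT = 3             # requests deeper than 3 are dropped (0 means no limit)
DEPTH_STATS_VERBOSE = True  # collect request_depth_count/<depth> stats
DEPTH_PRIORITY = 1          # positive values lower the priority of deeper requests
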
diff --git a/scrapy/spidermiddlewares/httperror.py b/scrapy/spidermiddlewares/httperror.py
index 001661412..0d3e5fe0b 100644
--- a/scrapy/spidermiddlewares/httperror.py
+++ b/scrapy/spidermiddlewares/httperror.py
@@ -4,7 +4,9 @@ HttpError Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
import logging
+
from scrapy.exceptions import IgnoreRequest
+
logger = logging.getLogger(__name__)
@@ -17,8 +19,41 @@ class HttpError(IgnoreRequest):
class HttpErrorMiddleware:
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler.settings)
def __init__(self, settings):
- self.handle_httpstatus_all = settings.getbool('HTTPERROR_ALLOW_ALL')
- self.handle_httpstatus_list = settings.getlist(
- 'HTTPERROR_ALLOWED_CODES')
+ self.handle_httpstatus_all = settings.getbool("HTTPERROR_ALLOW_ALL")
+ self.handle_httpstatus_list = settings.getlist("HTTPERROR_ALLOWED_CODES")
+
+ def process_spider_input(self, response, spider):
+ if 200 <= response.status < 300: # common case
+ return
+ meta = response.meta
+ if meta.get("handle_httpstatus_all", False):
+ return
+ if "handle_httpstatus_list" in meta:
+ allowed_statuses = meta["handle_httpstatus_list"]
+ elif self.handle_httpstatus_all:
+ return
+ else:
+ allowed_statuses = getattr(
+ spider, "handle_httpstatus_list", self.handle_httpstatus_list
+ )
+ if response.status in allowed_statuses:
+ return
+ raise HttpError(response, "Ignoring non-200 response")
+
+ def process_spider_exception(self, response, exception, spider):
+ if isinstance(exception, HttpError):
+ spider.crawler.stats.inc_value("httperror/response_ignored_count")
+ spider.crawler.stats.inc_value(
+ f"httperror/response_ignored_status_count/{response.status}"
+ )
+ logger.info(
+ "Ignoring response %(response)r: HTTP status code is not handled or not allowed",
+ {"response": response},
+ extra={"spider": spider},
+ )
+ return []
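
`process_spider_input` above lets non-2xx responses through based on request meta, a spider attribute, or the `HTTPERROR_ALLOW_ALL` / `HTTPERROR_ALLOWED_CODES` settings; a sketch of the first two (spider name and URL are illustrative):

import scrapy

class StatusSpider(scrapy.Spider):
    name = "status"
    handle_httpstatus_list = [404]  # spider-level allow list

    def start_requests(self):
        # the per-request meta key takes precedence over the spider attribute
        yield scrapy.Request(
            "https://example.com/maybe-gone",
            meta={"handle_httpstatus_list": [404, 410]},
            callback=self.parse,
        )

    def parse(self, response):
        self.logger.info("got status %s", response.status)
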
diff --git a/scrapy/spidermiddlewares/offsite.py b/scrapy/spidermiddlewares/offsite.py
index 5b86596e3..243055d89 100644
--- a/scrapy/spidermiddlewares/offsite.py
+++ b/scrapy/spidermiddlewares/offsite.py
@@ -6,24 +6,93 @@ See documentation in docs/topics/spider-middleware.rst
import logging
import re
import warnings
+
from scrapy import signals
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached
+
warnings.warn(
- 'The scrapy.spidermiddlewares.offsite module is deprecated, use scrapy.downloadermiddlewares.offsite instead.'
- , ScrapyDeprecationWarning)
+ "The scrapy.spidermiddlewares.offsite module is deprecated, use "
+ "scrapy.downloadermiddlewares.offsite instead.",
+ ScrapyDeprecationWarning,
+)
+
logger = logging.getLogger(__name__)
class OffsiteMiddleware:
-
def __init__(self, stats):
self.stats = stats
+ @classmethod
+ def from_crawler(cls, crawler):
+ o = cls(crawler.stats)
+ crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
+ return o
+
+ def process_spider_output(self, response, result, spider):
+ return (r for r in result or () if self._filter(r, spider))
+
+ async def process_spider_output_async(self, response, result, spider):
+ async for r in result or ():
+ if self._filter(r, spider):
+ yield r
+
+ def _filter(self, request, spider) -> bool:
+ if not isinstance(request, Request):
+ return True
+ if request.dont_filter or self.should_follow(request, spider):
+ return True
+ domain = urlparse_cached(request).hostname
+ if domain and domain not in self.domains_seen:
+ self.domains_seen.add(domain)
+ logger.debug(
+ "Filtered offsite request to %(domain)r: %(request)s",
+ {"domain": domain, "request": request},
+ extra={"spider": spider},
+ )
+ self.stats.inc_value("offsite/domains", spider=spider)
+ self.stats.inc_value("offsite/filtered", spider=spider)
+ return False
+
+ def should_follow(self, request, spider):
+ regex = self.host_regex
+ # hostname can be None for wrong urls (like javascript links)
+ host = urlparse_cached(request).hostname or ""
+ return bool(regex.search(host))
+
def get_host_regex(self, spider):
"""Override this method to implement a different offsite policy"""
- pass
+ allowed_domains = getattr(spider, "allowed_domains", None)
+ if not allowed_domains:
+ return re.compile("") # allow all by default
+ url_pattern = re.compile(r"^https?://.*$")
+ port_pattern = re.compile(r":\d+$")
+ domains = []
+ for domain in allowed_domains:
+ if domain is None:
+ continue
+ if url_pattern.match(domain):
+ message = (
+ "allowed_domains accepts only domains, not URLs. "
+ f"Ignoring URL entry {domain} in allowed_domains."
+ )
+ warnings.warn(message, URLWarning)
+ elif port_pattern.search(domain):
+ message = (
+ "allowed_domains accepts only domains without ports. "
+ f"Ignoring entry {domain} in allowed_domains."
+ )
+ warnings.warn(message, PortWarning)
+ else:
+ domains.append(re.escape(domain))
+ regex = rf'^(.*\.)?({"|".join(domains)})$'
+ return re.compile(regex)
+
+ def spider_opened(self, spider):
+ self.host_regex = self.get_host_regex(spider)
+ self.domains_seen = set()
class URLWarning(Warning):
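
For reference, `get_host_regex` above turns `allowed_domains = ["example.com"]` into a pattern that accepts the domain and its subdomains but not lookalike hosts; a small self-contained illustration:

import re

host_regex = re.compile(r"^(.*\.)?(example\.com)$")  # what get_host_regex builds

assert host_regex.search("example.com")
assert host_regex.search("shop.example.com")
assert not host_regex.search("notexample.com")
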
diff --git a/scrapy/spidermiddlewares/referer.py b/scrapy/spidermiddlewares/referer.py
index a92a7e327..fd91e658b 100644
--- a/scrapy/spidermiddlewares/referer.py
+++ b/scrapy/spidermiddlewares/referer.py
@@ -5,29 +5,49 @@ originated it.
import warnings
from typing import Tuple
from urllib.parse import urlparse
+
from w3lib.url import safe_url_string
+
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import Request, Response
from scrapy.utils.misc import load_object
from scrapy.utils.python import to_unicode
from scrapy.utils.url import strip_url
-LOCAL_SCHEMES = 'about', 'blob', 'data', 'filesystem'
-POLICY_NO_REFERRER = 'no-referrer'
-POLICY_NO_REFERRER_WHEN_DOWNGRADE = 'no-referrer-when-downgrade'
-POLICY_SAME_ORIGIN = 'same-origin'
-POLICY_ORIGIN = 'origin'
-POLICY_STRICT_ORIGIN = 'strict-origin'
-POLICY_ORIGIN_WHEN_CROSS_ORIGIN = 'origin-when-cross-origin'
-POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = 'strict-origin-when-cross-origin'
-POLICY_UNSAFE_URL = 'unsafe-url'
-POLICY_SCRAPY_DEFAULT = 'scrapy-default'
+
+LOCAL_SCHEMES = (
+ "about",
+ "blob",
+ "data",
+ "filesystem",
+)
+
+POLICY_NO_REFERRER = "no-referrer"
+POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
+POLICY_SAME_ORIGIN = "same-origin"
+POLICY_ORIGIN = "origin"
+POLICY_STRICT_ORIGIN = "strict-origin"
+POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
+POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
+POLICY_UNSAFE_URL = "unsafe-url"
+POLICY_SCRAPY_DEFAULT = "scrapy-default"
class ReferrerPolicy:
NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES
name: str
+ def referrer(self, response_url, request_url):
+ raise NotImplementedError()
+
+ def stripped_referrer(self, url):
+ if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
+ return self.strip_url(url)
+
+ def origin_referrer(self, url):
+ if urlparse(url).scheme not in self.NOREFERRER_SCHEMES:
+ return self.origin(url)
+
def strip_url(self, url, origin_only=False):
"""
https://www.w3.org/TR/referrer-policy/#strip-url
@@ -42,11 +62,29 @@ class ReferrerPolicy:
Set url's query to null.
Return url.
"""
- pass
+ if not url:
+ return None
+ return strip_url(
+ url,
+ strip_credentials=True,
+ strip_fragment=True,
+ strip_default_port=True,
+ origin_only=origin_only,
+ )
def origin(self, url):
"""Return serialized origin (scheme, host, path) for a request or response URL."""
- pass
+ return self.strip_url(url, origin_only=True)
+
+ def potentially_trustworthy(self, url):
+ # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
+ parsed_url = urlparse(url)
+ if parsed_url.scheme in ("data",):
+ return False
+ return self.tls_protected(url)
+
+ def tls_protected(self, url):
+ return urlparse(url).scheme in ("https", "ftps")
class NoReferrerPolicy(ReferrerPolicy):
@@ -57,8 +95,12 @@ class NoReferrerPolicy(ReferrerPolicy):
is to be sent along with requests made from a particular request client to any origin.
The header will be omitted entirely.
"""
+
name: str = POLICY_NO_REFERRER
+ def referrer(self, response_url, request_url):
+ return None
+
class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
"""
@@ -74,8 +116,13 @@ class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
This is a user agent's default behavior, if no policy is otherwise specified.
"""
+
name: str = POLICY_NO_REFERRER_WHEN_DOWNGRADE
+ def referrer(self, response_url, request_url):
+ if not self.tls_protected(response_url) or self.tls_protected(request_url):
+ return self.stripped_referrer(response_url)
+
class SameOriginPolicy(ReferrerPolicy):
"""
@@ -87,8 +134,13 @@ class SameOriginPolicy(ReferrerPolicy):
Cross-origin requests, on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
+
name: str = POLICY_SAME_ORIGIN
+ def referrer(self, response_url, request_url):
+ if self.origin(response_url) == self.origin(request_url):
+ return self.stripped_referrer(response_url)
+
class OriginPolicy(ReferrerPolicy):
"""
@@ -99,8 +151,12 @@ class OriginPolicy(ReferrerPolicy):
when making both same-origin requests and cross-origin requests
from a particular request client.
"""
+
name: str = POLICY_ORIGIN
+ def referrer(self, response_url, request_url):
+ return self.origin_referrer(response_url)
+
class StrictOriginPolicy(ReferrerPolicy):
"""
@@ -115,8 +171,17 @@ class StrictOriginPolicy(ReferrerPolicy):
on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
+
name: str = POLICY_STRICT_ORIGIN
+ def referrer(self, response_url, request_url):
+ if (
+ self.tls_protected(response_url)
+ and self.potentially_trustworthy(request_url)
+ or not self.tls_protected(response_url)
+ ):
+ return self.origin_referrer(response_url)
+
class OriginWhenCrossOriginPolicy(ReferrerPolicy):
"""
@@ -129,8 +194,15 @@ class OriginWhenCrossOriginPolicy(ReferrerPolicy):
is sent as referrer information when making cross-origin requests
from a particular request client.
"""
+
name: str = POLICY_ORIGIN_WHEN_CROSS_ORIGIN
+ def referrer(self, response_url, request_url):
+ origin = self.origin(response_url)
+ if origin == self.origin(request_url):
+ return self.stripped_referrer(response_url)
+ return origin
+
class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
"""
@@ -149,8 +221,20 @@ class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
on the other hand, will contain no referrer information.
A Referer HTTP header will not be sent.
"""
+
name: str = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN
+ def referrer(self, response_url, request_url):
+ origin = self.origin(response_url)
+ if origin == self.origin(request_url):
+ return self.stripped_referrer(response_url)
+ if (
+ self.tls_protected(response_url)
+ and self.potentially_trustworthy(request_url)
+ or not self.tls_protected(response_url)
+ ):
+ return self.origin_referrer(response_url)
+
class UnsafeUrlPolicy(ReferrerPolicy):
"""
@@ -165,8 +249,12 @@ class UnsafeUrlPolicy(ReferrerPolicy):
to insecure origins.
Carefully consider the impact of setting such a policy for potentially sensitive documents.
"""
+
name: str = POLICY_UNSAFE_URL
+ def referrer(self, response_url, request_url):
+ return self.stripped_referrer(response_url)
+
class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
"""
@@ -174,15 +262,28 @@ class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
with the addition that "Referer" is not sent if the parent request was
using ``file://`` or ``s3://`` scheme.
"""
- NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES + ('file', 's3')
+
+ NOREFERRER_SCHEMES: Tuple[str, ...] = LOCAL_SCHEMES + ("file", "s3")
name: str = POLICY_SCRAPY_DEFAULT
-_policy_classes = {p.name: p for p in (NoReferrerPolicy,
- NoReferrerWhenDowngradePolicy, SameOriginPolicy, OriginPolicy,
- StrictOriginPolicy, OriginWhenCrossOriginPolicy,
- StrictOriginWhenCrossOriginPolicy, UnsafeUrlPolicy, DefaultReferrerPolicy)}
-_policy_classes[''] = NoReferrerWhenDowngradePolicy
+_policy_classes = {
+ p.name: p
+ for p in (
+ NoReferrerPolicy,
+ NoReferrerWhenDowngradePolicy,
+ SameOriginPolicy,
+ OriginPolicy,
+ StrictOriginPolicy,
+ OriginWhenCrossOriginPolicy,
+ StrictOriginWhenCrossOriginPolicy,
+ UnsafeUrlPolicy,
+ DefaultReferrerPolicy,
+ )
+}
+
+# Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
+_policy_classes[""] = NoReferrerWhenDowngradePolicy
def _load_policy_class(policy, warning_only=False):
@@ -191,16 +292,36 @@ def _load_policy_class(policy, warning_only=False):
otherwise try to interpret the string as a standard value
from https://www.w3.org/TR/referrer-policy/#referrer-policies
"""
- pass
+ try:
+ return load_object(policy)
+ except ValueError:
+ try:
+ return _policy_classes[policy.lower()]
+ except KeyError:
+ msg = f"Could not load referrer policy {policy!r}"
+ if not warning_only:
+ raise RuntimeError(msg)
+ else:
+ warnings.warn(msg, RuntimeWarning)
+ return None
class RefererMiddleware:
-
def __init__(self, settings=None):
self.default_policy = DefaultReferrerPolicy
if settings is not None:
- self.default_policy = _load_policy_class(settings.get(
- 'REFERRER_POLICY'))
+ self.default_policy = _load_policy_class(settings.get("REFERRER_POLICY"))
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ if not crawler.settings.getbool("REFERER_ENABLED"):
+ raise NotConfigured
+ mw = cls(crawler.settings)
+
+ # Note: this hook is a bit of a hack to intercept redirections
+ crawler.signals.connect(mw.request_scheduled, signal=signals.request_scheduled)
+
+ return mw
def policy(self, resp_or_url, request):
"""
@@ -215,4 +336,50 @@ class RefererMiddleware:
it is used if valid
- otherwise, the policy from settings is used.
"""
- pass
+ policy_name = request.meta.get("referrer_policy")
+ if policy_name is None:
+ if isinstance(resp_or_url, Response):
+ policy_header = resp_or_url.headers.get("Referrer-Policy")
+ if policy_header is not None:
+ policy_name = to_unicode(policy_header.decode("latin1"))
+ if policy_name is None:
+ return self.default_policy()
+
+ cls = _load_policy_class(policy_name, warning_only=True)
+ return cls() if cls else self.default_policy()
+
+ def process_spider_output(self, response, result, spider):
+ return (self._set_referer(r, response) for r in result or ())
+
+ async def process_spider_output_async(self, response, result, spider):
+ async for r in result or ():
+ yield self._set_referer(r, response)
+
+ def _set_referer(self, r, response):
+ if isinstance(r, Request):
+ referrer = self.policy(response, r).referrer(response.url, r.url)
+ if referrer is not None:
+ r.headers.setdefault("Referer", referrer)
+ return r
+
+ def request_scheduled(self, request, spider):
+ # check redirected request to patch "Referer" header if necessary
+ redirected_urls = request.meta.get("redirect_urls", [])
+ if redirected_urls:
+ request_referrer = request.headers.get("Referer")
+ # we don't patch the referrer value if there is none
+ if request_referrer is not None:
+ # the request's referrer header value acts as a surrogate
+ # for the parent response URL
+ #
+ # Note: if the 3xx response contained a Referrer-Policy header,
+ # the information is not available using this hook
+ parent_url = safe_url_string(request_referrer)
+ policy_referrer = self.policy(parent_url, request).referrer(
+ parent_url, request.url
+ )
+ if policy_referrer != request_referrer:
+ if policy_referrer is None:
+ request.headers.pop("Referer")
+ else:
+ request.headers["Referer"] = policy_referrer
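
`RefererMiddleware.policy` above resolves the policy from, in order, the request's `referrer_policy` meta key, a `Referrer-Policy` response header, and the `REFERRER_POLICY` setting. A sketch of the two knobs a project usually touches (URL is illustrative):

import scrapy

# settings.py
REFERRER_POLICY = "same-origin"

# per-request override (highest precedence)
request = scrapy.Request(
    "https://example.com/next",
    meta={"referrer_policy": "no-referrer"},
)
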
diff --git a/scrapy/spidermiddlewares/urllength.py b/scrapy/spidermiddlewares/urllength.py
index 1844c1465..f6d92e53a 100644
--- a/scrapy/spidermiddlewares/urllength.py
+++ b/scrapy/spidermiddlewares/urllength.py
@@ -3,13 +3,43 @@ Url Length Spider Middleware
See documentation in docs/topics/spider-middleware.rst
"""
+
import logging
+
from scrapy.exceptions import NotConfigured
from scrapy.http import Request
+
logger = logging.getLogger(__name__)
class UrlLengthMiddleware:
-
def __init__(self, maxlength):
self.maxlength = maxlength
+
+ @classmethod
+ def from_settings(cls, settings):
+ maxlength = settings.getint("URLLENGTH_LIMIT")
+ if not maxlength:
+ raise NotConfigured
+ return cls(maxlength)
+
+ def process_spider_output(self, response, result, spider):
+ return (r for r in result or () if self._filter(r, spider))
+
+ async def process_spider_output_async(self, response, result, spider):
+ async for r in result or ():
+ if self._filter(r, spider):
+ yield r
+
+ def _filter(self, request, spider):
+ if isinstance(request, Request) and len(request.url) > self.maxlength:
+ logger.info(
+ "Ignoring link (url length > %(maxlength)d): %(url)s ",
+ {"maxlength": self.maxlength, "url": request.url},
+ extra={"spider": spider},
+ )
+ spider.crawler.stats.inc_value(
+ "urllength/request_ignored_count", spider=spider
+ )
+ return False
+ return True
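
`UrlLengthMiddleware.from_settings` above is driven by a single setting; a zero or missing value raises `NotConfigured`, which disables the middleware entirely:

URLLENGTH_LIMIT = 2083  # default shown in default_settings.py above; 0 disables the check
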
diff --git a/scrapy/spiders/crawl.py b/scrapy/spiders/crawl.py
index 291994af6..31e845716 100644
--- a/scrapy/spiders/crawl.py
+++ b/scrapy/spiders/crawl.py
@@ -4,20 +4,46 @@ for scraping typical web sites that requires crawling pages.
See documentation in docs/topics/spiders.rst
"""
+
import copy
from typing import AsyncIterable, Awaitable, Sequence
+
from scrapy.http import HtmlResponse, Request, Response
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider
from scrapy.utils.asyncgen import collect_asyncgen
from scrapy.utils.spider import iterate_spider_output
+
+
+def _identity(x):
+ return x
+
+
+def _identity_process_request(request, response):
+ return request
+
+
+def _get_method(method, spider):
+ if callable(method):
+ return method
+ if isinstance(method, str):
+ return getattr(spider, method, None)
+
+
_default_link_extractor = LinkExtractor()
class Rule:
-
- def __init__(self, link_extractor=None, callback=None, cb_kwargs=None,
- follow=None, process_links=None, process_request=None, errback=None):
+ def __init__(
+ self,
+ link_extractor=None,
+ callback=None,
+ cb_kwargs=None,
+ follow=None,
+ process_links=None,
+ process_request=None,
+ errback=None,
+ ):
self.link_extractor = link_extractor or _default_link_extractor
self.callback = callback
self.errback = errback
@@ -26,6 +52,12 @@ class Rule:
self.process_request = process_request or _identity_process_request
self.follow = follow if follow is not None else not callback
+ def _compile(self, spider):
+ self.callback = _get_method(self.callback, spider)
+ self.errback = _get_method(self.errback, spider)
+ self.process_links = _get_method(self.process_links, spider)
+ self.process_request = _get_method(self.process_request, spider)
+
class CrawlSpider(Spider):
rules: Sequence[Rule] = ()
@@ -33,3 +65,85 @@ class CrawlSpider(Spider):
def __init__(self, *a, **kw):
super().__init__(*a, **kw)
self._compile_rules()
+
+ def _parse(self, response, **kwargs):
+ return self._parse_response(
+ response=response,
+ callback=self.parse_start_url,
+ cb_kwargs=kwargs,
+ follow=True,
+ )
+
+ def parse_start_url(self, response, **kwargs):
+ return []
+
+ def process_results(self, response: Response, results: list):
+ return results
+
+ def _build_request(self, rule_index, link):
+ return Request(
+ url=link.url,
+ callback=self._callback,
+ errback=self._errback,
+ meta=dict(rule=rule_index, link_text=link.text),
+ )
+
+ def _requests_to_follow(self, response):
+ if not isinstance(response, HtmlResponse):
+ return
+ seen = set()
+ for rule_index, rule in enumerate(self._rules):
+ links = [
+ lnk
+ for lnk in rule.link_extractor.extract_links(response)
+ if lnk not in seen
+ ]
+ for link in rule.process_links(links):
+ seen.add(link)
+ request = self._build_request(rule_index, link)
+ yield rule.process_request(request, response)
+
+ def _callback(self, response, **cb_kwargs):
+ rule = self._rules[response.meta["rule"]]
+ return self._parse_response(
+ response, rule.callback, {**rule.cb_kwargs, **cb_kwargs}, rule.follow
+ )
+
+ def _errback(self, failure):
+ rule = self._rules[failure.request.meta["rule"]]
+ return self._handle_failure(failure, rule.errback)
+
+ async def _parse_response(self, response, callback, cb_kwargs, follow=True):
+ if callback:
+ cb_res = callback(response, **cb_kwargs) or ()
+ if isinstance(cb_res, AsyncIterable):
+ cb_res = await collect_asyncgen(cb_res)
+ elif isinstance(cb_res, Awaitable):
+ cb_res = await cb_res
+ cb_res = self.process_results(response, cb_res)
+ for request_or_item in iterate_spider_output(cb_res):
+ yield request_or_item
+
+ if follow and self._follow_links:
+ for request_or_item in self._requests_to_follow(response):
+ yield request_or_item
+
+ def _handle_failure(self, failure, errback):
+ if errback:
+ results = errback(failure) or ()
+ for request_or_item in iterate_spider_output(results):
+ yield request_or_item
+
+ def _compile_rules(self):
+ self._rules = []
+ for rule in self.rules:
+ self._rules.append(copy.copy(rule))
+ self._rules[-1]._compile(self)
+
+ @classmethod
+ def from_crawler(cls, crawler, *args, **kwargs):
+ spider = super().from_crawler(crawler, *args, **kwargs)
+ spider._follow_links = crawler.settings.getbool(
+ "CRAWLSPIDER_FOLLOW_LINKS", True
+ )
+ return spider
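
A minimal CrawlSpider sketch exercising the `Rule` compilation and `_requests_to_follow` logic above (domain and URL patterns are illustrative):

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ExampleCrawlSpider(CrawlSpider):
    name = "example_crawl"
    allowed_domains = ["example.com"]
    start_urls = ["https://example.com"]

    rules = (
        Rule(LinkExtractor(allow=r"/category/"), follow=True),
        Rule(LinkExtractor(allow=r"/item/\d+"), callback="parse_item"),
    )

    def parse_item(self, response):
        yield {"url": response.url, "title": response.css("title::text").get()}
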
diff --git a/scrapy/spiders/feed.py b/scrapy/spiders/feed.py
index 4c0801928..42675c76a 100644
--- a/scrapy/spiders/feed.py
+++ b/scrapy/spiders/feed.py
@@ -20,8 +20,9 @@ class XMLFeedSpider(Spider):
'xml' selector, or an 'html' selector. In most cases, it's convenient to
use iternodes, since it's faster and cleaner.
"""
- iterator = 'iternodes'
- itertag = 'item'
+
+ iterator = "iternodes"
+ itertag = "item"
namespaces = ()
def process_results(self, response, results):
@@ -32,18 +33,20 @@ class XMLFeedSpider(Spider):
the response which originated that results. It must return a list of
results (items or requests).
"""
- pass
+ return results
def adapt_response(self, response):
"""You can override this function in order to make any changes you want
to into the feed before parsing it. This function must return a
response.
"""
- pass
+ return response
def parse_node(self, response, selector):
"""This method must be overridden with your custom spider functionality"""
- pass
+ if hasattr(self, "parse_item"): # backward compatibility
+ return self.parse_item(response, selector)
+ raise NotImplementedError
def parse_nodes(self, response, nodes):
"""This method is called for the nodes matching the provided tag name
@@ -52,7 +55,42 @@ class XMLFeedSpider(Spider):
This method must return either an item, a request, or a list
containing any of them.
"""
- pass
+
+ for selector in nodes:
+ ret = iterate_spider_output(self.parse_node(response, selector))
+ for result_item in self.process_results(response, ret):
+ yield result_item
+
+ def _parse(self, response, **kwargs):
+ if not hasattr(self, "parse_node"):
+ raise NotConfigured(
+ "You must define parse_node method in order to scrape this XML feed"
+ )
+
+ response = self.adapt_response(response)
+ if self.iterator == "iternodes":
+ nodes = self._iternodes(response)
+ elif self.iterator == "xml":
+ selector = Selector(response, type="xml")
+ self._register_namespaces(selector)
+ nodes = selector.xpath(f"//{self.itertag}")
+ elif self.iterator == "html":
+ selector = Selector(response, type="html")
+ self._register_namespaces(selector)
+ nodes = selector.xpath(f"//{self.itertag}")
+ else:
+ raise NotSupported("Unsupported node iterator")
+
+ return self.parse_nodes(response, nodes)
+
+ def _iternodes(self, response):
+ for node in xmliter_lxml(response, self.itertag):
+ self._register_namespaces(node)
+ yield node
+
+ def _register_namespaces(self, selector):
+ for prefix, uri in self.namespaces:
+ selector.register_namespace(prefix, uri)
class CSVFeedSpider(Spider):
@@ -63,21 +101,26 @@ class CSVFeedSpider(Spider):
You can set some options regarding the CSV file, such as the delimiter, quotechar
and the file's headers.
"""
- delimiter = None
- quotechar = None
+
+ delimiter = (
+ None # When this is None, python's csv module's default delimiter is used
+ )
+ quotechar = (
+ None # When this is None, python's csv module's default quotechar is used
+ )
headers = None
def process_results(self, response, results):
"""This method has the same purpose as the one in XMLFeedSpider"""
- pass
+ return results
def adapt_response(self, response):
"""This method has the same purpose as the one in XMLFeedSpider"""
- pass
+ return response
def parse_row(self, response, row):
"""This method must be overridden with your custom spider functionality"""
- pass
+ raise NotImplementedError
def parse_rows(self, response):
"""Receives a response and a dict (representing each row) with a key for
@@ -85,4 +128,18 @@ class CSVFeedSpider(Spider):
gives the opportunity to override adapt_response and
process_results methods for pre and post-processing purposes.
"""
- pass
+
+ for row in csviter(
+ response, self.delimiter, self.headers, quotechar=self.quotechar
+ ):
+ ret = iterate_spider_output(self.parse_row(response, row))
+ for result_item in self.process_results(response, ret):
+ yield result_item
+
+ def _parse(self, response, **kwargs):
+ if not hasattr(self, "parse_row"):
+ raise NotConfigured(
+ "You must define parse_row method in order to scrape this CSV feed"
+ )
+ response = self.adapt_response(response)
+ return self.parse_rows(response)
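
A sketch of a CSVFeedSpider supplying the `parse_row` required by the `_parse`/`parse_rows` plumbing above (URL and column names are illustrative):

from scrapy.spiders import CSVFeedSpider

class ProductsSpider(CSVFeedSpider):
    name = "products"
    start_urls = ["https://example.com/products.csv"]
    delimiter = ";"
    headers = ["id", "name", "price"]

    def parse_row(self, response, row):
        # row is a dict keyed by the headers above
        return {"id": row["id"], "name": row["name"], "price": row["price"]}
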
diff --git a/scrapy/spiders/init.py b/scrapy/spiders/init.py
index 7f6f7eefe..3cb215b0f 100644
--- a/scrapy/spiders/init.py
+++ b/scrapy/spiders/init.py
@@ -5,11 +5,15 @@ from scrapy.utils.spider import iterate_spider_output
class InitSpider(Spider):
"""Base Spider with initialization facilities"""
+ def start_requests(self):
+ self._postinit_reqs = super().start_requests()
+ return iterate_spider_output(self.init_request())
+
def initialized(self, response=None):
"""This method must be set as the callback of your last initialization
request. See self.init_request() docstring for more info.
"""
- pass
+ return self.__dict__.pop("_postinit_reqs")
def init_request(self):
"""This function should return one initialization request, with the
@@ -24,4 +28,4 @@ class InitSpider(Spider):
overridden only when you need to perform requests to initialize your
spider
"""
- pass
+ return self.initialized()
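
The InitSpider handshake restored above holds back the regular start requests until `self.initialized()` is called; a sketch of the usual login-first pattern (URL and form fields are illustrative):

from scrapy import FormRequest
from scrapy.spiders.init import InitSpider

class LoginFirstSpider(InitSpider):
    name = "login_first"
    start_urls = ["https://example.com/private"]

    def init_request(self):
        return FormRequest(
            "https://example.com/login",
            formdata={"user": "u", "pass": "p"},
            callback=self.after_login,
        )

    def after_login(self, response):
        # releases the start requests stashed by start_requests()
        return self.initialized()

    def parse(self, response):
        yield {"url": response.url}
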
diff --git a/scrapy/spiders/sitemap.py b/scrapy/spiders/sitemap.py
index 97a4c2aed..386aa6a6e 100644
--- a/scrapy/spiders/sitemap.py
+++ b/scrapy/spiders/sitemap.py
@@ -1,25 +1,41 @@
import logging
import re
from typing import TYPE_CHECKING, Any
+
from scrapy.http import Request, XmlResponse
from scrapy.spiders import Spider
from scrapy.utils._compression import _DecompressionMaxSizeExceeded
from scrapy.utils.gz import gunzip, gzip_magic_number
from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots
+
if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
from typing_extensions import Self
+
from scrapy.crawler import Crawler
+
logger = logging.getLogger(__name__)
class SitemapSpider(Spider):
sitemap_urls = ()
- sitemap_rules = [('', 'parse')]
- sitemap_follow = ['']
+ sitemap_rules = [("", "parse")]
+ sitemap_follow = [""]
sitemap_alternate_links = False
_max_size: int
_warn_size: int
+ @classmethod
+ def from_crawler(cls, crawler: "Crawler", *args: Any, **kwargs: Any) -> "Self":
+ spider = super().from_crawler(crawler, *args, **kwargs)
+ spider._max_size = getattr(
+ spider, "download_maxsize", spider.settings.getint("DOWNLOAD_MAXSIZE")
+ )
+ spider._warn_size = getattr(
+ spider, "download_warnsize", spider.settings.getint("DOWNLOAD_WARNSIZE")
+ )
+ return spider
+
def __init__(self, *a, **kw):
super().__init__(*a, **kw)
self._cbs = []
@@ -29,15 +45,89 @@ class SitemapSpider(Spider):
self._cbs.append((regex(r), c))
self._follow = [regex(x) for x in self.sitemap_follow]
+ def start_requests(self):
+ for url in self.sitemap_urls:
+ yield Request(url, self._parse_sitemap)
+
def sitemap_filter(self, entries):
"""This method can be used to filter sitemap entries by their
attributes, for example, you can filter locs with lastmod greater
than a given date (see docs).
"""
- pass
+ for entry in entries:
+ yield entry
+
+ def _parse_sitemap(self, response):
+ if response.url.endswith("/robots.txt"):
+ for url in sitemap_urls_from_robots(response.text, base_url=response.url):
+ yield Request(url, callback=self._parse_sitemap)
+ else:
+ body = self._get_sitemap_body(response)
+ if body is None:
+ logger.warning(
+ "Ignoring invalid sitemap: %(response)s",
+ {"response": response},
+ extra={"spider": self},
+ )
+ return
+
+ s = Sitemap(body)
+ it = self.sitemap_filter(s)
+
+ if s.type == "sitemapindex":
+ for loc in iterloc(it, self.sitemap_alternate_links):
+ if any(x.search(loc) for x in self._follow):
+ yield Request(loc, callback=self._parse_sitemap)
+ elif s.type == "urlset":
+ for loc in iterloc(it, self.sitemap_alternate_links):
+ for r, c in self._cbs:
+ if r.search(loc):
+ yield Request(loc, callback=c)
+ break
def _get_sitemap_body(self, response):
"""Return the sitemap body contained in the given response,
or None if the response is not a sitemap.
"""
- pass
+ if isinstance(response, XmlResponse):
+ return response.body
+ if gzip_magic_number(response):
+ uncompressed_size = len(response.body)
+ max_size = response.meta.get("download_maxsize", self._max_size)
+ warn_size = response.meta.get("download_warnsize", self._warn_size)
+ try:
+ body = gunzip(response.body, max_size=max_size)
+ except _DecompressionMaxSizeExceeded:
+ return None
+ if uncompressed_size < warn_size <= len(body):
+ logger.warning(
+ f"{response} body size after decompression ({len(body)} B) "
+ f"is larger than the download warning size ({warn_size} B)."
+ )
+ return body
+ # actual gzipped sitemap files are decompressed above ;
+ # if we are here (response body is not gzipped)
+ # and have a response for .xml.gz,
+ # it usually means that it was already gunzipped
+ # by HttpCompression middleware,
+ # the HTTP response being sent with "Content-Encoding: gzip"
+ # without actually being a .xml.gz file in the first place,
+ # merely XML gzip-compressed on the fly,
+ # in other words, here we have plain XML
+ if response.url.endswith(".xml") or response.url.endswith(".xml.gz"):
+ return response.body
+
+
+def regex(x):
+ if isinstance(x, str):
+ return re.compile(x)
+ return x
+
+
+def iterloc(it, alt=False):
+ for d in it:
+ yield d["loc"]
+
+ # Also consider alternate URLs (xhtml:link rel="alternate")
+ if alt and "alternate" in d:
+ yield from d["alternate"]
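
A SitemapSpider sketch driving the `_parse_sitemap` logic above: robots.txt URLs are expanded into their sitemap entries, and `sitemap_rules` / `sitemap_follow` are matched against the compiled regexes (URLs and patterns are illustrative):

from scrapy.spiders import SitemapSpider

class ShopSitemapSpider(SitemapSpider):
    name = "shop_sitemap"
    sitemap_urls = ["https://example.com/robots.txt"]
    sitemap_rules = [(r"/product/", "parse_product")]
    sitemap_follow = [r"/sitemap_products"]  # only recurse into matching sub-sitemaps

    def parse_product(self, response):
        yield {"url": response.url}
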
diff --git a/scrapy/squeues.py b/scrapy/squeues.py
index 612da342c..f665ad88c 100644
--- a/scrapy/squeues.py
+++ b/scrapy/squeues.py
@@ -1,28 +1,144 @@
"""
Scheduler queues
"""
+
import marshal
import pickle
from os import PathLike
from pathlib import Path
from typing import Union
+
from queuelib import queue
+
from scrapy.utils.request import request_from_dict
-_PickleFifoSerializationDiskQueue = _serializable_queue(_with_mkdir(queue.
- FifoDiskQueue), _pickle_serialize, pickle.loads)
-_PickleLifoSerializationDiskQueue = _serializable_queue(_with_mkdir(queue.
- LifoDiskQueue), _pickle_serialize, pickle.loads)
-_MarshalFifoSerializationDiskQueue = _serializable_queue(_with_mkdir(queue.
- FifoDiskQueue), marshal.dumps, marshal.loads)
-_MarshalLifoSerializationDiskQueue = _serializable_queue(_with_mkdir(queue.
- LifoDiskQueue), marshal.dumps, marshal.loads)
-PickleFifoDiskQueue = _scrapy_serialization_queue(
- _PickleFifoSerializationDiskQueue)
-PickleLifoDiskQueue = _scrapy_serialization_queue(
- _PickleLifoSerializationDiskQueue)
-MarshalFifoDiskQueue = _scrapy_serialization_queue(
- _MarshalFifoSerializationDiskQueue)
-MarshalLifoDiskQueue = _scrapy_serialization_queue(
- _MarshalLifoSerializationDiskQueue)
+
+
+def _with_mkdir(queue_class):
+ class DirectoriesCreated(queue_class):
+ def __init__(self, path: Union[str, PathLike], *args, **kwargs):
+ dirname = Path(path).parent
+ if not dirname.exists():
+ dirname.mkdir(parents=True, exist_ok=True)
+ super().__init__(path, *args, **kwargs)
+
+ return DirectoriesCreated
+
+
+def _serializable_queue(queue_class, serialize, deserialize):
+ class SerializableQueue(queue_class):
+ def push(self, obj):
+ s = serialize(obj)
+ super().push(s)
+
+ def pop(self):
+ s = super().pop()
+ if s:
+ return deserialize(s)
+
+ def peek(self):
+ """Returns the next object to be returned by :meth:`pop`,
+ but without removing it from the queue.
+
+ Raises :exc:`NotImplementedError` if the underlying queue class does
+ not implement a ``peek`` method, which is optional for queues.
+ """
+ try:
+ s = super().peek()
+ except AttributeError as ex:
+ raise NotImplementedError(
+ "The underlying queue class does not implement 'peek'"
+ ) from ex
+ if s:
+ return deserialize(s)
+
+ return SerializableQueue
+
+
+def _scrapy_serialization_queue(queue_class):
+ class ScrapyRequestQueue(queue_class):
+ def __init__(self, crawler, key):
+ self.spider = crawler.spider
+ super().__init__(key)
+
+ @classmethod
+ def from_crawler(cls, crawler, key, *args, **kwargs):
+ return cls(crawler, key)
+
+ def push(self, request):
+ request = request.to_dict(spider=self.spider)
+ return super().push(request)
+
+ def pop(self):
+ request = super().pop()
+ if not request:
+ return None
+ return request_from_dict(request, spider=self.spider)
+
+ def peek(self):
+ """Returns the next object to be returned by :meth:`pop`,
+ but without removing it from the queue.
+
+ Raises :exc:`NotImplementedError` if the underlying queue class does
+ not implement a ``peek`` method, which is optional for queues.
+ """
+ request = super().peek()
+ if not request:
+ return None
+ return request_from_dict(request, spider=self.spider)
+
+ return ScrapyRequestQueue
+
+
+def _scrapy_non_serialization_queue(queue_class):
+ class ScrapyRequestQueue(queue_class):
+ @classmethod
+ def from_crawler(cls, crawler, *args, **kwargs):
+ return cls()
+
+ def peek(self):
+ """Returns the next object to be returned by :meth:`pop`,
+ but without removing it from the queue.
+
+ Raises :exc:`NotImplementedError` if the underlying queue class does
+ not implement a ``peek`` method, which is optional for queues.
+ """
+ try:
+ s = super().peek()
+ except AttributeError as ex:
+ raise NotImplementedError(
+ "The underlying queue class does not implement 'peek'"
+ ) from ex
+ return s
+
+ return ScrapyRequestQueue
+
+
+def _pickle_serialize(obj):
+ try:
+ return pickle.dumps(obj, protocol=4)
+ # Both pickle.PicklingError and AttributeError can be raised by pickle.dump(s)
+ # TypeError is raised from parsel.Selector
+ except (pickle.PicklingError, AttributeError, TypeError) as e:
+ raise ValueError(str(e)) from e
+
+
+_PickleFifoSerializationDiskQueue = _serializable_queue(
+ _with_mkdir(queue.FifoDiskQueue), _pickle_serialize, pickle.loads
+)
+_PickleLifoSerializationDiskQueue = _serializable_queue(
+ _with_mkdir(queue.LifoDiskQueue), _pickle_serialize, pickle.loads
+)
+_MarshalFifoSerializationDiskQueue = _serializable_queue(
+ _with_mkdir(queue.FifoDiskQueue), marshal.dumps, marshal.loads
+)
+_MarshalLifoSerializationDiskQueue = _serializable_queue(
+ _with_mkdir(queue.LifoDiskQueue), marshal.dumps, marshal.loads
+)
+
+# public queue classes
+PickleFifoDiskQueue = _scrapy_serialization_queue(_PickleFifoSerializationDiskQueue)
+PickleLifoDiskQueue = _scrapy_serialization_queue(_PickleLifoSerializationDiskQueue)
+MarshalFifoDiskQueue = _scrapy_serialization_queue(_MarshalFifoSerializationDiskQueue)
+MarshalLifoDiskQueue = _scrapy_serialization_queue(_MarshalLifoSerializationDiskQueue)
FifoMemoryQueue = _scrapy_non_serialization_queue(queue.FifoMemoryQueue)
LifoMemoryQueue = _scrapy_non_serialization_queue(queue.LifoMemoryQueue)
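
The public disk queues above are built by stacking three small class factories: `_with_mkdir` creates the queue's parent directory, `_serializable_queue` adds (de)serialization on push/pop/peek, and `_scrapy_serialization_queue` converts Request objects to dicts and back. A sketch that exercises one intermediate layer directly (an internal helper, used here only to show how the layers compose; the path is illustrative):

```python
# Sketch: pushing and popping a plain dict through the pickle-serializing
# FIFO disk queue. _PickleFifoSerializationDiskQueue is internal API.
import tempfile
from pathlib import Path

from scrapy.squeues import _PickleFifoSerializationDiskQueue

qdir = Path(tempfile.mkdtemp()) / "queues" / "fifo"   # parent dir created by _with_mkdir
q = _PickleFifoSerializationDiskQueue(str(qdir))

q.push({"url": "https://example.com", "priority": 0})  # pickled before hitting disk
print(len(q))    # queuelib disk queues expose __len__
# peek() works only with a queuelib release that implements it; otherwise the
# SerializableQueue wrapper above raises NotImplementedError.
print(q.peek())
print(q.pop())   # deserialized back into the original dict
q.close()
```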
diff --git a/scrapy/statscollectors.py b/scrapy/statscollectors.py
index dac77ae91..15193aac5 100644
--- a/scrapy/statscollectors.py
+++ b/scrapy/statscollectors.py
@@ -4,26 +4,95 @@ Scrapy extension for collecting scraping stats
import logging
import pprint
from typing import TYPE_CHECKING, Any, Dict, Optional
+
from scrapy import Spider
+
if TYPE_CHECKING:
from scrapy.crawler import Crawler
+
logger = logging.getLogger(__name__)
+
+
StatsT = Dict[str, Any]
class StatsCollector:
-
- def __init__(self, crawler: 'Crawler'):
- self._dump: bool = crawler.settings.getbool('STATS_DUMP')
+ def __init__(self, crawler: "Crawler"):
+ self._dump: bool = crawler.settings.getbool("STATS_DUMP")
self._stats: StatsT = {}
+ def get_value(
+ self, key: str, default: Any = None, spider: Optional[Spider] = None
+ ) -> Any:
+ return self._stats.get(key, default)
-class MemoryStatsCollector(StatsCollector):
+ def get_stats(self, spider: Optional[Spider] = None) -> StatsT:
+ return self._stats
+
+ def set_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+ self._stats[key] = value
+
+ def set_stats(self, stats: StatsT, spider: Optional[Spider] = None) -> None:
+ self._stats = stats
+
+ def inc_value(
+ self, key: str, count: int = 1, start: int = 0, spider: Optional[Spider] = None
+ ) -> None:
+ d = self._stats
+ d[key] = d.setdefault(key, start) + count
+
+ def max_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+ self._stats[key] = max(self._stats.setdefault(key, value), value)
+
+ def min_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+ self._stats[key] = min(self._stats.setdefault(key, value), value)
- def __init__(self, crawler: 'Crawler'):
+ def clear_stats(self, spider: Optional[Spider] = None) -> None:
+ self._stats.clear()
+
+ def open_spider(self, spider: Spider) -> None:
+ pass
+
+ def close_spider(self, spider: Spider, reason: str) -> None:
+ if self._dump:
+ logger.info(
+ "Dumping Scrapy stats:\n" + pprint.pformat(self._stats),
+ extra={"spider": spider},
+ )
+ self._persist_stats(self._stats, spider)
+
+ def _persist_stats(self, stats: StatsT, spider: Spider) -> None:
+ pass
+
+
+class MemoryStatsCollector(StatsCollector):
+ def __init__(self, crawler: "Crawler"):
super().__init__(crawler)
self.spider_stats: Dict[str, StatsT] = {}
+ def _persist_stats(self, stats: StatsT, spider: Spider) -> None:
+ self.spider_stats[spider.name] = stats
+
class DummyStatsCollector(StatsCollector):
- pass
+ def get_value(
+ self, key: str, default: Any = None, spider: Optional[Spider] = None
+ ) -> Any:
+ return default
+
+ def set_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+ pass
+
+ def set_stats(self, stats: StatsT, spider: Optional[Spider] = None) -> None:
+ pass
+
+ def inc_value(
+ self, key: str, count: int = 1, start: int = 0, spider: Optional[Spider] = None
+ ) -> None:
+ pass
+
+ def max_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+ pass
+
+ def min_value(self, key: str, value: Any, spider: Optional[Spider] = None) -> None:
+ pass
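
The restored `StatsCollector` API is what `crawler.stats` exposes to extensions, middlewares and pipelines. A sketch of typical usage from an extension (the extension name and stat keys are made up, and items are assumed to be dict-like so `len()` works):

```python
# Sketch: consuming the stats API above through crawler.stats in an extension.
from scrapy import signals


class ItemCountingExtension:
    def __init__(self, stats):
        self.stats = stats

    @classmethod
    def from_crawler(cls, crawler):
        ext = cls(crawler.stats)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        return ext

    def item_scraped(self, item, spider):
        self.stats.inc_value("myext/items_seen", spider=spider)
        self.stats.max_value("myext/largest_item", len(item), spider=spider)
```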
diff --git a/scrapy/utils/_compression.py b/scrapy/utils/_compression.py
index 106ea8c09..7c40d0a02 100644
--- a/scrapy/utils/_compression.py
+++ b/scrapy/utils/_compression.py
@@ -1,7 +1,9 @@
import zlib
from io import BytesIO
from warnings import warn
+
from scrapy.exceptions import ScrapyDeprecationWarning
+
try:
import brotli
except ImportError:
@@ -11,14 +13,111 @@ else:
brotli.Decompressor.process
except AttributeError:
warn(
- 'You have brotlipy installed, and Scrapy will use it, but Scrapy support for brotlipy is deprecated and will stop working in a future version of Scrapy. brotlipy itself is deprecated, it has been superseded by brotlicffi (not currently supported by Scrapy). Please, uninstall brotlipy and install brotli instead. brotlipy has the same import name as brotli, so keeping both installed is strongly discouraged.'
- , ScrapyDeprecationWarning)
+ (
+ "You have brotlipy installed, and Scrapy will use it, but "
+ "Scrapy support for brotlipy is deprecated and will stop "
+ "working in a future version of Scrapy. brotlipy itself is "
+ "deprecated, it has been superseded by brotlicffi (not "
+ "currently supported by Scrapy). Please, uninstall brotlipy "
+ "and install brotli instead. brotlipy has the same import "
+ "name as brotli, so keeping both installed is strongly "
+ "discouraged."
+ ),
+ ScrapyDeprecationWarning,
+ )
+
+ def _brotli_decompress(decompressor, data):
+ return decompressor.decompress(data)
+
+ else:
+
+ def _brotli_decompress(decompressor, data):
+ return decompressor.process(data)
+
+
try:
import zstandard
except ImportError:
pass
-_CHUNK_SIZE = 65536
+
+
+_CHUNK_SIZE = 65536 # 64 KiB
class _DecompressionMaxSizeExceeded(ValueError):
pass
+
+
+def _inflate(data: bytes, *, max_size: int = 0) -> bytes:
+ decompressor = zlib.decompressobj()
+ raw_decompressor = zlib.decompressobj(wbits=-15)
+ input_stream = BytesIO(data)
+ output_stream = BytesIO()
+ output_chunk = b"."
+ decompressed_size = 0
+ while output_chunk:
+ input_chunk = input_stream.read(_CHUNK_SIZE)
+ try:
+ output_chunk = decompressor.decompress(input_chunk)
+ except zlib.error:
+ if decompressor != raw_decompressor:
+ # ugly hack to work with raw deflate content that may
+ # be sent by microsoft servers. For more information, see:
+ # http://carsten.codimi.de/gzip.yaws/
+ # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
+ # http://www.gzip.org/zlib/zlib_faq.html#faq38
+ decompressor = raw_decompressor
+ output_chunk = decompressor.decompress(input_chunk)
+ else:
+ raise
+ decompressed_size += len(output_chunk)
+ if max_size and decompressed_size > max_size:
+ raise _DecompressionMaxSizeExceeded(
+ f"The number of bytes decompressed so far "
+ f"({decompressed_size} B) exceed the specified maximum "
+ f"({max_size} B)."
+ )
+ output_stream.write(output_chunk)
+ output_stream.seek(0)
+ return output_stream.read()
+
+
+def _unbrotli(data: bytes, *, max_size: int = 0) -> bytes:
+ decompressor = brotli.Decompressor()
+ input_stream = BytesIO(data)
+ output_stream = BytesIO()
+ output_chunk = b"."
+ decompressed_size = 0
+ while output_chunk:
+ input_chunk = input_stream.read(_CHUNK_SIZE)
+ output_chunk = _brotli_decompress(decompressor, input_chunk)
+ decompressed_size += len(output_chunk)
+ if max_size and decompressed_size > max_size:
+ raise _DecompressionMaxSizeExceeded(
+ f"The number of bytes decompressed so far "
+ f"({decompressed_size} B) exceed the specified maximum "
+ f"({max_size} B)."
+ )
+ output_stream.write(output_chunk)
+ output_stream.seek(0)
+ return output_stream.read()
+
+
+def _unzstd(data: bytes, *, max_size: int = 0) -> bytes:
+ decompressor = zstandard.ZstdDecompressor()
+ stream_reader = decompressor.stream_reader(BytesIO(data))
+ output_stream = BytesIO()
+ output_chunk = b"."
+ decompressed_size = 0
+ while output_chunk:
+ output_chunk = stream_reader.read(_CHUNK_SIZE)
+ decompressed_size += len(output_chunk)
+ if max_size and decompressed_size > max_size:
+ raise _DecompressionMaxSizeExceeded(
+ f"The number of bytes decompressed so far "
+ f"({decompressed_size} B) exceed the specified maximum "
+ f"({max_size} B)."
+ )
+ output_stream.write(output_chunk)
+ output_stream.seek(0)
+ return output_stream.read()
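
All three decompressors share the same shape: read 64 KiB chunks, track the decompressed size, and abort with `_DecompressionMaxSizeExceeded` once `max_size` is crossed. A quick sketch against the zlib-based `_inflate` (a private helper, used here only to demonstrate the size guard):

```python
# Sketch: the max_size guard on the private _inflate() helper.
import zlib

from scrapy.utils._compression import _DecompressionMaxSizeExceeded, _inflate

payload = zlib.compress(b"x" * 100_000)   # ~100 kB once decompressed

print(len(_inflate(payload)))             # no limit: all 100000 bytes come back

try:
    _inflate(payload, max_size=10_000)    # limit far below the decompressed size
except _DecompressionMaxSizeExceeded as exc:
    print("aborted:", exc)
```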
diff --git a/scrapy/utils/asyncgen.py b/scrapy/utils/asyncgen.py
index bcdb5eb14..0505db343 100644
--- a/scrapy/utils/asyncgen.py
+++ b/scrapy/utils/asyncgen.py
@@ -1,7 +1,18 @@
from typing import AsyncGenerator, AsyncIterable, Iterable, Union
-async def as_async_generator(it: Union[Iterable, AsyncIterable]
- ) ->AsyncGenerator:
+async def collect_asyncgen(result: AsyncIterable) -> list:
+ results = []
+ async for x in result:
+ results.append(x)
+ return results
+
+
+async def as_async_generator(it: Union[Iterable, AsyncIterable]) -> AsyncGenerator:
"""Wraps an iterable (sync or async) into an async generator."""
- pass
+ if isinstance(it, AsyncIterable):
+ async for r in it:
+ yield r
+ else:
+ for r in it:
+ yield r
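
These two helpers are the glue between sync and async iteration: `as_async_generator` lifts any iterable into an async generator and `collect_asyncgen` drains one back into a list. A self-contained sketch:

```python
# Sketch: round-tripping plain iterables through the async-generator helpers.
import asyncio

from scrapy.utils.asyncgen import as_async_generator, collect_asyncgen


async def main():
    agen = as_async_generator([1, 2, 3])      # sync iterable -> async generator
    doubled = [x * 2 async for x in agen]
    # collect_asyncgen drains any async iterable into a list
    collected = await collect_asyncgen(as_async_generator("abc"))
    print(doubled, collected)                 # [2, 4, 6] ['a', 'b', 'c']


asyncio.run(main())
```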
diff --git a/scrapy/utils/benchserver.py b/scrapy/utils/benchserver.py
index fd25c425c..38884a9f0 100644
--- a/scrapy/utils/benchserver.py
+++ b/scrapy/utils/benchserver.py
@@ -1,5 +1,6 @@
import random
from urllib.parse import urlencode
+
from twisted.web.resource import Resource
from twisted.web.server import Site
@@ -7,11 +8,39 @@ from twisted.web.server import Site
class Root(Resource):
isLeaf = True
+ def getChild(self, name, request):
+ return self
+
+ def render(self, request):
+ total = _getarg(request, b"total", 100, int)
+ show = _getarg(request, b"show", 10, int)
+ nlist = [random.randint(1, total) for _ in range(show)]
+ request.write(b"<html><head></head><body>")
+ args = request.args.copy()
+ for nl in nlist:
+ args["n"] = nl
+ argstr = urlencode(args, doseq=True)
+ request.write(
+ f"<a href='/follow?{argstr}'>follow {nl}</a><br>".encode("utf8")
+ )
+ request.write(b"</body></html>")
+ return b""
+
+
+def _getarg(request, name, default=None, type=str):
+ return type(request.args[name][0]) if name in request.args else default
-if __name__ == '__main__':
+
+if __name__ == "__main__":
from twisted.internet import reactor
+
root = Root()
factory = Site(root)
httpPort = reactor.listenTCP(8998, Site(root))
+
+ def _print_listening():
+ httpHost = httpPort.getHost()
+ print(f"Bench server at http://{httpHost.host}:{httpHost.port}")
+
reactor.callWhenRunning(_print_listening)
reactor.run()
diff --git a/scrapy/utils/boto.py b/scrapy/utils/boto.py
index 94a28dddb..53cfeddd0 100644
--- a/scrapy/utils/boto.py
+++ b/scrapy/utils/boto.py
@@ -1 +1,10 @@
"""Boto/botocore helpers"""
+
+
+def is_botocore_available() -> bool:
+ try:
+ import botocore # noqa: F401
+
+ return True
+ except ImportError:
+ return False
diff --git a/scrapy/utils/conf.py b/scrapy/utils/conf.py
index d81cbfbe5..641dfa4a2 100644
--- a/scrapy/utils/conf.py
+++ b/scrapy/utils/conf.py
@@ -5,53 +5,240 @@ import warnings
from configparser import ConfigParser
from operator import itemgetter
from pathlib import Path
-from typing import Any, Callable, Collection, Dict, Iterable, List, Mapping, MutableMapping, Optional, Union
+from typing import (
+ Any,
+ Callable,
+ Collection,
+ Dict,
+ Iterable,
+ List,
+ Mapping,
+ MutableMapping,
+ Optional,
+ Union,
+)
+
from scrapy.exceptions import ScrapyDeprecationWarning, UsageError
from scrapy.settings import BaseSettings
from scrapy.utils.deprecate import update_classpath
from scrapy.utils.python import without_none_values
-def build_component_list(compdict: MutableMapping[Any, Any], custom: Any=
- None, convert: Callable[[Any], Any]=update_classpath) ->List[Any]:
+def build_component_list(
+ compdict: MutableMapping[Any, Any],
+ custom: Any = None,
+ convert: Callable[[Any], Any] = update_classpath,
+) -> List[Any]:
"""Compose a component list from a { class: order } dictionary."""
- pass
+ def _check_components(complist: Collection[Any]) -> None:
+ if len({convert(c) for c in complist}) != len(complist):
+ raise ValueError(
+ f"Some paths in {complist!r} convert to the same object, "
+ "please update your settings"
+ )
+
+ def _map_keys(compdict: Mapping[Any, Any]) -> Union[BaseSettings, Dict[Any, Any]]:
+ if isinstance(compdict, BaseSettings):
+ compbs = BaseSettings()
+ for k, v in compdict.items():
+ prio = compdict.getpriority(k)
+ assert prio is not None
+ if compbs.getpriority(convert(k)) == prio:
+ raise ValueError(
+ f"Some paths in {list(compdict.keys())!r} "
+ "convert to the same "
+ "object, please update your settings"
+ )
+ else:
+ compbs.set(convert(k), v, priority=prio)
+ return compbs
+ _check_components(compdict)
+ return {convert(k): v for k, v in compdict.items()}
+
+ def _validate_values(compdict: Mapping[Any, Any]) -> None:
+ """Fail if a value in the components dict is not a real number or None."""
+ for name, value in compdict.items():
+ if value is not None and not isinstance(value, numbers.Real):
+ raise ValueError(
+ f"Invalid value {value} for component {name}, "
+ "please provide a real number or None instead"
+ )
+
+ if custom is not None:
+ warnings.warn(
+ "The 'custom' attribute of build_component_list() is deprecated. "
+ "Please merge its value into 'compdict' manually or change your "
+ "code to use Settings.getwithbase().",
+ category=ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+ if isinstance(custom, (list, tuple)):
+ _check_components(custom)
+ return type(custom)(convert(c) for c in custom) # type: ignore[return-value]
+ compdict.update(custom)
-def arglist_to_dict(arglist: List[str]) ->Dict[str, str]:
+ _validate_values(compdict)
+ compdict = without_none_values(_map_keys(compdict))
+ return [k for k, v in sorted(compdict.items(), key=itemgetter(1))]
+
+
+def arglist_to_dict(arglist: List[str]) -> Dict[str, str]:
"""Convert a list of arguments like ['arg1=val1', 'arg2=val2', ...] to a
dict
"""
- pass
+ return dict(x.split("=", 1) for x in arglist)
-def closest_scrapy_cfg(path: Union[str, os.PathLike]='.', prevpath:
- Optional[Union[str, os.PathLike]]=None) ->str:
+def closest_scrapy_cfg(
+ path: Union[str, os.PathLike] = ".",
+ prevpath: Optional[Union[str, os.PathLike]] = None,
+) -> str:
"""Return the path to the closest scrapy.cfg file by traversing the current
directory and its parents
"""
- pass
+ if prevpath is not None and str(path) == str(prevpath):
+ return ""
+ path = Path(path).resolve()
+ cfgfile = path / "scrapy.cfg"
+ if cfgfile.exists():
+ return str(cfgfile)
+ return closest_scrapy_cfg(path.parent, path)
-def init_env(project: str='default', set_syspath: bool=True) ->None:
+def init_env(project: str = "default", set_syspath: bool = True) -> None:
"""Initialize environment to use command-line tool from inside a project
dir. This sets the Scrapy settings module and modifies the Python path to
be able to locate the project module.
"""
- pass
+ cfg = get_config()
+ if cfg.has_option("settings", project):
+ os.environ["SCRAPY_SETTINGS_MODULE"] = cfg.get("settings", project)
+ closest = closest_scrapy_cfg()
+ if closest:
+ projdir = str(Path(closest).parent)
+ if set_syspath and projdir not in sys.path:
+ sys.path.append(projdir)
-def get_config(use_closest: bool=True) ->ConfigParser:
+def get_config(use_closest: bool = True) -> ConfigParser:
"""Get Scrapy config file as a ConfigParser"""
- pass
+ sources = get_sources(use_closest)
+ cfg = ConfigParser()
+ cfg.read(sources)
+ return cfg
+
+
+def get_sources(use_closest: bool = True) -> List[str]:
+ xdg_config_home = (
+ os.environ.get("XDG_CONFIG_HOME") or Path("~/.config").expanduser()
+ )
+ sources = [
+ "/etc/scrapy.cfg",
+ r"c:\scrapy\scrapy.cfg",
+ str(Path(xdg_config_home) / "scrapy.cfg"),
+ str(Path("~/.scrapy.cfg").expanduser()),
+ ]
+ if use_closest:
+ sources.append(closest_scrapy_cfg())
+ return sources
-def feed_process_params_from_cli(settings: BaseSettings, output: List[str],
- output_format: Optional[str]=None, overwrite_output: Optional[List[str]
- ]=None) ->Dict[str, Dict[str, Any]]:
+def feed_complete_default_values_from_settings(
+ feed: Dict[str, Any], settings: BaseSettings
+) -> Dict[str, Any]:
+ out = feed.copy()
+ out.setdefault("batch_item_count", settings.getint("FEED_EXPORT_BATCH_ITEM_COUNT"))
+ out.setdefault("encoding", settings["FEED_EXPORT_ENCODING"])
+ out.setdefault("fields", settings.getdictorlist("FEED_EXPORT_FIELDS") or None)
+ out.setdefault("store_empty", settings.getbool("FEED_STORE_EMPTY"))
+ out.setdefault("uri_params", settings["FEED_URI_PARAMS"])
+ out.setdefault("item_export_kwargs", {})
+ if settings["FEED_EXPORT_INDENT"] is None:
+ out.setdefault("indent", None)
+ else:
+ out.setdefault("indent", settings.getint("FEED_EXPORT_INDENT"))
+ return out
+
+
+def feed_process_params_from_cli(
+ settings: BaseSettings,
+ output: List[str],
+ output_format: Optional[str] = None,
+ overwrite_output: Optional[List[str]] = None,
+) -> Dict[str, Dict[str, Any]]:
"""
Receives feed export params (from the 'crawl' or 'runspider' commands),
checks for inconsistencies in their quantities and returns a dictionary
suitable to be used as the FEEDS setting.
"""
- pass
+ valid_output_formats: Iterable[str] = without_none_values(
+ settings.getwithbase("FEED_EXPORTERS")
+ ).keys()
+
+ def check_valid_format(output_format: str) -> None:
+ if output_format not in valid_output_formats:
+ raise UsageError(
+ f"Unrecognized output format '{output_format}'. "
+ f"Set a supported one ({tuple(valid_output_formats)}) "
+ "after a colon at the end of the output URI (i.e. -o/-O "
+ "<URI>:<FORMAT>) or as a file extension."
+ )
+
+ overwrite = False
+ if overwrite_output:
+ if output:
+ raise UsageError(
+ "Please use only one of -o/--output and -O/--overwrite-output"
+ )
+ if output_format:
+ raise UsageError(
+ "-t/--output-format is a deprecated command line option"
+ " and does not work in combination with -O/--overwrite-output."
+ " To specify a format please specify it after a colon at the end of the"
+ " output URI (i.e. -O <URI>:<FORMAT>)."
+ " Example working in the tutorial: "
+ "scrapy crawl quotes -O quotes.json:json"
+ )
+ output = overwrite_output
+ overwrite = True
+
+ if output_format:
+ if len(output) == 1:
+ check_valid_format(output_format)
+ message = (
+ "The -t/--output-format command line option is deprecated in favor of "
+ "specifying the output format within the output URI using the -o/--output or the"
+ " -O/--overwrite-output option (i.e. -o/-O <URI>:<FORMAT>). See the documentation"
+ " of the -o or -O option or the following examples for more information. "
+ "Examples working in the tutorial: "
+ "scrapy crawl quotes -o quotes.csv:csv or "
+ "scrapy crawl quotes -O quotes.json:json"
+ )
+ warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
+ return {output[0]: {"format": output_format}}
+ raise UsageError(
+ "The -t command-line option cannot be used if multiple output "
+ "URIs are specified"
+ )
+
+ result: Dict[str, Dict[str, Any]] = {}
+ for element in output:
+ try:
+ feed_uri, feed_format = element.rsplit(":", 1)
+ check_valid_format(feed_format)
+ except (ValueError, UsageError):
+ feed_uri = element
+ feed_format = Path(element).suffix.replace(".", "")
+ else:
+ if feed_uri == "-":
+ feed_uri = "stdout:"
+ check_valid_format(feed_format)
+ result[feed_uri] = {"format": feed_format}
+ if overwrite:
+ result[feed_uri]["overwrite"] = True
+
+ # FEEDS setting should take precedence over the matching CLI options
+ result.update(settings.getdict("FEEDS"))
+
+ return result
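
`feed_process_params_from_cli` is what turns `-o`/`-O` command-line values into a FEEDS dictionary. A sketch of its behaviour with default settings (the URIs are illustrative):

```python
# Sketch: how -o/-O values are normalized into a FEEDS dict.
from scrapy.settings import Settings
from scrapy.utils.conf import feed_process_params_from_cli

settings = Settings()

# "-o items.json": format inferred from the file extension
print(feed_process_params_from_cli(settings, ["items.json"]))
# {'items.json': {'format': 'json'}}

# "-O items.csv": same, plus overwrite=True
print(feed_process_params_from_cli(settings, [], overwrite_output=["items.csv"]))
# {'items.csv': {'format': 'csv', 'overwrite': True}}

# an explicit "<URI>:<FORMAT>" suffix wins over the extension
print(feed_process_params_from_cli(settings, ["items.dat:jsonlines"]))
# {'items.dat': {'format': 'jsonlines'}}
```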
diff --git a/scrapy/utils/console.py b/scrapy/utils/console.py
index b5bd733b2..100f040bb 100644
--- a/scrapy/utils/console.py
+++ b/scrapy/utils/console.py
@@ -1,40 +1,110 @@
from functools import wraps
-def _embed_ipython_shell(namespace={}, banner=''):
+def _embed_ipython_shell(namespace={}, banner=""):
"""Start an IPython Shell"""
- pass
+ try:
+ from IPython.terminal.embed import InteractiveShellEmbed
+ from IPython.terminal.ipapp import load_default_config
+ except ImportError:
+ from IPython.frontend.terminal.embed import InteractiveShellEmbed
+ from IPython.frontend.terminal.ipapp import load_default_config
+ @wraps(_embed_ipython_shell)
+ def wrapper(namespace=namespace, banner=""):
+ config = load_default_config()
+ # Always use .instance() to ensure _instance propagation to all parents
+ # this is needed for <TAB> completion works well for new imports
+ # and clear the instance to always have the fresh env
+ # on repeated breaks like with inspect_response()
+ InteractiveShellEmbed.clear_instance()
+ shell = InteractiveShellEmbed.instance(
+ banner1=banner, user_ns=namespace, config=config
+ )
+ shell()
-def _embed_bpython_shell(namespace={}, banner=''):
+ return wrapper
+
+
+def _embed_bpython_shell(namespace={}, banner=""):
"""Start a bpython shell"""
- pass
+ import bpython
+
+ @wraps(_embed_bpython_shell)
+ def wrapper(namespace=namespace, banner=""):
+ bpython.embed(locals_=namespace, banner=banner)
+ return wrapper
-def _embed_ptpython_shell(namespace={}, banner=''):
+
+def _embed_ptpython_shell(namespace={}, banner=""):
"""Start a ptpython shell"""
- pass
+ import ptpython.repl
+
+ @wraps(_embed_ptpython_shell)
+ def wrapper(namespace=namespace, banner=""):
+ print(banner)
+ ptpython.repl.embed(locals=namespace)
+
+ return wrapper
-def _embed_standard_shell(namespace={}, banner=''):
+def _embed_standard_shell(namespace={}, banner=""):
"""Start a standard python shell"""
- pass
+ import code
+ try: # readline module is only available on unix systems
+ import readline
+ except ImportError:
+ pass
+ else:
+ import rlcompleter # noqa: F401
-DEFAULT_PYTHON_SHELLS = {'ptpython': _embed_ptpython_shell, 'ipython':
- _embed_ipython_shell, 'bpython': _embed_bpython_shell, 'python':
- _embed_standard_shell}
+ readline.parse_and_bind("tab:complete")
+
+ @wraps(_embed_standard_shell)
+ def wrapper(namespace=namespace, banner=""):
+ code.interact(banner=banner, local=namespace)
+
+ return wrapper
+
+
+DEFAULT_PYTHON_SHELLS = {
+ "ptpython": _embed_ptpython_shell,
+ "ipython": _embed_ipython_shell,
+ "bpython": _embed_bpython_shell,
+ "python": _embed_standard_shell,
+}
def get_shell_embed_func(shells=None, known_shells=None):
"""Return the first acceptable shell-embed function
from a given list of shell names.
"""
- pass
+ if shells is None: # list, preference order of shells
+ shells = DEFAULT_PYTHON_SHELLS.keys()
+ if known_shells is None: # available embeddable shells
+ known_shells = DEFAULT_PYTHON_SHELLS.copy()
+ for shell in shells:
+ if shell in known_shells:
+ try:
+ # function test: run all setup code (imports),
+                # but don't fall into the shell
+ return known_shells[shell]()
+ except ImportError:
+ continue
-def start_python_console(namespace=None, banner='', shells=None):
+def start_python_console(namespace=None, banner="", shells=None):
"""Start Python console bound to the given namespace.
Readline support and tab completion will be used on Unix, if available.
"""
- pass
+ if namespace is None:
+ namespace = {}
+
+ try:
+ shell = get_shell_embed_func(shells)
+ if shell is not None:
+ shell(namespace=namespace, banner=banner)
+ except SystemExit: # raised when using exit() in python code.interact
+ pass
diff --git a/scrapy/utils/curl.py b/scrapy/utils/curl.py
index fec4cdf21..f5dbbd64e 100644
--- a/scrapy/utils/curl.py
+++ b/scrapy/utils/curl.py
@@ -3,37 +3,68 @@ import warnings
from http.cookies import SimpleCookie
from shlex import split
from urllib.parse import urlparse
+
from w3lib.http import basic_auth_header
class DataAction(argparse.Action):
-
def __call__(self, parser, namespace, values, option_string=None):
value = str(values)
- if value.startswith('$'):
+ if value.startswith("$"):
value = value[1:]
setattr(namespace, self.dest, value)
class CurlParser(argparse.ArgumentParser):
- pass
+ def error(self, message):
+ error_msg = f"There was an error parsing the curl command: {message}"
+ raise ValueError(error_msg)
curl_parser = CurlParser()
-curl_parser.add_argument('url')
-curl_parser.add_argument('-H', '--header', dest='headers', action='append')
-curl_parser.add_argument('-X', '--request', dest='method')
-curl_parser.add_argument('-d', '--data', '--data-raw', dest='data', action=
- DataAction)
-curl_parser.add_argument('-u', '--user', dest='auth')
-safe_to_ignore_arguments = [['--compressed'], ['-s', '--silent'], ['-v',
- '--verbose'], ['-#', '--progress-bar']]
+curl_parser.add_argument("url")
+curl_parser.add_argument("-H", "--header", dest="headers", action="append")
+curl_parser.add_argument("-X", "--request", dest="method")
+curl_parser.add_argument("-d", "--data", "--data-raw", dest="data", action=DataAction)
+curl_parser.add_argument("-u", "--user", dest="auth")
+
+
+safe_to_ignore_arguments = [
+ ["--compressed"],
+ # `--compressed` argument is not safe to ignore, but it's included here
+ # because the `HttpCompressionMiddleware` is enabled by default
+ ["-s", "--silent"],
+ ["-v", "--verbose"],
+ ["-#", "--progress-bar"],
+]
+
for argument in safe_to_ignore_arguments:
- curl_parser.add_argument(*argument, action='store_true')
+ curl_parser.add_argument(*argument, action="store_true")
-def curl_to_request_kwargs(curl_command: str, ignore_unknown_options: bool=True
- ) ->dict:
+def _parse_headers_and_cookies(parsed_args):
+ headers = []
+ cookies = {}
+ for header in parsed_args.headers or ():
+ name, val = header.split(":", 1)
+ name = name.strip()
+ val = val.strip()
+ if name.title() == "Cookie":
+ for name, morsel in SimpleCookie(val).items():
+ cookies[name] = morsel.value
+ else:
+ headers.append((name, val))
+
+ if parsed_args.auth:
+ user, password = parsed_args.auth.split(":", 1)
+ headers.append(("Authorization", basic_auth_header(user, password)))
+
+ return headers, cookies
+
+
+def curl_to_request_kwargs(
+ curl_command: str, ignore_unknown_options: bool = True
+) -> dict:
"""Convert a cURL command syntax to Request kwargs.
:param str curl_command: string containing the curl command
@@ -42,4 +73,44 @@ def curl_to_request_kwargs(curl_command: str, ignore_unknown_options: bool=True
raises an error. (default: True)
:return: dictionary of Request kwargs
"""
- pass
+
+ curl_args = split(curl_command)
+
+ if curl_args[0] != "curl":
+ raise ValueError('A curl command must start with "curl"')
+
+ parsed_args, argv = curl_parser.parse_known_args(curl_args[1:])
+
+ if argv:
+ msg = f'Unrecognized options: {", ".join(argv)}'
+ if ignore_unknown_options:
+ warnings.warn(msg)
+ else:
+ raise ValueError(msg)
+
+ url = parsed_args.url
+
+ # curl automatically prepends 'http' if the scheme is missing, but Request
+ # needs the scheme to work
+ parsed_url = urlparse(url)
+ if not parsed_url.scheme:
+ url = "http://" + url
+
+ method = parsed_args.method or "GET"
+
+ result = {"method": method.upper(), "url": url}
+
+ headers, cookies = _parse_headers_and_cookies(parsed_args)
+
+ if headers:
+ result["headers"] = headers
+ if cookies:
+ result["cookies"] = cookies
+ if parsed_args.data:
+ result["body"] = parsed_args.data
+ if not parsed_args.method:
+ # if the "data" is specified but the "method" is not specified,
+ # the default method is 'POST'
+ result["method"] = "POST"
+
+ return result
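
`curl_to_request_kwargs` backs `Request.from_curl()`: it parses the command with the argparse-based `curl_parser` above and returns plain Request keyword arguments. For example (URL and values are illustrative):

```python
# Sketch: converting a pasted curl command into Request keyword arguments.
from scrapy.utils.curl import curl_to_request_kwargs

cmd = (
    "curl 'https://httpbin.org/post' "
    "-H 'Content-Type: application/json' "
    "-H 'Cookie: session=abc123' "
    "--data-raw '{\"hello\": \"world\"}' "
    "--compressed"
)
kwargs = curl_to_request_kwargs(cmd)
print(kwargs["method"])    # 'POST' (implied by the presence of --data-raw)
print(kwargs["headers"])   # [('Content-Type', 'application/json')]
print(kwargs["cookies"])   # {'session': 'abc123'}
print(kwargs["body"])      # '{"hello": "world"}'

# The dict can be fed straight into a request, e.g. scrapy.Request(**kwargs).
```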
diff --git a/scrapy/utils/datatypes.py b/scrapy/utils/datatypes.py
index 3ec4ef789..d5b9544cc 100644
--- a/scrapy/utils/datatypes.py
+++ b/scrapy/utils/datatypes.py
@@ -4,14 +4,17 @@ Python Standard Library.
This module must not depend on any module outside the Standard Library.
"""
+
import collections
import warnings
import weakref
from collections.abc import Mapping
from typing import Any, AnyStr, Optional, OrderedDict, Sequence, TypeVar
+
from scrapy.exceptions import ScrapyDeprecationWarning
-_KT = TypeVar('_KT')
-_VT = TypeVar('_VT')
+
+_KT = TypeVar("_KT")
+_VT = TypeVar("_VT")
class CaselessDict(dict):
@@ -19,10 +22,14 @@ class CaselessDict(dict):
def __new__(cls, *args, **kwargs):
from scrapy.http.headers import Headers
+
if issubclass(cls, CaselessDict) and not issubclass(cls, Headers):
warnings.warn(
- 'scrapy.utils.datatypes.CaselessDict is deprecated, please use scrapy.utils.datatypes.CaseInsensitiveDict instead'
- , category=ScrapyDeprecationWarning, stacklevel=2)
+ "scrapy.utils.datatypes.CaselessDict is deprecated,"
+ " please use scrapy.utils.datatypes.CaseInsensitiveDict instead",
+ category=ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
return super().__new__(cls, *args, **kwargs)
def __init__(self, seq=None):
@@ -41,19 +48,39 @@ class CaselessDict(dict):
def __contains__(self, key):
return dict.__contains__(self, self.normkey(key))
+
has_key = __contains__
def __copy__(self):
return self.__class__(self)
+
copy = __copy__
def normkey(self, key):
"""Method to normalize dictionary key access"""
- pass
+ return key.lower()
def normvalue(self, value):
"""Method to normalize values prior to be set"""
- pass
+ return value
+
+ def get(self, key, def_val=None):
+ return dict.get(self, self.normkey(key), self.normvalue(def_val))
+
+ def setdefault(self, key, def_val=None):
+ return dict.setdefault(self, self.normkey(key), self.normvalue(def_val))
+
+ def update(self, seq):
+ seq = seq.items() if isinstance(seq, Mapping) else seq
+ iseq = ((self.normkey(k), self.normvalue(v)) for k, v in seq)
+ super().update(iseq)
+
+ @classmethod
+ def fromkeys(cls, keys, value=None):
+ return cls((k, value) for k in keys)
+
+ def pop(self, key, *args):
+ return dict.pop(self, self.normkey(key), *args)
class CaseInsensitiveDict(collections.UserDict):
@@ -61,15 +88,15 @@ class CaseInsensitiveDict(collections.UserDict):
as keys and allows case-insensitive lookups.
"""
- def __init__(self, *args, **kwargs) ->None:
+ def __init__(self, *args, **kwargs) -> None:
self._keys: dict = {}
super().__init__(*args, **kwargs)
- def __getitem__(self, key: AnyStr) ->Any:
+ def __getitem__(self, key: AnyStr) -> Any:
normalized_key = self._normkey(key)
return super().__getitem__(self._keys[normalized_key.lower()])
- def __setitem__(self, key: AnyStr, value: Any) ->None:
+ def __setitem__(self, key: AnyStr, value: Any) -> None:
normalized_key = self._normkey(key)
try:
lower_key = self._keys[normalized_key.lower()]
@@ -79,17 +106,23 @@ class CaseInsensitiveDict(collections.UserDict):
super().__setitem__(normalized_key, self._normvalue(value))
self._keys[normalized_key.lower()] = normalized_key
- def __delitem__(self, key: AnyStr) ->None:
+ def __delitem__(self, key: AnyStr) -> None:
normalized_key = self._normkey(key)
stored_key = self._keys.pop(normalized_key.lower())
super().__delitem__(stored_key)
- def __contains__(self, key: AnyStr) ->bool:
+ def __contains__(self, key: AnyStr) -> bool: # type: ignore[override]
normalized_key = self._normkey(key)
return normalized_key.lower() in self._keys
- def __repr__(self) ->str:
- return f'<{self.__class__.__name__}: {super().__repr__()}>'
+ def __repr__(self) -> str:
+ return f"<{self.__class__.__name__}: {super().__repr__()}>"
+
+ def _normkey(self, key: AnyStr) -> AnyStr:
+ return key
+
+ def _normvalue(self, value: Any) -> Any:
+ return value
class LocalCache(OrderedDict[_KT, _VT]):
@@ -98,11 +131,11 @@ class LocalCache(OrderedDict[_KT, _VT]):
Older items expires first.
"""
- def __init__(self, limit: Optional[int]=None):
+ def __init__(self, limit: Optional[int] = None):
super().__init__()
self.limit: Optional[int] = limit
- def __setitem__(self, key: _KT, value: _VT) ->None:
+ def __setitem__(self, key: _KT, value: _VT) -> None:
if self.limit:
while len(self) >= self.limit:
self.popitem(last=False)
@@ -121,21 +154,21 @@ class LocalWeakReferencedCache(weakref.WeakKeyDictionary):
it cannot be instantiated with an initial dictionary.
"""
- def __init__(self, limit: Optional[int]=None):
+ def __init__(self, limit: Optional[int] = None):
super().__init__()
self.data: LocalCache = LocalCache(limit=limit)
- def __setitem__(self, key: _KT, value: _VT) ->None:
+ def __setitem__(self, key: _KT, value: _VT) -> None:
try:
super().__setitem__(key, value)
except TypeError:
- pass
+ pass # key is not weak-referenceable, skip caching
- def __getitem__(self, key: _KT) ->Optional[_VT]:
+ def __getitem__(self, key: _KT) -> Optional[_VT]: # type: ignore[override]
try:
return super().__getitem__(key)
except (TypeError, KeyError):
- return None
+ return None # key is either not weak-referenceable or not cached
class SequenceExclude:
@@ -144,5 +177,5 @@ class SequenceExclude:
def __init__(self, seq: Sequence):
self.seq: Sequence = seq
- def __contains__(self, item: Any) ->bool:
+ def __contains__(self, item: Any) -> bool:
return item not in self.seq
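
`CaseInsensitiveDict` keeps the original key casing while matching lookups case-insensitively, and `LocalCache` is a size-bounded OrderedDict that evicts its oldest entries first. A short sketch:

```python
# Sketch: case-insensitive lookups and the bounded LocalCache.
from scrapy.utils.datatypes import CaseInsensitiveDict, LocalCache

headers = CaseInsensitiveDict({"Content-Type": "text/html"})
print(headers["content-type"])     # 'text/html'
print("CONTENT-TYPE" in headers)   # True
print(headers)                     # repr shows the original key casing

cache = LocalCache(limit=2)
for key, value in zip("abc", (1, 2, 3)):
    cache[key] = value             # "a" is evicted once "c" arrives
print(list(cache))                 # ['b', 'c']
```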
diff --git a/scrapy/utils/decorators.py b/scrapy/utils/decorators.py
index b441b495f..04186559f 100644
--- a/scrapy/utils/decorators.py
+++ b/scrapy/utils/decorators.py
@@ -1,25 +1,52 @@
import warnings
from functools import wraps
from typing import Any, Callable
+
from twisted.internet import defer, threads
from twisted.internet.defer import Deferred
+
from scrapy.exceptions import ScrapyDeprecationWarning
-def deprecated(use_instead: Any=None) ->Callable:
+def deprecated(use_instead: Any = None) -> Callable:
"""This is a decorator which can be used to mark functions
as deprecated. It will result in a warning being emitted
when the function is used."""
- pass
+
+ def deco(func: Callable) -> Callable:
+ @wraps(func)
+ def wrapped(*args: Any, **kwargs: Any) -> Any:
+ message = f"Call to deprecated function {func.__name__}."
+ if use_instead:
+ message += f" Use {use_instead} instead."
+ warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
+ return func(*args, **kwargs)
+
+ return wrapped
+
+ if callable(use_instead):
+ deco = deco(use_instead)
+ use_instead = None
+ return deco
-def defers(func: Callable) ->Callable[..., Deferred]:
+def defers(func: Callable) -> Callable[..., Deferred]:
"""Decorator to make sure a function always returns a deferred"""
- pass
+ @wraps(func)
+ def wrapped(*a: Any, **kw: Any) -> Deferred:
+ return defer.maybeDeferred(func, *a, **kw)
-def inthread(func: Callable) ->Callable[..., Deferred]:
+ return wrapped
+
+
+def inthread(func: Callable) -> Callable[..., Deferred]:
"""Decorator to call a function in a thread and return a deferred with the
result
"""
- pass
+
+ @wraps(func)
+ def wrapped(*a: Any, **kw: Any) -> Deferred:
+ return threads.deferToThread(func, *a, **kw)
+
+ return wrapped
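
The `deprecated` decorator supports both a bare and a parametrized form, which the `callable(use_instead)` check at the end distinguishes. A sketch (helper names are made up):

```python
# Sketch: the two supported call styles of @deprecated.
import warnings

from scrapy.utils.decorators import deprecated


@deprecated                     # bare form: the function itself is passed in
def old_helper():
    return 1


@deprecated("new_helper()")     # parametrized form: names the replacement
def older_helper():
    return 2


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    old_helper()
    older_helper()

print([str(w.message) for w in caught])
# ['Call to deprecated function old_helper.',
#  'Call to deprecated function older_helper. Use new_helper() instead.']
```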
diff --git a/scrapy/utils/defer.py b/scrapy/utils/defer.py
index aff03f895..bf3c5ef5b 100644
--- a/scrapy/utils/defer.py
+++ b/scrapy/utils/defer.py
@@ -6,51 +6,101 @@ import inspect
from asyncio import Future
from functools import wraps
from types import CoroutineType
-from typing import Any, AsyncGenerator, AsyncIterable, AsyncIterator, Awaitable, Callable, Coroutine, Dict, Generator, Iterable, Iterator, List, Optional, Tuple, TypeVar, Union, cast, overload
+from typing import (
+ Any,
+ AsyncGenerator,
+ AsyncIterable,
+ AsyncIterator,
+ Awaitable,
+ Callable,
+ Coroutine,
+ Dict,
+ Generator,
+ Iterable,
+ Iterator,
+ List,
+ Optional,
+ Tuple,
+ TypeVar,
+ Union,
+ cast,
+ overload,
+)
+
from twisted.internet import defer
from twisted.internet.defer import Deferred, DeferredList, ensureDeferred
from twisted.internet.task import Cooperator
from twisted.python import failure
from twisted.python.failure import Failure
+
from scrapy.exceptions import IgnoreRequest
from scrapy.utils.reactor import _get_asyncio_event_loop, is_asyncio_reactor_installed
-def defer_fail(_failure: Failure) ->Deferred:
+def defer_fail(_failure: Failure) -> Deferred:
"""Same as twisted.internet.defer.fail but delay calling errback until
next reactor loop
It delays by 100ms so reactor has a chance to go through readers and writers
before attending pending delayed calls, so do not set delay to zero.
"""
- pass
+ from twisted.internet import reactor
+
+ d: Deferred = Deferred()
+ reactor.callLater(0.1, d.errback, _failure)
+ return d
-def defer_succeed(result: Any) ->Deferred:
+def defer_succeed(result: Any) -> Deferred:
"""Same as twisted.internet.defer.succeed but delay calling callback until
next reactor loop
It delays by 100ms so reactor has a chance to go through readers and writers
before attending pending delayed calls, so do not set delay to zero.
"""
- pass
+ from twisted.internet import reactor
+
+ d: Deferred = Deferred()
+ reactor.callLater(0.1, d.callback, result)
+ return d
+
+
+def defer_result(result: Any) -> Deferred:
+ if isinstance(result, Deferred):
+ return result
+ if isinstance(result, failure.Failure):
+ return defer_fail(result)
+ return defer_succeed(result)
-def mustbe_deferred(f: Callable, *args: Any, **kw: Any) ->Deferred:
+def mustbe_deferred(f: Callable, *args: Any, **kw: Any) -> Deferred:
"""Same as twisted.internet.defer.maybeDeferred, but delay calling
callback/errback to next reactor loop
"""
- pass
-
-
-def parallel(iterable: Iterable, count: int, callable: Callable, *args: Any,
- **named: Any) ->Deferred:
+ try:
+ result = f(*args, **kw)
+ # FIXME: Hack to avoid introspecting tracebacks. This to speed up
+ # processing of IgnoreRequest errors which are, by far, the most common
+ # exception in Scrapy - see #125
+ except IgnoreRequest as e:
+ return defer_fail(failure.Failure(e))
+ except Exception:
+ return defer_fail(failure.Failure())
+ else:
+ return defer_result(result)
+
+
+def parallel(
+ iterable: Iterable, count: int, callable: Callable, *args: Any, **named: Any
+) -> Deferred:
"""Execute a callable over the objects in the given iterable, in parallel,
using no more than ``count`` concurrent calls.
Taken from: https://jcalderone.livejournal.com/24285.html
"""
- pass
+ coop = Cooperator()
+ work = (callable(elem, *args, **named) for elem in iterable)
+ return DeferredList([coop.coiterate(work) for _ in range(count)])
class _AsyncCooperatorAdapter(Iterator):
@@ -99,8 +149,13 @@ class _AsyncCooperatorAdapter(Iterator):
goal.
"""
- def __init__(self, aiterable: AsyncIterable, callable: Callable, *
- callable_args: Any, **callable_kwargs: Any):
+ def __init__(
+ self,
+ aiterable: AsyncIterable,
+ callable: Callable,
+ *callable_args: Any,
+ **callable_kwargs: Any,
+ ):
self.aiterator: AsyncIterator = aiterable.__aiter__()
self.callable: Callable = callable
self.callable_args: Tuple[Any, ...] = callable_args
@@ -109,7 +164,38 @@ class _AsyncCooperatorAdapter(Iterator):
self.waiting_deferreds: List[Deferred] = []
self.anext_deferred: Optional[Deferred] = None
- def __next__(self) ->Deferred:
+ def _callback(self, result: Any) -> None:
+ # This gets called when the result from aiterator.__anext__() is available.
+ # It calls the callable on it and sends the result to the oldest waiting Deferred
+ # (by chaining if the result is a Deferred too or by firing if not).
+ self.anext_deferred = None
+ result = self.callable(result, *self.callable_args, **self.callable_kwargs)
+ d = self.waiting_deferreds.pop(0)
+ if isinstance(result, Deferred):
+ result.chainDeferred(d)
+ else:
+ d.callback(None)
+ if self.waiting_deferreds:
+ self._call_anext()
+
+ def _errback(self, failure: Failure) -> None:
+ # This gets called on any exceptions in aiterator.__anext__().
+ # It handles StopAsyncIteration by stopping the iteration and reraises all others.
+ self.anext_deferred = None
+ failure.trap(StopAsyncIteration)
+ self.finished = True
+ for d in self.waiting_deferreds:
+ d.callback(None)
+
+ def _call_anext(self) -> None:
+ # This starts waiting for the next result from aiterator.
+ # If aiterator is exhausted, _errback will be called.
+ self.anext_deferred = deferred_from_coro(self.aiterator.__anext__())
+ self.anext_deferred.addCallbacks(self._callback, self._errback)
+
+ def __next__(self) -> Deferred:
+ # This puts a new Deferred into self.waiting_deferreds and returns it.
+ # It also calls __anext__() if needed.
if self.finished:
raise StopIteration
d: Deferred = Deferred()
@@ -119,72 +205,160 @@ class _AsyncCooperatorAdapter(Iterator):
return d
-def parallel_async(async_iterable: AsyncIterable, count: int, callable:
- Callable, *args: Any, **named: Any) ->Deferred:
+def parallel_async(
+ async_iterable: AsyncIterable,
+ count: int,
+ callable: Callable,
+ *args: Any,
+ **named: Any,
+) -> Deferred:
"""Like parallel but for async iterators"""
- pass
+ coop = Cooperator()
+ work = _AsyncCooperatorAdapter(async_iterable, callable, *args, **named)
+ dl: Deferred = DeferredList([coop.coiterate(work) for _ in range(count)])
+ return dl
-def process_chain(callbacks: Iterable[Callable], input: Any, *a: Any, **kw: Any
- ) ->Deferred:
+def process_chain(
+ callbacks: Iterable[Callable], input: Any, *a: Any, **kw: Any
+) -> Deferred:
"""Return a Deferred built by chaining the given callbacks"""
- pass
-
-
-def process_chain_both(callbacks: Iterable[Callable], errbacks: Iterable[
- Callable], input: Any, *a: Any, **kw: Any) ->Deferred:
+ d: Deferred = Deferred()
+ for x in callbacks:
+ d.addCallback(x, *a, **kw)
+ d.callback(input)
+ return d
+
+
+def process_chain_both(
+ callbacks: Iterable[Callable],
+ errbacks: Iterable[Callable],
+ input: Any,
+ *a: Any,
+ **kw: Any,
+) -> Deferred:
"""Return a Deferred built by chaining the given callbacks and errbacks"""
- pass
-
-
-def process_parallel(callbacks: Iterable[Callable], input: Any, *a: Any, **
- kw: Any) ->Deferred:
+ d: Deferred = Deferred()
+ for cb, eb in zip(callbacks, errbacks):
+ d.addCallbacks(
+ callback=cb,
+ errback=eb,
+ callbackArgs=a,
+ callbackKeywords=kw,
+ errbackArgs=a,
+ errbackKeywords=kw,
+ )
+ if isinstance(input, failure.Failure):
+ d.errback(input)
+ else:
+ d.callback(input)
+ return d
+
+
+def process_parallel(
+ callbacks: Iterable[Callable], input: Any, *a: Any, **kw: Any
+) -> Deferred:
"""Return a Deferred with the output of all successful calls to the given
callbacks
"""
- pass
+ dfds = [defer.succeed(input).addCallback(x, *a, **kw) for x in callbacks]
+ d: Deferred = DeferredList(dfds, fireOnOneErrback=True, consumeErrors=True)
+ d.addCallbacks(lambda r: [x[1] for x in r], lambda f: f.value.subFailure)
+ return d
-def iter_errback(iterable: Iterable, errback: Callable, *a: Any, **kw: Any
- ) ->Generator:
+def iter_errback(
+ iterable: Iterable, errback: Callable, *a: Any, **kw: Any
+) -> Generator:
"""Wraps an iterable calling an errback if an error is caught while
iterating it.
"""
- pass
-
-
-async def aiter_errback(aiterable: AsyncIterable, errback: Callable, *a:
- Any, **kw: Any) ->AsyncGenerator:
+ it = iter(iterable)
+ while True:
+ try:
+ yield next(it)
+ except StopIteration:
+ break
+ except Exception:
+ errback(failure.Failure(), *a, **kw)
+
+
+async def aiter_errback(
+ aiterable: AsyncIterable, errback: Callable, *a: Any, **kw: Any
+) -> AsyncGenerator:
"""Wraps an async iterable calling an errback if an error is caught while
iterating it. Similar to scrapy.utils.defer.iter_errback()
"""
- pass
+ it = aiterable.__aiter__()
+ while True:
+ try:
+ yield await it.__anext__()
+ except StopAsyncIteration:
+ break
+ except Exception:
+ errback(failure.Failure(), *a, **kw)
-_CT = TypeVar('_CT', bound=Union[Awaitable, CoroutineType, Future])
-_T = TypeVar('_T')
+_CT = TypeVar("_CT", bound=Union[Awaitable, CoroutineType, Future])
+_T = TypeVar("_T")
-def deferred_from_coro(o: _T) ->Union[Deferred, _T]:
- """Converts a coroutine into a Deferred, or returns the object as is if it isn't a coroutine"""
- pass
+@overload
+def deferred_from_coro(o: _CT) -> Deferred:
+ ...
+
+@overload
+def deferred_from_coro(o: _T) -> _T:
+ ...
-def deferred_f_from_coro_f(coro_f: Callable[..., Coroutine]) ->Callable:
+
+def deferred_from_coro(o: _T) -> Union[Deferred, _T]:
+ """Converts a coroutine into a Deferred, or returns the object as is if it isn't a coroutine"""
+ if isinstance(o, Deferred):
+ return o
+ if asyncio.isfuture(o) or inspect.isawaitable(o):
+ if not is_asyncio_reactor_installed():
+ # wrapping the coroutine directly into a Deferred, this doesn't work correctly with coroutines
+ # that use asyncio, e.g. "await asyncio.sleep(1)"
+ return ensureDeferred(cast(Coroutine[Deferred, Any, Any], o))
+ # wrapping the coroutine into a Future and then into a Deferred, this requires AsyncioSelectorReactor
+ event_loop = _get_asyncio_event_loop()
+ return Deferred.fromFuture(asyncio.ensure_future(o, loop=event_loop))
+ return o
+
+
+def deferred_f_from_coro_f(coro_f: Callable[..., Coroutine]) -> Callable:
"""Converts a coroutine function into a function that returns a Deferred.
The coroutine function will be called at the time when the wrapper is called. Wrapper args will be passed to it.
This is useful for callback chains, as callback functions are called with the previous callback result.
"""
- pass
+ @wraps(coro_f)
+ def f(*coro_args: Any, **coro_kwargs: Any) -> Any:
+ return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
+
+ return f
-def maybeDeferred_coro(f: Callable, *args: Any, **kw: Any) ->Deferred:
+
+def maybeDeferred_coro(f: Callable, *args: Any, **kw: Any) -> Deferred:
"""Copy of defer.maybeDeferred that also converts coroutines to Deferreds."""
- pass
+ try:
+ result = f(*args, **kw)
+ except: # noqa: E722
+ return defer.fail(failure.Failure(captureVars=Deferred.debug))
+
+ if isinstance(result, Deferred):
+ return result
+ if asyncio.isfuture(result) or inspect.isawaitable(result):
+ return deferred_from_coro(result)
+ if isinstance(result, failure.Failure):
+ return defer.fail(result)
+ return defer.succeed(result)
-def deferred_to_future(d: Deferred) ->Future:
+def deferred_to_future(d: Deferred) -> Future:
"""
.. versionadded:: 2.6.0
@@ -203,10 +377,10 @@ def deferred_to_future(d: Deferred) ->Future:
deferred = self.crawler.engine.download(additional_request)
additional_response = await deferred_to_future(deferred)
"""
- pass
+ return d.asFuture(_get_asyncio_event_loop())
-def maybe_deferred_to_future(d: Deferred) ->Union[Deferred, Future]:
+def maybe_deferred_to_future(d: Deferred) -> Union[Deferred, Future]:
"""
.. versionadded:: 2.6.0
@@ -232,4 +406,6 @@ def maybe_deferred_to_future(d: Deferred) ->Union[Deferred, Future]:
deferred = self.crawler.engine.download(additional_request)
additional_response = await maybe_deferred_to_future(deferred)
"""
- pass
+ if not is_asyncio_reactor_installed():
+ return d
+ return deferred_to_future(d)
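
Most of these helpers need a running Twisted reactor, but the pure Deferred-chaining ones can be exercised standalone. A sketch with trivial callbacks showing `process_chain` threading a value through a list of callbacks and `process_parallel` fanning it out:

```python
# Sketch: chaining and fanning out callbacks with the restored helpers.
# These particular Deferreds fire synchronously, so no reactor is needed here.
from scrapy.utils.defer import process_chain, process_parallel

d = process_chain([lambda x: x + 1, lambda x: x * 10], 4)
d.addCallback(print)          # 50, i.e. (4 + 1) * 10

d2 = process_parallel([lambda x: x + 1, lambda x: x * 10], 4)
d2.addCallback(print)         # [5, 40], one result per callback
```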
diff --git a/scrapy/utils/deprecate.py b/scrapy/utils/deprecate.py
index 42dcda1fa..ea577c44a 100644
--- a/scrapy/utils/deprecate.py
+++ b/scrapy/utils/deprecate.py
@@ -1,18 +1,33 @@
"""Some helpers for deprecation messages"""
+
import inspect
import warnings
from typing import Any, Dict, List, Optional, Tuple, Type, overload
+
from scrapy.exceptions import ScrapyDeprecationWarning
-def create_deprecated_class(name: str, new_class: type, clsdict: Optional[
- Dict[str, Any]]=None, warn_category: Type[Warning]=
- ScrapyDeprecationWarning, warn_once: bool=True, old_class_path:
- Optional[str]=None, new_class_path: Optional[str]=None,
- subclass_warn_message: str=
- '{cls} inherits from deprecated class {old}, please inherit from {new}.',
- instance_warn_message: str=
- '{cls} is deprecated, instantiate {new} instead.') ->type:
+def attribute(obj: Any, oldattr: str, newattr: str, version: str = "0.12") -> None:
+ cname = obj.__class__.__name__
+ warnings.warn(
+ f"{cname}.{oldattr} attribute is deprecated and will be no longer supported "
+ f"in Scrapy {version}, use {cname}.{newattr} attribute instead",
+ ScrapyDeprecationWarning,
+ stacklevel=3,
+ )
+
+
+def create_deprecated_class(
+ name: str,
+ new_class: type,
+ clsdict: Optional[Dict[str, Any]] = None,
+ warn_category: Type[Warning] = ScrapyDeprecationWarning,
+ warn_once: bool = True,
+ old_class_path: Optional[str] = None,
+ new_class_path: Optional[str] = None,
+ subclass_warn_message: str = "{cls} inherits from deprecated class {old}, please inherit from {new}.",
+ instance_warn_message: str = "{cls} is deprecated, instantiate {new} instead.",
+) -> type:
"""
Return a "deprecated" class that causes its subclasses to issue a warning.
Subclasses of ``new_class`` are considered subclasses of this class.
@@ -37,19 +52,115 @@ def create_deprecated_class(name: str, new_class: type, clsdict: Optional[
checks they'll still return True if sub is a subclass of NewName instead of
OldName.
"""
- pass
+
+ # https://github.com/python/mypy/issues/4177
+ class DeprecatedClass(new_class.__class__): # type: ignore[misc, name-defined]
+ deprecated_class: Optional[type] = None
+ warned_on_subclass: bool = False
+
+ def __new__(
+ metacls, name: str, bases: Tuple[type, ...], clsdict_: Dict[str, Any]
+ ) -> type:
+ cls = super().__new__(metacls, name, bases, clsdict_)
+ if metacls.deprecated_class is None:
+ metacls.deprecated_class = cls
+ return cls
+
+ def __init__(cls, name: str, bases: Tuple[type, ...], clsdict_: Dict[str, Any]):
+ meta = cls.__class__
+ old = meta.deprecated_class
+ if old in bases and not (warn_once and meta.warned_on_subclass):
+ meta.warned_on_subclass = True
+ msg = subclass_warn_message.format(
+ cls=_clspath(cls),
+ old=_clspath(old, old_class_path),
+ new=_clspath(new_class, new_class_path),
+ )
+ if warn_once:
+ msg += " (warning only on first subclass, there may be others)"
+ warnings.warn(msg, warn_category, stacklevel=2)
+ super().__init__(name, bases, clsdict_)
+
+ # see https://www.python.org/dev/peps/pep-3119/#overloading-isinstance-and-issubclass
+ # and https://docs.python.org/reference/datamodel.html#customizing-instance-and-subclass-checks
+ # for implementation details
+ def __instancecheck__(cls, inst: Any) -> bool:
+ return any(cls.__subclasscheck__(c) for c in (type(inst), inst.__class__))
+
+ def __subclasscheck__(cls, sub: type) -> bool:
+ if cls is not DeprecatedClass.deprecated_class:
+ # we should do the magic only if second `issubclass` argument
+ # is the deprecated class itself - subclasses of the
+ # deprecated class should not use custom `__subclasscheck__`
+ # method.
+ return super().__subclasscheck__(sub)
+
+ if not inspect.isclass(sub):
+ raise TypeError("issubclass() arg 1 must be a class")
+
+ mro = getattr(sub, "__mro__", ())
+ return any(c in {cls, new_class} for c in mro)
+
+ def __call__(cls, *args: Any, **kwargs: Any) -> Any:
+ old = DeprecatedClass.deprecated_class
+ if cls is old:
+ msg = instance_warn_message.format(
+ cls=_clspath(cls, old_class_path),
+ new=_clspath(new_class, new_class_path),
+ )
+ warnings.warn(msg, warn_category, stacklevel=2)
+ return super().__call__(*args, **kwargs)
+
+ deprecated_cls = DeprecatedClass(name, (new_class,), clsdict or {})
+
+ try:
+ frm = inspect.stack()[1]
+ parent_module = inspect.getmodule(frm[0])
+ if parent_module is not None:
+ deprecated_cls.__module__ = parent_module.__name__
+ except Exception as e:
+ # Sometimes inspect.stack() fails (e.g. when the first import of
+ # deprecated class is in jinja2 template). __module__ attribute is not
+ # important enough to raise an exception as users may be unable
+ # to fix inspect.stack() errors.
+ warnings.warn(f"Error detecting parent module: {e!r}")
+
+ return deprecated_cls
+
+
+def _clspath(cls: type, forced: Optional[str] = None) -> str:
+ if forced is not None:
+ return forced
+ return f"{cls.__module__}.{cls.__name__}"
DEPRECATION_RULES: List[Tuple[str, str]] = []
-def update_classpath(path: Any) ->Any:
+@overload
+def update_classpath(path: str) -> str:
+ ...
+
+
+@overload
+def update_classpath(path: Any) -> Any:
+ ...
+
+
+def update_classpath(path: Any) -> Any:
"""Update a deprecated path from an object with its new location"""
- pass
+ for prefix, replacement in DEPRECATION_RULES:
+ if isinstance(path, str) and path.startswith(prefix):
+ new_path = path.replace(prefix, replacement, 1)
+ warnings.warn(
+ f"`{path}` class is deprecated, use `{new_path}` instead",
+ ScrapyDeprecationWarning,
+ )
+ return new_path
+ return path
-def method_is_overridden(subclass: type, base_class: type, method_name: str
- ) ->bool:
+def method_is_overridden(subclass: type, base_class: type, method_name: str) -> bool:
"""
Return True if a method named ``method_name`` of a ``base_class``
is overridden in a ``subclass``.
@@ -76,4 +187,6 @@ def method_is_overridden(subclass: type, base_class: type, method_name: str
>>> method_is_overridden(Sub4, Base, 'foo')
True
"""
- pass
+ base_method = getattr(base_class, method_name)
+ sub_method = getattr(subclass, method_name)
+ return base_method.__code__ is not sub_method.__code__
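
`create_deprecated_class` builds a metaclass-backed alias that warns when the old name is subclassed or instantiated, while keeping `isinstance`/`issubclass` checks working across the old and new names. A sketch with made-up class names:

```python
# Sketch: renaming a class while keeping the old name importable.
# "NewPipeline"/"OldPipeline" are illustrative names.
import warnings

from scrapy.utils.deprecate import create_deprecated_class


class NewPipeline:
    pass


OldPipeline = create_deprecated_class("OldPipeline", NewPipeline)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")

    class UserPipeline(OldPipeline):   # subclassing the old name warns...
        pass

    obj = OldPipeline()                # ...and so does instantiating it

print(len(caught))                            # 2
print(issubclass(UserPipeline, NewPipeline))  # True
print(isinstance(obj, NewPipeline))           # True
```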
diff --git a/scrapy/utils/display.py b/scrapy/utils/display.py
index 887a1f2c6..596cf89e4 100644
--- a/scrapy/utils/display.py
+++ b/scrapy/utils/display.py
@@ -1,9 +1,51 @@
"""
pprint and pformat wrappers with colorization support
"""
+
import ctypes
import platform
import sys
from pprint import pformat as pformat_
from typing import Any
+
from packaging.version import Version as parse_version
+
+
+def _enable_windows_terminal_processing() -> bool:
+ # https://stackoverflow.com/a/36760881
+ kernel32 = ctypes.windll.kernel32 # type: ignore[attr-defined]
+ return bool(kernel32.SetConsoleMode(kernel32.GetStdHandle(-11), 7))
+
+
+def _tty_supports_color() -> bool:
+ if sys.platform != "win32":
+ return True
+
+ if parse_version(platform.version()) < parse_version("10.0.14393"):
+ return True
+
+ # Windows >= 10.0.14393 interprets ANSI escape sequences providing terminal
+ # processing is enabled.
+ return _enable_windows_terminal_processing()
+
+
+def _colorize(text: str, colorize: bool = True) -> str:
+ if not colorize or not sys.stdout.isatty() or not _tty_supports_color():
+ return text
+ try:
+ from pygments import highlight
+ except ImportError:
+ return text
+ else:
+ from pygments.formatters import TerminalFormatter
+ from pygments.lexers import PythonLexer
+
+ return highlight(text, PythonLexer(), TerminalFormatter())
+
+
+def pformat(obj: Any, *args: Any, **kwargs: Any) -> str:
+ return _colorize(pformat_(obj), kwargs.pop("colorize", True))
+
+
+def pprint(obj: Any, *args: Any, **kwargs: Any) -> None:
+ print(pformat(obj, *args, **kwargs))
diff --git a/scrapy/utils/engine.py b/scrapy/utils/engine.py
index 2861f1a69..a5f2a8c6e 100644
--- a/scrapy/utils/engine.py
+++ b/scrapy/utils/engine.py
@@ -1,10 +1,51 @@
"""Some debugging functions for working with the Scrapy engine"""
-from time import time
+
+# used in global tests code
+from time import time # noqa: F401
from typing import TYPE_CHECKING, Any, List, Tuple
+
if TYPE_CHECKING:
from scrapy.core.engine import ExecutionEngine
-def get_engine_status(engine: 'ExecutionEngine') ->List[Tuple[str, Any]]:
+def get_engine_status(engine: "ExecutionEngine") -> List[Tuple[str, Any]]:
"""Return a report of the current engine status"""
- pass
+ tests = [
+ "time()-engine.start_time",
+ "len(engine.downloader.active)",
+ "engine.scraper.is_idle()",
+ "engine.spider.name",
+ "engine.spider_is_idle()",
+ "engine.slot.closing",
+ "len(engine.slot.inprogress)",
+ "len(engine.slot.scheduler.dqs or [])",
+ "len(engine.slot.scheduler.mqs)",
+ "len(engine.scraper.slot.queue)",
+ "len(engine.scraper.slot.active)",
+ "engine.scraper.slot.active_size",
+ "engine.scraper.slot.itemproc_size",
+ "engine.scraper.slot.needs_backout()",
+ ]
+
+ checks: List[Tuple[str, Any]] = []
+ for test in tests:
+ try:
+ checks += [(test, eval(test))]
+ except Exception as e:
+ checks += [(test, f"{type(e).__name__} (exception)")]
+
+ return checks
+
+
+def format_engine_status(engine: "ExecutionEngine") -> str:
+ checks = get_engine_status(engine)
+ s = "Execution engine status\n\n"
+ for test, result in checks:
+ s += f"{test:<47} : {result}\n"
+ s += "\n"
+
+ return s
+
+
+def print_engine_status(engine: "ExecutionEngine") -> None:
+ print(format_engine_status(engine))
diff --git a/scrapy/utils/ftp.py b/scrapy/utils/ftp.py
index 6acf0ecb9..c77681a53 100644
--- a/scrapy/utils/ftp.py
+++ b/scrapy/utils/ftp.py
@@ -4,18 +4,42 @@ from posixpath import dirname
from typing import IO
-def ftp_makedirs_cwd(ftp: FTP, path: str, first_call: bool=True) ->None:
+def ftp_makedirs_cwd(ftp: FTP, path: str, first_call: bool = True) -> None:
"""Set the current directory of the FTP connection given in the ``ftp``
argument (as a ftplib.FTP object), creating all parent directories if they
don't exist. The ftplib.FTP object must be already connected and logged in.
"""
- pass
+ try:
+ ftp.cwd(path)
+ except error_perm:
+ ftp_makedirs_cwd(ftp, dirname(path), False)
+ ftp.mkd(path)
+ if first_call:
+ ftp.cwd(path)
-def ftp_store_file(*, path: str, file: IO, host: str, port: int, username:
- str, password: str, use_active_mode: bool=False, overwrite: bool=True
- ) ->None:
+def ftp_store_file(
+ *,
+ path: str,
+ file: IO,
+ host: str,
+ port: int,
+ username: str,
+ password: str,
+ use_active_mode: bool = False,
+ overwrite: bool = True,
+) -> None:
"""Opens a FTP connection with passed credentials,sets current directory
to the directory extracted from given path, then uploads the file to server
"""
- pass
+ with FTP() as ftp:
+ ftp.connect(host, port)
+ ftp.login(username, password)
+ if use_active_mode:
+ ftp.set_pasv(False)
+ file.seek(0)
+ dirname, filename = posixpath.split(path)
+ ftp_makedirs_cwd(ftp, dirname)
+ command = "STOR" if overwrite else "APPE"
+ ftp.storbinary(f"{command} {filename}", file)
+ file.close()
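
A hedged usage sketch of ftp_store_file (illustrative only, not part of the patch; the host and credentials are placeholders):

    from io import BytesIO
    from scrapy.utils.ftp import ftp_store_file

    ftp_store_file(
        path="/incoming/exports/items.csv",  # missing parent dirs are created
        file=BytesIO(b"id,name\n1,example\n"),
        host="ftp.example.com",              # placeholder server
        port=21,
        username="user",
        password="secret",
        overwrite=True,                      # False appends (APPE) instead of STOR
    )
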
diff --git a/scrapy/utils/gz.py b/scrapy/utils/gz.py
index 69751b6b3..2e487d88b 100644
--- a/scrapy/utils/gz.py
+++ b/scrapy/utils/gz.py
@@ -1,13 +1,42 @@
import struct
from gzip import GzipFile
from io import BytesIO
+
from scrapy.http import Response
+
from ._compression import _CHUNK_SIZE, _DecompressionMaxSizeExceeded
-def gunzip(data: bytes, *, max_size: int=0) ->bytes:
+def gunzip(data: bytes, *, max_size: int = 0) -> bytes:
"""Gunzip the given data and return as much data as possible.
This is resilient to CRC checksum errors.
"""
- pass
+ f = GzipFile(fileobj=BytesIO(data))
+ output_stream = BytesIO()
+ chunk = b"."
+ decompressed_size = 0
+ while chunk:
+ try:
+ chunk = f.read1(_CHUNK_SIZE)
+ except (OSError, EOFError, struct.error):
+ # complete only if there is some data, otherwise re-raise
+ # see issue 87 about catching struct.error
+ # some pages are quite small so output_stream is empty
+ if output_stream.getbuffer().nbytes > 0:
+ break
+ raise
+ decompressed_size += len(chunk)
+ if max_size and decompressed_size > max_size:
+ raise _DecompressionMaxSizeExceeded(
+ f"The number of bytes decompressed so far "
+ f"({decompressed_size} B) exceed the specified maximum "
+ f"({max_size} B)."
+ )
+ output_stream.write(chunk)
+ output_stream.seek(0)
+ return output_stream.read()
+
+
+def gzip_magic_number(response: Response) -> bool:
+ return response.body[:3] == b"\x1f\x8b\x08"
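
A small round-trip sketch for gunzip (illustrative only, not part of the patch):

    import gzip
    from scrapy.utils.gz import gunzip

    payload = gzip.compress(b"hello scrapy")
    assert gunzip(payload) == b"hello scrapy"
    # Passing e.g. max_size=4 here would raise _DecompressionMaxSizeExceeded,
    # which is how download-size limits guard against decompression bombs.
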
diff --git a/scrapy/utils/httpobj.py b/scrapy/utils/httpobj.py
index 8ecd2b938..d502e8910 100644
--- a/scrapy/utils/httpobj.py
+++ b/scrapy/utils/httpobj.py
@@ -1,15 +1,20 @@
"""Helper functions for scrapy.http objects (Request, Response)"""
+
from typing import Union
from urllib.parse import ParseResult, urlparse
from weakref import WeakKeyDictionary
+
from scrapy.http import Request, Response
-(_urlparse_cache: 'WeakKeyDictionary[Union[Request, Response], ParseResult]'
- ) = WeakKeyDictionary()
+
+_urlparse_cache: "WeakKeyDictionary[Union[Request, Response], ParseResult]" = (
+ WeakKeyDictionary()
+)
-def urlparse_cached(request_or_response: Union[Request, Response]
- ) ->ParseResult:
+def urlparse_cached(request_or_response: Union[Request, Response]) -> ParseResult:
"""Return urlparse.urlparse caching the result, where the argument can be a
Request or Response object
"""
- pass
+ if request_or_response not in _urlparse_cache:
+ _urlparse_cache[request_or_response] = urlparse(request_or_response.url)
+ return _urlparse_cache[request_or_response]
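
Usage sketch for the per-request urlparse cache (illustrative only, not part of the patch):

    from scrapy.http import Request
    from scrapy.utils.httpobj import urlparse_cached

    request = Request("https://example.com/path?q=1")
    parsed = urlparse_cached(request)          # parsed once, stored in the WeakKeyDictionary
    assert parsed.netloc == "example.com"
    assert urlparse_cached(request) is parsed  # second call returns the cached ParseResult
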
diff --git a/scrapy/utils/iterators.py b/scrapy/utils/iterators.py
index 8b4ca3eac..db86af2c3 100644
--- a/scrapy/utils/iterators.py
+++ b/scrapy/utils/iterators.py
@@ -2,20 +2,38 @@ import csv
import logging
import re
from io import StringIO
-from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Union, cast, overload
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Dict,
+ Generator,
+ Iterable,
+ List,
+ Literal,
+ Optional,
+ Union,
+ cast,
+ overload,
+)
from warnings import warn
-from lxml import etree
+
+from lxml import etree # nosec
+
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.http import Response, TextResponse
from scrapy.selector import Selector
from scrapy.utils.python import re_rsearch, to_unicode
+
if TYPE_CHECKING:
- from lxml._types import SupportsReadClose
+ from lxml._types import SupportsReadClose # nosec
+
logger = logging.getLogger(__name__)
-def xmliter(obj: Union[Response, str, bytes], nodename: str) ->Generator[
- Selector, Any, None]:
+def xmliter(
+ obj: Union[Response, str, bytes], nodename: str
+) -> Generator[Selector, Any, None]:
"""Return a iterator of Selector's over all nodes of a XML document,
given the name of the node to iterate. Useful for parsing XML feeds.
@@ -24,27 +42,137 @@ def xmliter(obj: Union[Response, str, bytes], nodename: str) ->Generator[
- a unicode string
- a string encoded as utf-8
"""
- pass
+ warn(
+ (
+ "xmliter is deprecated and its use strongly discouraged because "
+ "it is vulnerable to ReDoS attacks. Use xmliter_lxml instead. See "
+ "https://github.com/scrapy/scrapy/security/advisories/GHSA-cc65-xxvf-f7r9"
+ ),
+ ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+ nodename_patt = re.escape(nodename)
+
+ DOCUMENT_HEADER_RE = re.compile(r"<\?xml[^>]+>\s*", re.S)
+ HEADER_END_RE = re.compile(rf"<\s*/{nodename_patt}\s*>", re.S)
+ END_TAG_RE = re.compile(r"<\s*/([^\s>]+)\s*>", re.S)
+ NAMESPACE_RE = re.compile(r"((xmlns[:A-Za-z]*)=[^>\s]+)", re.S)
+ text = _body_or_str(obj)
+
+ document_header_match = re.search(DOCUMENT_HEADER_RE, text)
+ document_header = (
+ document_header_match.group().strip() if document_header_match else ""
+ )
+ header_end_idx = re_rsearch(HEADER_END_RE, text)
+ header_end = text[header_end_idx[1] :].strip() if header_end_idx else ""
+ namespaces: Dict[str, str] = {}
+ if header_end:
+ for tagname in reversed(re.findall(END_TAG_RE, header_end)):
+ assert header_end_idx
+ tag = re.search(
+ rf"<\s*{tagname}.*?xmlns[:=][^>]*>", text[: header_end_idx[1]], re.S
+ )
+ if tag:
+ for x in re.findall(NAMESPACE_RE, tag.group()):
+ namespaces[x[1]] = x[0]
+
+ r = re.compile(rf"<{nodename_patt}[\s>].*?</{nodename_patt}>", re.DOTALL)
+ for match in r.finditer(text):
+ nodetext = (
+ document_header
+ + match.group().replace(
+ nodename, f'{nodename} {" ".join(namespaces.values())}', 1
+ )
+ + header_end
+ )
+ yield Selector(text=nodetext, type="xml")
-class _StreamReader:
+def xmliter_lxml(
+ obj: Union[Response, str, bytes],
+ nodename: str,
+ namespace: Optional[str] = None,
+ prefix: str = "x",
+) -> Generator[Selector, Any, None]:
+ reader = _StreamReader(obj)
+ tag = f"{{{namespace}}}{nodename}" if namespace else nodename
+ iterable = etree.iterparse(
+ cast("SupportsReadClose[bytes]", reader),
+ encoding=reader.encoding,
+ events=("end", "start-ns"),
+ resolve_entities=False,
+ huge_tree=True,
+ )
+ selxpath = "//" + (f"{prefix}:{nodename}" if namespace else nodename)
+ needs_namespace_resolution = not namespace and ":" in nodename
+ if needs_namespace_resolution:
+ prefix, nodename = nodename.split(":", maxsplit=1)
+ for event, data in iterable:
+ if event == "start-ns":
+ assert isinstance(data, tuple)
+ if needs_namespace_resolution:
+ _prefix, _namespace = data
+ if _prefix != prefix:
+ continue
+ namespace = _namespace
+ needs_namespace_resolution = False
+ selxpath = f"//{prefix}:{nodename}"
+ tag = f"{{{namespace}}}{nodename}"
+ continue
+ assert isinstance(data, etree._Element)
+ node = data
+ if node.tag != tag:
+ continue
+ nodetext = etree.tostring(node, encoding="unicode")
+ node.clear()
+ xs = Selector(text=nodetext, type="xml")
+ if namespace:
+ xs.register_namespace(prefix, namespace)
+ yield xs.xpath(selxpath)[0]
+
+
+class _StreamReader:
def __init__(self, obj: Union[Response, str, bytes]):
self._ptr: int = 0
self._text: Union[str, bytes]
if isinstance(obj, TextResponse):
self._text, self.encoding = obj.body, obj.encoding
elif isinstance(obj, Response):
- self._text, self.encoding = obj.body, 'utf-8'
+ self._text, self.encoding = obj.body, "utf-8"
else:
- self._text, self.encoding = obj, 'utf-8'
+ self._text, self.encoding = obj, "utf-8"
self._is_unicode: bool = isinstance(self._text, str)
self._is_first_read: bool = True
+ def read(self, n: int = 65535) -> bytes:
+ method: Callable[[int], bytes] = (
+ self._read_unicode if self._is_unicode else self._read_string
+ )
+ result = method(n)
+ if self._is_first_read:
+ self._is_first_read = False
+ result = result.lstrip()
+ return result
+
+ def _read_string(self, n: int = 65535) -> bytes:
+ s, e = self._ptr, self._ptr + n
+ self._ptr = e
+ return cast(bytes, self._text)[s:e]
-def csviter(obj: Union[Response, str, bytes], delimiter: Optional[str]=None,
- headers: Optional[List[str]]=None, encoding: Optional[str]=None,
- quotechar: Optional[str]=None) ->Generator[Dict[str, str], Any, None]:
+ def _read_unicode(self, n: int = 65535) -> bytes:
+ s, e = self._ptr, self._ptr + n
+ self._ptr = e
+ return cast(str, self._text)[s:e].encode("utf-8")
+
+
+def csviter(
+ obj: Union[Response, str, bytes],
+ delimiter: Optional[str] = None,
+ headers: Optional[List[str]] = None,
+ encoding: Optional[str] = None,
+ quotechar: Optional[str] = None,
+) -> Generator[Dict[str, str], Any, None]:
"""Returns an iterator of dictionaries from the given csv object
obj can be:
@@ -59,4 +187,74 @@ def csviter(obj: Union[Response, str, bytes], delimiter: Optional[str]=None,
quotechar is the character used to enclosure fields on the given obj.
"""
- pass
+
+ encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or "utf-8"
+
+ def row_to_unicode(row_: Iterable) -> List[str]:
+ return [to_unicode(field, encoding) for field in row_]
+
+ lines = StringIO(_body_or_str(obj, unicode=True))
+
+ kwargs: Dict[str, Any] = {}
+ if delimiter:
+ kwargs["delimiter"] = delimiter
+ if quotechar:
+ kwargs["quotechar"] = quotechar
+ csv_r = csv.reader(lines, **kwargs)
+
+ if not headers:
+ try:
+ row = next(csv_r)
+ except StopIteration:
+ return
+ headers = row_to_unicode(row)
+
+ for row in csv_r:
+ row = row_to_unicode(row)
+ if len(row) != len(headers):
+ logger.warning(
+ "ignoring row %(csvlnum)d (length: %(csvrow)d, "
+ "should be: %(csvheader)d)",
+ {
+ "csvlnum": csv_r.line_num,
+ "csvrow": len(row),
+ "csvheader": len(headers),
+ },
+ )
+ continue
+ yield dict(zip(headers, row))
+
+
+@overload
+def _body_or_str(obj: Union[Response, str, bytes]) -> str:
+ ...
+
+
+@overload
+def _body_or_str(obj: Union[Response, str, bytes], unicode: Literal[True]) -> str:
+ ...
+
+
+@overload
+def _body_or_str(obj: Union[Response, str, bytes], unicode: Literal[False]) -> bytes:
+ ...
+
+
+def _body_or_str(
+ obj: Union[Response, str, bytes], unicode: bool = True
+) -> Union[str, bytes]:
+ expected_types = (Response, str, bytes)
+ if not isinstance(obj, expected_types):
+ expected_types_str = " or ".join(t.__name__ for t in expected_types)
+ raise TypeError(
+ f"Object {obj!r} must be {expected_types_str}, not {type(obj).__name__}"
+ )
+ if isinstance(obj, Response):
+ if not unicode:
+ return cast(bytes, obj.body)
+ if isinstance(obj, TextResponse):
+ return obj.text
+ return cast(bytes, obj.body).decode("utf-8")
+ if isinstance(obj, str):
+ return obj if unicode else obj.encode("utf-8")
+ return obj.decode("utf-8") if unicode else obj
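
An illustrative sketch of the two main iterators restored above (not part of the patch; the feed contents are made up):

    from scrapy.http import TextResponse
    from scrapy.utils.iterators import csviter, xmliter_lxml

    xml = b"<items><item><id>1</id></item><item><id>2</id></item></items>"
    response = TextResponse(url="https://example.com/feed.xml", body=xml, encoding="utf-8")
    ids = [node.xpath(".//id/text()").get() for node in xmliter_lxml(response, "item")]
    assert ids == ["1", "2"]

    rows = list(csviter("id,name\n1,foo\n2,bar\n"))
    assert rows == [{"id": "1", "name": "foo"}, {"id": "2", "name": "bar"}]
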
diff --git a/scrapy/utils/job.py b/scrapy/utils/job.py
index 1149db8f5..e230e4235 100644
--- a/scrapy/utils/job.py
+++ b/scrapy/utils/job.py
@@ -1,3 +1,13 @@
from pathlib import Path
from typing import Optional
+
from scrapy.settings import BaseSettings
+
+
+def job_dir(settings: BaseSettings) -> Optional[str]:
+ path: Optional[str] = settings["JOBDIR"]
+ if not path:
+ return None
+ if not Path(path).exists():
+ Path(path).mkdir(parents=True)
+ return path
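
Usage sketch (illustrative only; the JOBDIR value is a hypothetical path that gets created on first use):

    from scrapy.settings import Settings
    from scrapy.utils.job import job_dir

    assert job_dir(Settings()) is None                          # no JOBDIR configured
    path = job_dir(Settings({"JOBDIR": "crawls/example-run"}))  # created if missing
    print(path)  # "crawls/example-run"
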
diff --git a/scrapy/utils/log.py b/scrapy/utils/log.py
index cbe04ec83..e85082963 100644
--- a/scrapy/utils/log.py
+++ b/scrapy/utils/log.py
@@ -1,25 +1,49 @@
from __future__ import annotations
+
import logging
import sys
import warnings
from logging.config import dictConfig
from types import TracebackType
-from typing import TYPE_CHECKING, Any, List, MutableMapping, Optional, Tuple, Type, Union, cast
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ List,
+ MutableMapping,
+ Optional,
+ Tuple,
+ Type,
+ Union,
+ cast,
+)
+
from twisted.python import log as twisted_log
from twisted.python.failure import Failure
+
import scrapy
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.settings import Settings
from scrapy.utils.versions import scrapy_components_versions
+
if TYPE_CHECKING:
from scrapy.crawler import Crawler
+
logger = logging.getLogger(__name__)
-def failure_to_exc_info(failure: Failure) ->Optional[Tuple[Type[
- BaseException], BaseException, Optional[TracebackType]]]:
+def failure_to_exc_info(
+ failure: Failure,
+) -> Optional[Tuple[Type[BaseException], BaseException, Optional[TracebackType]]]:
"""Extract exc_info from Failure instances"""
- pass
+ if isinstance(failure, Failure):
+ assert failure.type
+ assert failure.value
+ return (
+ failure.type,
+ failure.value,
+ cast(Optional[TracebackType], failure.getTracebackObject()),
+ )
+ return None
class TopLevelFormatter(logging.Filter):
@@ -34,17 +58,38 @@ class TopLevelFormatter(logging.Filter):
``loggers`` list where it should act.
"""
- def __init__(self, loggers: Optional[List[str]]=None):
+ def __init__(self, loggers: Optional[List[str]] = None):
self.loggers: List[str] = loggers or []
-
-DEFAULT_LOGGING = {'version': 1, 'disable_existing_loggers': False,
- 'loggers': {'filelock': {'level': 'ERROR'}, 'hpack': {'level': 'ERROR'},
- 'scrapy': {'level': 'DEBUG'}, 'twisted': {'level': 'ERROR'}}}
-
-
-def configure_logging(settings: Union[Settings, dict, None]=None,
- install_root_handler: bool=True) ->None:
+ def filter(self, record: logging.LogRecord) -> bool:
+ if any(record.name.startswith(logger + ".") for logger in self.loggers):
+ record.name = record.name.split(".", 1)[0]
+ return True
+
+
+DEFAULT_LOGGING = {
+ "version": 1,
+ "disable_existing_loggers": False,
+ "loggers": {
+ "filelock": {
+ "level": "ERROR",
+ },
+ "hpack": {
+ "level": "ERROR",
+ },
+ "scrapy": {
+ "level": "DEBUG",
+ },
+ "twisted": {
+ "level": "ERROR",
+ },
+ },
+}
+
+
+def configure_logging(
+ settings: Union[Settings, dict, None] = None, install_root_handler: bool = True
+) -> None:
"""
Initialize logging defaults for Scrapy.
@@ -68,15 +113,93 @@ def configure_logging(settings: Union[Settings, dict, None]=None,
using ``settings`` argument. When ``settings`` is empty or None, defaults
are used.
"""
- pass
+ if not sys.warnoptions:
+ # Route warnings through python logging
+ logging.captureWarnings(True)
+
+ observer = twisted_log.PythonLoggingObserver("twisted")
+ observer.start()
+
+ dictConfig(DEFAULT_LOGGING)
+
+ if isinstance(settings, dict) or settings is None:
+ settings = Settings(settings)
+
+ if settings.getbool("LOG_STDOUT"):
+ sys.stdout = StreamLogger(logging.getLogger("stdout")) # type: ignore[assignment]
+
+ if install_root_handler:
+ install_scrapy_root_handler(settings)
_scrapy_root_handler: Optional[logging.Handler] = None
-def _get_handler(settings: Settings) ->logging.Handler:
+def install_scrapy_root_handler(settings: Settings) -> None:
+ global _scrapy_root_handler
+
+ if (
+ _scrapy_root_handler is not None
+ and _scrapy_root_handler in logging.root.handlers
+ ):
+ logging.root.removeHandler(_scrapy_root_handler)
+ logging.root.setLevel(logging.NOTSET)
+ _scrapy_root_handler = _get_handler(settings)
+ logging.root.addHandler(_scrapy_root_handler)
+
+
+def get_scrapy_root_handler() -> Optional[logging.Handler]:
+ return _scrapy_root_handler
+
+
+def _get_handler(settings: Settings) -> logging.Handler:
"""Return a log handler object according to settings"""
- pass
+ filename = settings.get("LOG_FILE")
+ handler: logging.Handler
+ if filename:
+ mode = "a" if settings.getbool("LOG_FILE_APPEND") else "w"
+ encoding = settings.get("LOG_ENCODING")
+ handler = logging.FileHandler(filename, mode=mode, encoding=encoding)
+ elif settings.getbool("LOG_ENABLED"):
+ handler = logging.StreamHandler()
+ else:
+ handler = logging.NullHandler()
+
+ formatter = logging.Formatter(
+ fmt=settings.get("LOG_FORMAT"), datefmt=settings.get("LOG_DATEFORMAT")
+ )
+ handler.setFormatter(formatter)
+ handler.setLevel(settings.get("LOG_LEVEL"))
+ if settings.getbool("LOG_SHORT_NAMES"):
+ handler.addFilter(TopLevelFormatter(["scrapy"]))
+ return handler
+
+
+def log_scrapy_info(settings: Settings) -> None:
+ logger.info(
+ "Scrapy %(version)s started (bot: %(bot)s)",
+ {"version": scrapy.__version__, "bot": settings["BOT_NAME"]},
+ )
+ versions = [
+ f"{name} {version}"
+ for name, version in scrapy_components_versions()
+ if name != "Scrapy"
+ ]
+ logger.info("Versions: %(versions)s", {"versions": ", ".join(versions)})
+
+
+def log_reactor_info() -> None:
+ from twisted.internet import reactor
+
+ logger.debug("Using reactor: %s.%s", reactor.__module__, reactor.__class__.__name__)
+ from twisted.internet import asyncioreactor
+
+ if isinstance(reactor, asyncioreactor.AsyncioSelectorReactor):
+ logger.debug(
+ "Using asyncio event loop: %s.%s",
+ reactor._asyncioEventloop.__module__,
+ reactor._asyncioEventloop.__class__.__name__,
+ )
class StreamLogger:
@@ -86,10 +209,18 @@ class StreamLogger:
https://www.electricmonk.nl/log/2011/08/14/redirect-stdout-and-stderr-to-a-logger-in-python/
"""
- def __init__(self, logger: logging.Logger, log_level: int=logging.INFO):
+ def __init__(self, logger: logging.Logger, log_level: int = logging.INFO):
self.logger: logging.Logger = logger
self.log_level: int = log_level
- self.linebuf: str = ''
+ self.linebuf: str = ""
+
+ def write(self, buf: str) -> None:
+ for line in buf.rstrip().splitlines():
+ self.logger.log(self.log_level, line.rstrip())
+
+ def flush(self) -> None:
+ for h in self.logger.handlers:
+ h.flush()
class LogCounterHandler(logging.Handler):
@@ -99,19 +230,45 @@ class LogCounterHandler(logging.Handler):
super().__init__(*args, **kwargs)
self.crawler: Crawler = crawler
+ def emit(self, record: logging.LogRecord) -> None:
+ sname = f"log_count/{record.levelname}"
+ assert self.crawler.stats
+ self.crawler.stats.inc_value(sname)
-def logformatter_adapter(logkws: dict) ->Tuple[int, str, dict]:
+
+def logformatter_adapter(logkws: dict) -> Tuple[int, str, dict]:
"""
Helper that takes the dictionary output from the methods in LogFormatter
and adapts it into a tuple of positional arguments for logger.log calls,
handling backward compatibility as well.
"""
- pass
+ if not {"level", "msg", "args"} <= set(logkws):
+ warnings.warn("Missing keys in LogFormatter method", ScrapyDeprecationWarning)
+ if "format" in logkws:
+ warnings.warn(
+ "`format` key in LogFormatter methods has been "
+ "deprecated, use `msg` instead",
+ ScrapyDeprecationWarning,
+ )
-class SpiderLoggerAdapter(logging.LoggerAdapter):
+ level = logkws.get("level", logging.INFO)
+ message = logkws.get("format", logkws.get("msg"))
+    # NOTE: This also handles 'args' being an empty dict, since an empty
+    # dict doesn't play well in logger.log calls
+ args = logkws if not logkws.get("args") else logkws["args"]
+
+ return (level, message, args)
- def process(self, msg: str, kwargs: MutableMapping[str, Any]) ->Tuple[
- str, MutableMapping[str, Any]]:
+
+class SpiderLoggerAdapter(logging.LoggerAdapter):
+ def process(
+ self, msg: str, kwargs: MutableMapping[str, Any]
+ ) -> Tuple[str, MutableMapping[str, Any]]:
"""Method that augments logging with additional 'extra' data"""
- pass
+ if isinstance(kwargs.get("extra"), MutableMapping):
+ kwargs["extra"].update(self.extra)
+ else:
+ kwargs["extra"] = self.extra
+
+ return msg, kwargs
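
An illustrative sketch of the standalone-script pattern these helpers enable (not part of the patch):

    import logging
    from scrapy.utils.log import configure_logging

    # Let Scrapy capture warnings and route Twisted logging, but keep control
    # of the handlers by skipping the root handler installation.
    configure_logging(install_root_handler=False)
    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
    logging.getLogger("scrapy").info("handled by the stdlib root handler")
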
diff --git a/scrapy/utils/misc.py b/scrapy/utils/misc.py
index 6e45b6a2b..b3c28da92 100644
--- a/scrapy/utils/misc.py
+++ b/scrapy/utils/misc.py
@@ -11,27 +11,49 @@ from functools import partial
from importlib import import_module
from pkgutil import iter_modules
from types import ModuleType
-from typing import IO, TYPE_CHECKING, Any, Callable, Deque, Generator, Iterable, List, Optional, Pattern, Union, cast
+from typing import (
+ IO,
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ Deque,
+ Generator,
+ Iterable,
+ List,
+ Optional,
+ Pattern,
+ Union,
+ cast,
+)
+
from w3lib.html import replace_entities
+
from scrapy.item import Item
from scrapy.utils.datatypes import LocalWeakReferencedCache
from scrapy.utils.deprecate import ScrapyDeprecationWarning
from scrapy.utils.python import flatten, to_unicode
+
if TYPE_CHECKING:
from scrapy import Spider
+
+
_ITERABLE_SINGLE_VALUES = dict, Item, str, bytes
-def arg_to_iter(arg: Any) ->Iterable[Any]:
+def arg_to_iter(arg: Any) -> Iterable[Any]:
"""Convert an argument to an iterable. The argument can be a None, single
value, or an iterable.
Exception: if arg is a dict, [arg] will be returned
"""
- pass
+ if arg is None:
+ return []
+ if not isinstance(arg, _ITERABLE_SINGLE_VALUES) and hasattr(arg, "__iter__"):
+ return cast(Iterable[Any], arg)
+ return [arg]
-def load_object(path: Union[str, Callable]) ->Any:
+def load_object(path: Union[str, Callable]) -> Any:
"""Load an object given its absolute object path, and return it.
The object can be the import path of a class, function, variable or an
@@ -40,31 +62,86 @@ def load_object(path: Union[str, Callable]) ->Any:
If ``path`` is not a string, but is a callable object, such as a class or
a function, then return it as is.
"""
- pass
+
+ if not isinstance(path, str):
+ if callable(path):
+ return path
+ raise TypeError(
+ f"Unexpected argument type, expected string or object, got: {type(path)}"
+ )
+
+ try:
+ dot = path.rindex(".")
+ except ValueError:
+ raise ValueError(f"Error loading object '{path}': not a full path")
+
+ module, name = path[:dot], path[dot + 1 :]
+ mod = import_module(module)
+
+ try:
+ obj = getattr(mod, name)
+ except AttributeError:
+ raise NameError(f"Module '{module}' doesn't define any object named '{name}'")
+
+ return obj
-def walk_modules(path: str) ->List[ModuleType]:
+def walk_modules(path: str) -> List[ModuleType]:
"""Loads a module and all its submodules from the given module path and
returns them. If *any* module throws an exception while importing, that
exception is thrown back.
For example: walk_modules('scrapy.utils')
"""
- pass
-
-def extract_regex(regex: Union[str, Pattern], text: str, encoding: str='utf-8'
- ) ->List[str]:
+ mods: List[ModuleType] = []
+ mod = import_module(path)
+ mods.append(mod)
+ if hasattr(mod, "__path__"):
+ for _, subpath, ispkg in iter_modules(mod.__path__):
+ fullpath = path + "." + subpath
+ if ispkg:
+ mods += walk_modules(fullpath)
+ else:
+ submod = import_module(fullpath)
+ mods.append(submod)
+ return mods
+
+
+def extract_regex(
+ regex: Union[str, Pattern], text: str, encoding: str = "utf-8"
+) -> List[str]:
"""Extract a list of unicode strings from the given text/encoding using the following policies:
* if the regex contains a named group called "extract" that will be returned
* if the regex contains multiple numbered groups, all those will be returned (flattened)
* if the regex doesn't contain any group the entire regex matching is returned
"""
- pass
-
-
-def md5sum(file: IO) ->str:
+ warnings.warn(
+ "scrapy.utils.misc.extract_regex has moved to parsel.utils.extract_regex.",
+ ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+
+ if isinstance(regex, str):
+ regex = re.compile(regex, re.UNICODE)
+
+ try:
+ # named group
+ strings = [regex.search(text).group("extract")] # type: ignore[union-attr]
+ except Exception:
+ # full regex or numbered groups
+ strings = regex.findall(text)
+ strings = flatten(strings)
+
+ if isinstance(text, str):
+ return [replace_entities(s, keep=["lt", "amp"]) for s in strings]
+ return [
+ replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings
+ ]
+
+
+def md5sum(file: IO) -> str:
"""Calculate the md5 checksum of a file-like object without reading its
whole content in memory.
@@ -72,12 +149,18 @@ def md5sum(file: IO) ->str:
>>> md5sum(BytesIO(b'file content to hash'))
'784406af91dd5a54fbb9c84c2236595a'
"""
- pass
+ m = hashlib.md5()
+ while True:
+ d = file.read(8096)
+ if not d:
+ break
+ m.update(d)
+ return m.hexdigest()
-def rel_has_nofollow(rel: Optional[str]) ->bool:
+def rel_has_nofollow(rel: Optional[str]) -> bool:
"""Return True if link rel attribute has nofollow type"""
- pass
+ return rel is not None and "nofollow" in rel.replace(",", " ").split()
def create_instance(objcls, settings, crawler, *args, **kwargs):
@@ -97,39 +180,120 @@ def create_instance(objcls, settings, crawler, *args, **kwargs):
Raises ``TypeError`` if the resulting instance is ``None`` (e.g. if an
extension has not been implemented correctly).
"""
- pass
+ if settings is None:
+ if crawler is None:
+ raise ValueError("Specify at least one of settings and crawler.")
+ settings = crawler.settings
+ if crawler and hasattr(objcls, "from_crawler"):
+ instance = objcls.from_crawler(crawler, *args, **kwargs)
+ method_name = "from_crawler"
+ elif hasattr(objcls, "from_settings"):
+ instance = objcls.from_settings(settings, *args, **kwargs)
+ method_name = "from_settings"
+ else:
+ instance = objcls(*args, **kwargs)
+ method_name = "__new__"
+ if instance is None:
+ raise TypeError(f"{objcls.__qualname__}.{method_name} returned None")
+ return instance
@contextmanager
-def set_environ(**kwargs: str) ->Generator[None, Any, None]:
+def set_environ(**kwargs: str) -> Generator[None, Any, None]:
"""Temporarily set environment variables inside the context manager and
fully restore previous environment afterwards
"""
- pass
+ original_env = {k: os.environ.get(k) for k in kwargs}
+ os.environ.update(kwargs)
+ try:
+ yield
+ finally:
+ for k, v in original_env.items():
+ if v is None:
+ del os.environ[k]
+ else:
+ os.environ[k] = v
-def walk_callable(node: ast.AST) ->Generator[ast.AST, Any, None]:
+
+def walk_callable(node: ast.AST) -> Generator[ast.AST, Any, None]:
"""Similar to ``ast.walk``, but walks only function body and skips nested
functions defined within the node.
"""
- pass
+ todo: Deque[ast.AST] = deque([node])
+ walked_func_def = False
+ while todo:
+ node = todo.popleft()
+ if isinstance(node, ast.FunctionDef):
+ if walked_func_def:
+ continue
+ walked_func_def = True
+ todo.extend(ast.iter_child_nodes(node))
+ yield node
_generator_callbacks_cache = LocalWeakReferencedCache(limit=128)
-def is_generator_with_return_value(callable: Callable) ->bool:
+def is_generator_with_return_value(callable: Callable) -> bool:
"""
Returns True if a callable is a generator function which includes a
'return' statement with a value different than None, False otherwise
"""
- pass
+ if callable in _generator_callbacks_cache:
+ return bool(_generator_callbacks_cache[callable])
+
+ def returns_none(return_node: ast.Return) -> bool:
+ value = return_node.value
+ return (
+ value is None or isinstance(value, ast.NameConstant) and value.value is None
+ )
+
+ if inspect.isgeneratorfunction(callable):
+ func = callable
+ while isinstance(func, partial):
+ func = func.func
+
+ src = inspect.getsource(func)
+ pattern = re.compile(r"(^[\t ]+)")
+ code = pattern.sub("", src)
+
+ match = pattern.match(src) # finds indentation
+ if match:
+ code = re.sub(f"\n{match.group(0)}", "\n", code) # remove indentation
+
+ tree = ast.parse(code)
+ for node in walk_callable(tree):
+ if isinstance(node, ast.Return) and not returns_none(node):
+ _generator_callbacks_cache[callable] = True
+ return bool(_generator_callbacks_cache[callable])
+
+ _generator_callbacks_cache[callable] = False
+ return bool(_generator_callbacks_cache[callable])
-def warn_on_generator_with_return_value(spider: 'Spider', callable: Callable
- ) ->None:
+def warn_on_generator_with_return_value(spider: "Spider", callable: Callable) -> None:
"""
Logs a warning if a callable is a generator function and includes
a 'return' statement with a value different than None
"""
- pass
+ try:
+ if is_generator_with_return_value(callable):
+ warnings.warn(
+ f'The "{spider.__class__.__name__}.{callable.__name__}" method is '
+ 'a generator and includes a "return" statement with a value '
+ "different than None. This could lead to unexpected behaviour. Please see "
+ "https://docs.python.org/3/reference/simple_stmts.html#the-return-statement "
+ 'for details about the semantics of the "return" statement within generators',
+ stacklevel=2,
+ )
+ except IndentationError:
+ callable_name = spider.__class__.__name__ + "." + callable.__name__
+ warnings.warn(
+ f'Unable to determine whether or not "{callable_name}" is a generator with a return value. '
+ "This will not prevent your code from working, but it prevents Scrapy from detecting "
+ f'potential issues in your implementation of "{callable_name}". Please, report this in the '
+ "Scrapy issue tracker (https://github.com/scrapy/scrapy/issues), "
+ f'including the code of "{callable_name}"',
+ stacklevel=2,
+ )
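
A few quick self-checks for the restored helpers (illustrative only, not part of the patch):

    from scrapy.utils.misc import arg_to_iter, load_object

    assert list(arg_to_iter(None)) == []
    assert list(arg_to_iter("single")) == ["single"]   # strings stay single values
    assert list(arg_to_iter([1, 2])) == [1, 2]

    cls = load_object("scrapy.http.Request")           # dotted path -> object
    assert load_object(cls) is cls                     # callables pass through unchanged
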
diff --git a/scrapy/utils/ossignal.py b/scrapy/utils/ossignal.py
index 012a5cf9a..db9a71273 100644
--- a/scrapy/utils/ossignal.py
+++ b/scrapy/utils/ossignal.py
@@ -1,21 +1,31 @@
import signal
from types import FrameType
from typing import Any, Callable, Dict, Optional, Union
-SignalHandlerT = Union[Callable[[int, Optional[FrameType]], Any], int,
- signal.Handlers, None]
+
+# copy of _HANDLER from typeshed/stdlib/signal.pyi
+SignalHandlerT = Union[
+ Callable[[int, Optional[FrameType]], Any], int, signal.Handlers, None
+]
+
signal_names: Dict[int, str] = {}
for signame in dir(signal):
- if signame.startswith('SIG') and not signame.startswith('SIG_'):
+ if signame.startswith("SIG") and not signame.startswith("SIG_"):
signum = getattr(signal, signame)
if isinstance(signum, int):
signal_names[signum] = signame
-def install_shutdown_handlers(function: SignalHandlerT, override_sigint:
- bool=True) ->None:
+def install_shutdown_handlers(
+ function: SignalHandlerT, override_sigint: bool = True
+) -> None:
"""Install the given function as a signal handler for all common shutdown
signals (such as SIGINT, SIGTERM, etc). If ``override_sigint`` is ``False`` the
SIGINT handler won't be installed if there is already a handler in place
(e.g. Pdb)
"""
- pass
+ signal.signal(signal.SIGTERM, function)
+ if signal.getsignal(signal.SIGINT) == signal.default_int_handler or override_sigint:
+ signal.signal(signal.SIGINT, function)
+ # Catch Ctrl-Break in windows
+ if hasattr(signal, "SIGBREAK"):
+ signal.signal(signal.SIGBREAK, function)
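
Usage sketch for install_shutdown_handlers (illustrative only, not part of the patch):

    import signal
    from scrapy.utils.ossignal import install_shutdown_handlers, signal_names

    def shutdown(signum, frame):
        print(f"caught {signal_names.get(signum, signum)}, shutting down")

    # Hooks SIGTERM and SIGINT (plus SIGBREAK on Windows); pass
    # override_sigint=False to respect an existing SIGINT handler such as pdb.
    install_shutdown_handlers(shutdown)
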
diff --git a/scrapy/utils/project.py b/scrapy/utils/project.py
index ce406428a..a2c224b90 100644
--- a/scrapy/utils/project.py
+++ b/scrapy/utils/project.py
@@ -2,21 +2,87 @@ import os
import warnings
from importlib import import_module
from pathlib import Path
+
from scrapy.exceptions import NotConfigured
from scrapy.settings import Settings
from scrapy.utils.conf import closest_scrapy_cfg, get_config, init_env
-ENVVAR = 'SCRAPY_SETTINGS_MODULE'
-DATADIR_CFG_SECTION = 'datadir'
+ENVVAR = "SCRAPY_SETTINGS_MODULE"
+DATADIR_CFG_SECTION = "datadir"
+
+
+def inside_project() -> bool:
+ scrapy_module = os.environ.get(ENVVAR)
+ if scrapy_module:
+ try:
+ import_module(scrapy_module)
+ except ImportError as exc:
+ warnings.warn(
+ f"Cannot import scrapy settings module {scrapy_module}: {exc}"
+ )
+ else:
+ return True
+ return bool(closest_scrapy_cfg())
-def project_data_dir(project: str='default') ->str:
+
+def project_data_dir(project: str = "default") -> str:
"""Return the current project data dir, creating it if it doesn't exist"""
- pass
+ if not inside_project():
+ raise NotConfigured("Not inside a project")
+ cfg = get_config()
+ if cfg.has_option(DATADIR_CFG_SECTION, project):
+ d = Path(cfg.get(DATADIR_CFG_SECTION, project))
+ else:
+ scrapy_cfg = closest_scrapy_cfg()
+ if not scrapy_cfg:
+ raise NotConfigured(
+ "Unable to find scrapy.cfg file to infer project data dir"
+ )
+ d = (Path(scrapy_cfg).parent / ".scrapy").resolve()
+ if not d.exists():
+ d.mkdir(parents=True)
+ return str(d)
-def data_path(path: str, createdir: bool=False) ->str:
+def data_path(path: str, createdir: bool = False) -> str:
"""
Return the given path joined with the .scrapy data directory.
If given an absolute path, return it unmodified.
"""
- pass
+ path_obj = Path(path)
+ if not path_obj.is_absolute():
+ if inside_project():
+ path_obj = Path(project_data_dir(), path)
+ else:
+ path_obj = Path(".scrapy", path)
+ if createdir and not path_obj.exists():
+ path_obj.mkdir(parents=True)
+ return str(path_obj)
+
+
+def get_project_settings() -> Settings:
+ if ENVVAR not in os.environ:
+ project = os.environ.get("SCRAPY_PROJECT", "default")
+ init_env(project)
+
+ settings = Settings()
+ settings_module_path = os.environ.get(ENVVAR)
+ if settings_module_path:
+ settings.setmodule(settings_module_path, priority="project")
+
+ valid_envvars = {
+ "CHECK",
+ "PROJECT",
+ "PYTHON_SHELL",
+ "SETTINGS_MODULE",
+ }
+
+ scrapy_envvars = {
+ k[7:]: v
+ for k, v in os.environ.items()
+ if k.startswith("SCRAPY_") and k.replace("SCRAPY_", "") in valid_envvars
+ }
+
+ settings.setdict(scrapy_envvars, priority="project")
+
+ return settings
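
Illustrative sketch (not part of the patch); outside a project this simply falls back to the default settings:

    from scrapy.utils.project import get_project_settings, inside_project

    settings = get_project_settings()   # honours SCRAPY_SETTINGS_MODULE / scrapy.cfg
    print(inside_project(), settings.get("BOT_NAME"))
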
diff --git a/scrapy/utils/python.py b/scrapy/utils/python.py
index fa47a6995..20305a75e 100644
--- a/scrapy/utils/python.py
+++ b/scrapy/utils/python.py
@@ -9,11 +9,29 @@ import sys
import weakref
from functools import partial, wraps
from itertools import chain
-from typing import Any, AsyncGenerator, AsyncIterable, AsyncIterator, Callable, Dict, Generator, Iterable, Iterator, List, Mapping, Optional, Pattern, Tuple, Union, overload
+from typing import (
+ Any,
+ AsyncGenerator,
+ AsyncIterable,
+ AsyncIterator,
+ Callable,
+ Dict,
+ Generator,
+ Iterable,
+ Iterator,
+ List,
+ Mapping,
+ Optional,
+ Pattern,
+ Tuple,
+ Union,
+ overload,
+)
+
from scrapy.utils.asyncgen import as_async_generator
-def flatten(x: Iterable) ->list:
+def flatten(x: Iterable) -> list:
"""flatten(sequence) -> list
Returns a single, flat list which contains all elements retrieved
@@ -30,17 +48,22 @@ def flatten(x: Iterable) ->list:
>>> flatten(["foo", ["baz", 42], "bar"])
['foo', 'baz', 42, 'bar']
"""
- pass
+ return list(iflatten(x))
-def iflatten(x: Iterable) ->Iterable:
+def iflatten(x: Iterable) -> Iterable:
"""iflatten(sequence) -> iterator
Similar to ``.flatten()``, but returns iterator instead"""
- pass
+ for el in x:
+ if is_listlike(el):
+ for el_ in iflatten(el):
+ yield el_
+ else:
+ yield el
-def is_listlike(x: Any) ->bool:
+def is_listlike(x: Any) -> bool:
"""
>>> is_listlike("foo")
False
@@ -61,30 +84,58 @@ def is_listlike(x: Any) ->bool:
>>> is_listlike(range(5))
True
"""
- pass
+ return hasattr(x, "__iter__") and not isinstance(x, (str, bytes))
-def unique(list_: Iterable, key: Callable[[Any], Any]=lambda x: x) ->list:
+def unique(list_: Iterable, key: Callable[[Any], Any] = lambda x: x) -> list:
"""efficient function to uniquify a list preserving item order"""
- pass
-
-
-def to_unicode(text: Union[str, bytes], encoding: Optional[str]=None,
- errors: str='strict') ->str:
+ seen = set()
+ result = []
+ for item in list_:
+ seenkey = key(item)
+ if seenkey in seen:
+ continue
+ seen.add(seenkey)
+ result.append(item)
+ return result
+
+
+def to_unicode(
+ text: Union[str, bytes], encoding: Optional[str] = None, errors: str = "strict"
+) -> str:
"""Return the unicode representation of a bytes object ``text``. If
``text`` is already an unicode object, return it as-is."""
- pass
-
-
-def to_bytes(text: Union[str, bytes], encoding: Optional[str]=None, errors:
- str='strict') ->bytes:
+ if isinstance(text, str):
+ return text
+ if not isinstance(text, (bytes, str)):
+ raise TypeError(
+ "to_unicode must receive a bytes or str "
+ f"object, got {type(text).__name__}"
+ )
+ if encoding is None:
+ encoding = "utf-8"
+ return text.decode(encoding, errors)
+
+
+def to_bytes(
+ text: Union[str, bytes], encoding: Optional[str] = None, errors: str = "strict"
+) -> bytes:
"""Return the binary representation of ``text``. If ``text``
is already a bytes object, return it as-is."""
- pass
-
-
-def re_rsearch(pattern: Union[str, Pattern], text: str, chunk_size: int=1024
- ) ->Optional[Tuple[int, int]]:
+ if isinstance(text, bytes):
+ return text
+ if not isinstance(text, str):
+ raise TypeError(
+ "to_bytes must receive a str or bytes " f"object, got {type(text).__name__}"
+ )
+ if encoding is None:
+ encoding = "utf-8"
+ return text.encode(encoding, errors)
+
+
+def re_rsearch(
+ pattern: Union[str, Pattern], text: str, chunk_size: int = 1024
+) -> Optional[Tuple[int, int]]:
"""
This function does a reverse search in a text using a regular expression
given in the attribute 'pattern'.
@@ -97,33 +148,87 @@ def re_rsearch(pattern: Union[str, Pattern], text: str, chunk_size: int=1024
In case the pattern wasn't found, None is returned, otherwise it returns a tuple containing
the start position of the match, and the ending (regarding the entire text).
"""
- pass
+ def _chunk_iter() -> Generator[Tuple[str, int], Any, None]:
+ offset = len(text)
+ while True:
+ offset -= chunk_size * 1024
+ if offset <= 0:
+ break
+ yield (text[offset:], offset)
+ yield (text, 0)
+
+ if isinstance(pattern, str):
+ pattern = re.compile(pattern)
-def memoizemethod_noargs(method: Callable) ->Callable:
+ for chunk, offset in _chunk_iter():
+ matches = [match for match in pattern.finditer(chunk)]
+ if matches:
+ start, end = matches[-1].span()
+ return offset + start, offset + end
+ return None
+
+
+def memoizemethod_noargs(method: Callable) -> Callable:
"""Decorator to cache the result of a method (without arguments) using a
weak reference to its object
"""
- pass
+ cache: weakref.WeakKeyDictionary[Any, Any] = weakref.WeakKeyDictionary()
+
+ @wraps(method)
+ def new_method(self: Any, *args: Any, **kwargs: Any) -> Any:
+ if self not in cache:
+ cache[self] = method(self, *args, **kwargs)
+ return cache[self]
+
+ return new_method
-_BINARYCHARS = {i for i in range(32) if to_bytes(chr(i)) not in {b'\x00',
- b'\t', b'\n', b'\r'}}
+_BINARYCHARS = {
+ i for i in range(32) if to_bytes(chr(i)) not in {b"\0", b"\t", b"\n", b"\r"}
+}
-def binary_is_text(data: bytes) ->bool:
+def binary_is_text(data: bytes) -> bool:
"""Returns ``True`` if the given ``data`` argument (a ``bytes`` object)
does not contain unprintable control characters.
"""
- pass
+ if not isinstance(data, bytes):
+ raise TypeError(f"data must be bytes, got '{type(data).__name__}'")
+ return all(c not in _BINARYCHARS for c in data)
-def get_func_args(func: Callable, stripself: bool=False) ->List[str]:
+def get_func_args(func: Callable, stripself: bool = False) -> List[str]:
"""Return the argument name list of a callable object"""
- pass
-
-
-def get_spec(func: Callable) ->Tuple[List[str], Dict[str, Any]]:
+ if not callable(func):
+ raise TypeError(f"func must be callable, got '{type(func).__name__}'")
+
+ args: List[str] = []
+ try:
+ sig = inspect.signature(func)
+ except ValueError:
+ return args
+
+ if isinstance(func, partial):
+ partial_args = func.args
+ partial_kw = func.keywords
+
+ for name, param in sig.parameters.items():
+ if param.name in partial_args:
+ continue
+ if partial_kw and param.name in partial_kw:
+ continue
+ args.append(name)
+ else:
+ for name in sig.parameters.keys():
+ args.append(name)
+
+ if stripself and args and args[0] == "self":
+ args = args[1:]
+ return args
+
+
+def get_spec(func: Callable) -> Tuple[List[str], Dict[str, Any]]:
"""Returns (args, kwargs) tuple for a function
>>> import re
>>> get_spec(re.match)
@@ -144,26 +249,66 @@ def get_spec(func: Callable) ->Tuple[List[str], Dict[str, Any]]:
>>> get_spec(Test().method)
(['self', 'val'], {'flags': 0})
"""
- pass
+ if inspect.isfunction(func) or inspect.ismethod(func):
+ spec = inspect.getfullargspec(func)
+ elif hasattr(func, "__call__"):
+ spec = inspect.getfullargspec(func.__call__)
+ else:
+ raise TypeError(f"{type(func)} is not callable")
+
+ defaults: Tuple[Any, ...] = spec.defaults or ()
-def equal_attributes(obj1: Any, obj2: Any, attributes: Optional[List[Union[
- str, Callable]]]) ->bool:
+ firstdefault = len(spec.args) - len(defaults)
+ args = spec.args[:firstdefault]
+ kwargs = dict(zip(spec.args[firstdefault:], defaults))
+ return args, kwargs
+
+
+def equal_attributes(
+ obj1: Any, obj2: Any, attributes: Optional[List[Union[str, Callable]]]
+) -> bool:
"""Compare two objects attributes"""
- pass
+    # no attributes given: return False by default
+ if not attributes:
+ return False
+
+ temp1, temp2 = object(), object()
+ for attr in attributes:
+ # support callables like itemgetter
+ if callable(attr):
+ if attr(obj1) != attr(obj2):
+ return False
+ elif getattr(obj1, attr, temp1) != getattr(obj2, attr, temp2):
+ return False
+ # all attributes equal
+ return True
+
+
+@overload
+def without_none_values(iterable: Mapping) -> dict:
+ ...
+
+
+@overload
+def without_none_values(iterable: Iterable) -> Iterable:
+ ...
-def without_none_values(iterable: Union[Mapping, Iterable]) ->Union[dict,
- Iterable]:
+def without_none_values(iterable: Union[Mapping, Iterable]) -> Union[dict, Iterable]:
"""Return a copy of ``iterable`` with all ``None`` entries removed.
If ``iterable`` is a mapping, return a dictionary where all pairs that have
value ``None`` have been removed.
"""
- pass
+ if isinstance(iterable, collections.abc.Mapping):
+ return {k: v for k, v in iterable.items() if v is not None}
+ else:
+ # the iterable __init__ must take another iterable
+ return type(iterable)(v for v in iterable if v is not None) # type: ignore[call-arg]
-def global_object_name(obj: Any) ->str:
+def global_object_name(obj: Any) -> str:
"""
Return full name of a global object.
@@ -171,10 +316,20 @@ def global_object_name(obj: Any) ->str:
>>> global_object_name(Request)
'scrapy.http.request.Request'
"""
- pass
+ return f"{obj.__module__}.{obj.__qualname__}"
-if hasattr(sys, 'pypy_version_info'):
+if hasattr(sys, "pypy_version_info"):
+
+ def garbage_collect() -> None:
+ # Collecting weakreferences can take two collections on PyPy.
+ gc.collect()
+ gc.collect()
+
+else:
+
+ def garbage_collect() -> None:
+ gc.collect()
class MutableChain(Iterable):
@@ -185,13 +340,22 @@ class MutableChain(Iterable):
def __init__(self, *args: Iterable):
self.data = chain.from_iterable(args)
- def __iter__(self) ->Iterator:
+ def extend(self, *iterables: Iterable) -> None:
+ self.data = chain(self.data, chain.from_iterable(iterables))
+
+ def __iter__(self) -> Iterator:
return self
- def __next__(self) ->Any:
+ def __next__(self) -> Any:
return next(self.data)
+async def _async_chain(*iterables: Union[Iterable, AsyncIterable]) -> AsyncGenerator:
+ for it in iterables:
+ async for o in as_async_generator(it):
+ yield o
+
+
class MutableAsyncChain(AsyncIterable):
"""
Similar to MutableChain but for async iterables
@@ -200,8 +364,11 @@ class MutableAsyncChain(AsyncIterable):
def __init__(self, *args: Union[Iterable, AsyncIterable]):
self.data = _async_chain(*args)
- def __aiter__(self) ->AsyncIterator:
+ def extend(self, *iterables: Union[Iterable, AsyncIterable]) -> None:
+ self.data = _async_chain(self.data, _async_chain(*iterables))
+
+ def __aiter__(self) -> AsyncIterator:
return self
- async def __anext__(self) ->Any:
+ async def __anext__(self) -> Any:
return await self.data.__anext__()
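
A few quick self-checks for the restored helpers (illustrative only, not part of the patch):

    from scrapy.utils.python import flatten, to_bytes, to_unicode, unique, without_none_values

    assert flatten([1, [2, [3, "four"]]]) == [1, 2, 3, "four"]
    assert unique([3, 1, 3, 2, 1]) == [3, 1, 2]
    assert to_bytes("café") == b"caf\xc3\xa9"
    assert to_unicode(b"caf\xc3\xa9") == "café"
    assert without_none_values({"a": 1, "b": None}) == {"a": 1}
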
diff --git a/scrapy/utils/reactor.py b/scrapy/utils/reactor.py
index 5f7bcdfbd..ad3d1d8bc 100644
--- a/scrapy/utils/reactor.py
+++ b/scrapy/utils/reactor.py
@@ -4,15 +4,32 @@ from asyncio import AbstractEventLoop, AbstractEventLoopPolicy
from contextlib import suppress
from typing import Any, Callable, Dict, Optional, Sequence, Type
from warnings import catch_warnings, filterwarnings, warn
+
from twisted.internet import asyncioreactor, error
from twisted.internet.base import DelayedCall
+
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.misc import load_object
def listen_tcp(portrange, host, factory):
"""Like reactor.listenTCP but tries different ports in a range."""
- pass
+ from twisted.internet import reactor
+
+ if len(portrange) > 2:
+ raise ValueError(f"invalid portrange: {portrange}")
+ if not portrange:
+ return reactor.listenTCP(0, factory, interface=host)
+ if not hasattr(portrange, "__iter__"):
+ return reactor.listenTCP(portrange, factory, interface=host)
+ if len(portrange) == 1:
+ return reactor.listenTCP(portrange[0], factory, interface=host)
+ for x in range(portrange[0], portrange[1] + 1):
+ try:
+ return reactor.listenTCP(x, factory, interface=host)
+ except error.CannotListenError:
+ if x == portrange[1]:
+ raise
class CallLaterOnce:
@@ -26,34 +43,146 @@ class CallLaterOnce:
self._kw: Dict[str, Any] = kw
self._call: Optional[DelayedCall] = None
- def __call__(self) ->Any:
+ def schedule(self, delay: float = 0) -> None:
+ from twisted.internet import reactor
+
+ if self._call is None:
+ self._call = reactor.callLater(delay, self)
+
+ def cancel(self) -> None:
+ if self._call:
+ self._call.cancel()
+
+ def __call__(self) -> Any:
self._call = None
return self._func(*self._a, **self._kw)
-def set_asyncio_event_loop_policy() ->None:
+def set_asyncio_event_loop_policy() -> None:
"""The policy functions from asyncio often behave unexpectedly,
so we restrict their use to the absolutely essential case.
This should only be used to install the reactor.
"""
- pass
+ _get_asyncio_event_loop_policy()
+
+
+def get_asyncio_event_loop_policy() -> AbstractEventLoopPolicy:
+ warn(
+ "Call to deprecated function "
+ "scrapy.utils.reactor.get_asyncio_event_loop_policy().\n"
+ "\n"
+ "Please use get_event_loop, new_event_loop and set_event_loop"
+ " from asyncio instead, as the corresponding policy methods may lead"
+ " to unexpected behaviour.\n"
+ "This function is replaced by set_asyncio_event_loop_policy and"
+ " is meant to be used only when the reactor is being installed.",
+ category=ScrapyDeprecationWarning,
+ stacklevel=2,
+ )
+ return _get_asyncio_event_loop_policy()
+
+
+def _get_asyncio_event_loop_policy() -> AbstractEventLoopPolicy:
+ policy = asyncio.get_event_loop_policy()
+ if (
+ sys.version_info >= (3, 8)
+ and sys.platform == "win32"
+ and not isinstance(policy, asyncio.WindowsSelectorEventLoopPolicy)
+ ):
+ policy = asyncio.WindowsSelectorEventLoopPolicy()
+ asyncio.set_event_loop_policy(policy)
+ return policy
-def install_reactor(reactor_path: str, event_loop_path: Optional[str]=None
- ) ->None:
+def install_reactor(reactor_path: str, event_loop_path: Optional[str] = None) -> None:
"""Installs the :mod:`~twisted.internet.reactor` with the specified
import path. Also installs the asyncio event loop with the specified import
path if the asyncio reactor is enabled"""
- pass
+ reactor_class = load_object(reactor_path)
+ if reactor_class is asyncioreactor.AsyncioSelectorReactor:
+ set_asyncio_event_loop_policy()
+ with suppress(error.ReactorAlreadyInstalledError):
+ event_loop = set_asyncio_event_loop(event_loop_path)
+ asyncioreactor.install(eventloop=event_loop)
+ else:
+ *module, _ = reactor_path.split(".")
+ installer_path = module + ["install"]
+ installer = load_object(".".join(installer_path))
+ with suppress(error.ReactorAlreadyInstalledError):
+ installer()
-def set_asyncio_event_loop(event_loop_path: Optional[str]) ->AbstractEventLoop:
+def _get_asyncio_event_loop() -> AbstractEventLoop:
+ return set_asyncio_event_loop(None)
+
+
+def set_asyncio_event_loop(event_loop_path: Optional[str]) -> AbstractEventLoop:
"""Sets and returns the event loop with specified import path."""
- pass
+ if event_loop_path is not None:
+ event_loop_class: Type[AbstractEventLoop] = load_object(event_loop_path)
+ event_loop = event_loop_class()
+ asyncio.set_event_loop(event_loop)
+ else:
+ try:
+ with catch_warnings():
+ # In Python 3.10.9, 3.11.1, 3.12 and 3.13, a DeprecationWarning
+ # is emitted about the lack of a current event loop, because in
+ # Python 3.14 and later `get_event_loop` will raise a
+ # RuntimeError in that event. Because our code is already
+ # prepared for that future behavior, we ignore the deprecation
+ # warning.
+ filterwarnings(
+ "ignore",
+ message="There is no current event loop",
+ category=DeprecationWarning,
+ )
+ event_loop = asyncio.get_event_loop()
+ except RuntimeError:
+ # `get_event_loop` raises RuntimeError when called with no asyncio
+ # event loop yet installed in the following scenarios:
+            # - Foreseeably, on Python 3.14 and later.
+ # https://github.com/python/cpython/issues/100160#issuecomment-1345581902
+ event_loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(event_loop)
+ return event_loop
-def verify_installed_reactor(reactor_path: str) ->None:
+def verify_installed_reactor(reactor_path: str) -> None:
"""Raises :exc:`Exception` if the installed
:mod:`~twisted.internet.reactor` does not match the specified import
path."""
- pass
+ from twisted.internet import reactor
+
+ reactor_class = load_object(reactor_path)
+ if not reactor.__class__ == reactor_class:
+ msg = (
+ "The installed reactor "
+ f"({reactor.__module__}.{reactor.__class__.__name__}) does not "
+ f"match the requested one ({reactor_path})"
+ )
+ raise Exception(msg)
+
+
+def verify_installed_asyncio_event_loop(loop_path: str) -> None:
+ from twisted.internet import reactor
+
+ loop_class = load_object(loop_path)
+ if isinstance(reactor._asyncioEventloop, loop_class):
+ return
+ installed = (
+ f"{reactor._asyncioEventloop.__class__.__module__}"
+ f".{reactor._asyncioEventloop.__class__.__qualname__}"
+ )
+ specified = f"{loop_class.__module__}.{loop_class.__qualname__}"
+ raise Exception(
+ "Scrapy found an asyncio Twisted reactor already "
+ f"installed, and its event loop class ({installed}) does "
+ "not match the one specified in the ASYNCIO_EVENT_LOOP "
+ f"setting ({specified})"
+ )
+
+
+def is_asyncio_reactor_installed() -> bool:
+ from twisted.internet import reactor
+
+ return isinstance(reactor, asyncioreactor.AsyncioSelectorReactor)
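
Usage sketch (illustrative only, not part of the patch); this mirrors what Scrapy does when the TWISTED_REACTOR setting is set:

    from scrapy.utils.reactor import install_reactor, is_asyncio_reactor_installed

    # Must run before anything else imports twisted.internet.reactor.
    install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
    assert is_asyncio_reactor_installed()
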
diff --git a/scrapy/utils/request.py b/scrapy/utils/request.py
index 9514d35a1..24fcbd85e 100644
--- a/scrapy/utils/request.py
+++ b/scrapy/utils/request.py
@@ -2,27 +2,57 @@
This module provides some useful functions for working with
scrapy.http.Request objects
"""
+
import hashlib
import json
import warnings
-from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Protocol, Tuple, Type, Union
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Dict,
+ Generator,
+ Iterable,
+ List,
+ Optional,
+ Protocol,
+ Tuple,
+ Type,
+ Union,
+)
from urllib.parse import urlunparse
from weakref import WeakKeyDictionary
+
from w3lib.http import basic_auth_header
from w3lib.url import canonicalize_url
+
from scrapy import Request, Spider
from scrapy.exceptions import ScrapyDeprecationWarning
from scrapy.utils.httpobj import urlparse_cached
from scrapy.utils.misc import load_object
from scrapy.utils.python import to_bytes, to_unicode
+
if TYPE_CHECKING:
from scrapy.crawler import Crawler
-_deprecated_fingerprint_cache: 'WeakKeyDictionary[Request, Dict[Tuple[Optional[Tuple[bytes, ...]], bool], str]]'
+
+_deprecated_fingerprint_cache: "WeakKeyDictionary[Request, Dict[Tuple[Optional[Tuple[bytes, ...]], bool], str]]"
_deprecated_fingerprint_cache = WeakKeyDictionary()
-def request_fingerprint(request: Request, include_headers: Optional[
- Iterable[Union[bytes, str]]]=None, keep_fragments: bool=False) ->str:
+def _serialize_headers(
+ headers: Iterable[bytes], request: Request
+) -> Generator[bytes, Any, None]:
+ for header in headers:
+ if header in request.headers:
+ yield header
+ for value in request.headers.getlist(header):
+ yield value
+
+
+def request_fingerprint(
+ request: Request,
+ include_headers: Optional[Iterable[Union[bytes, str]]] = None,
+ keep_fragments: bool = False,
+) -> str:
"""
Return the request fingerprint as an hexadecimal string.
@@ -53,15 +83,95 @@ def request_fingerprint(request: Request, include_headers: Optional[
If you want to include them, set the keep_fragments argument to True
(for instance when handling requests with a headless browser).
"""
- pass
-
-
-_fingerprint_cache: 'WeakKeyDictionary[Request, Dict[Tuple[Optional[Tuple[bytes, ...]], bool], bytes]]'
+ if include_headers or keep_fragments:
+ message = (
+ "Call to deprecated function "
+ "scrapy.utils.request.request_fingerprint().\n"
+ "\n"
+ "If you are using this function in a Scrapy component because you "
+ "need a non-default fingerprinting algorithm, and you are OK "
+ "with that non-default fingerprinting algorithm being used by "
+ "all Scrapy components and not just the one calling this "
+ "function, use crawler.request_fingerprinter.fingerprint() "
+ "instead in your Scrapy component (you can get the crawler "
+ "object from the 'from_crawler' class method), and use the "
+ "'REQUEST_FINGERPRINTER_CLASS' setting to configure your "
+ "non-default fingerprinting algorithm.\n"
+ "\n"
+ "Otherwise, consider using the "
+ "scrapy.utils.request.fingerprint() function instead.\n"
+ "\n"
+ "If you switch to 'fingerprint()', or assign the "
+ "'REQUEST_FINGERPRINTER_CLASS' setting a class that uses "
+ "'fingerprint()', the generated fingerprints will not only be "
+ "bytes instead of a string, but they will also be different from "
+ "those generated by 'request_fingerprint()'. Before you switch, "
+ "make sure that you understand the consequences of this (e.g. "
+ "cache invalidation) and are OK with them; otherwise, consider "
+ "implementing your own function which returns the same "
+ "fingerprints as the deprecated 'request_fingerprint()' function."
+ )
+ else:
+ message = (
+ "Call to deprecated function "
+ "scrapy.utils.request.request_fingerprint().\n"
+ "\n"
+ "If you are using this function in a Scrapy component, and you "
+ "are OK with users of your component changing the fingerprinting "
+ "algorithm through settings, use "
+ "crawler.request_fingerprinter.fingerprint() instead in your "
+ "Scrapy component (you can get the crawler object from the "
+ "'from_crawler' class method).\n"
+ "\n"
+ "Otherwise, consider using the "
+ "scrapy.utils.request.fingerprint() function instead.\n"
+ "\n"
+ "Either way, the resulting fingerprints will be returned as "
+ "bytes, not as a string, and they will also be different from "
+ "those generated by 'request_fingerprint()'. Before you switch, "
+ "make sure that you understand the consequences of this (e.g. "
+ "cache invalidation) and are OK with them; otherwise, consider "
+ "implementing your own function which returns the same "
+ "fingerprints as the deprecated 'request_fingerprint()' function."
+ )
+ warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
+ processed_include_headers: Optional[Tuple[bytes, ...]] = None
+ if include_headers:
+ processed_include_headers = tuple(
+ to_bytes(h.lower()) for h in sorted(include_headers)
+ )
+ cache = _deprecated_fingerprint_cache.setdefault(request, {})
+ cache_key = (processed_include_headers, keep_fragments)
+ if cache_key not in cache:
+ fp = hashlib.sha1()
+ fp.update(to_bytes(request.method))
+ fp.update(
+ to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments))
+ )
+ fp.update(request.body or b"")
+ if processed_include_headers:
+ for part in _serialize_headers(processed_include_headers, request):
+ fp.update(part)
+ cache[cache_key] = fp.hexdigest()
+ return cache[cache_key]
+
+
+def _request_fingerprint_as_bytes(*args: Any, **kwargs: Any) -> bytes:
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ return bytes.fromhex(request_fingerprint(*args, **kwargs))
+
+
+_fingerprint_cache: "WeakKeyDictionary[Request, Dict[Tuple[Optional[Tuple[bytes, ...]], bool], bytes]]"
_fingerprint_cache = WeakKeyDictionary()
-def fingerprint(request: Request, *, include_headers: Optional[Iterable[
- Union[bytes, str]]]=None, keep_fragments: bool=False) ->bytes:
+def fingerprint(
+ request: Request,
+ *,
+ include_headers: Optional[Iterable[Union[bytes, str]]] = None,
+ keep_fragments: bool = False,
+) -> bytes:
"""
Return the request fingerprint.
@@ -92,11 +202,38 @@ def fingerprint(request: Request, *, include_headers: Optional[Iterable[
If you want to include them, set the keep_fragments argument to True
(for instance when handling requests with a headless browser).
"""
- pass
+ processed_include_headers: Optional[Tuple[bytes, ...]] = None
+ if include_headers:
+ processed_include_headers = tuple(
+ to_bytes(h.lower()) for h in sorted(include_headers)
+ )
+ cache = _fingerprint_cache.setdefault(request, {})
+ cache_key = (processed_include_headers, keep_fragments)
+ if cache_key not in cache:
+ # To decode bytes reliably (JSON does not support bytes), regardless of
+ # character encoding, we use bytes.hex()
+ headers: Dict[str, List[str]] = {}
+ if processed_include_headers:
+ for header in processed_include_headers:
+ if header in request.headers:
+ headers[header.hex()] = [
+ header_value.hex()
+ for header_value in request.headers.getlist(header)
+ ]
+ fingerprint_data = {
+ "method": to_unicode(request.method),
+ "url": canonicalize_url(request.url, keep_fragments=keep_fragments),
+ "body": (request.body or b"").hex(),
+ "headers": headers,
+ }
+ fingerprint_json = json.dumps(fingerprint_data, sort_keys=True)
+ cache[cache_key] = hashlib.sha1(fingerprint_json.encode()).digest()
+ return cache[cache_key]
class RequestFingerprinterProtocol(Protocol):
- pass
+ def fingerprint(self, request: Request) -> bytes:
+ ...
class RequestFingerprinter:
@@ -112,70 +249,134 @@ class RequestFingerprinter:
.. seealso:: :setting:`REQUEST_FINGERPRINTER_IMPLEMENTATION`.
"""
- def __init__(self, crawler: Optional['Crawler']=None):
+ @classmethod
+ def from_crawler(cls, crawler):
+ return cls(crawler)
+
+ def __init__(self, crawler: Optional["Crawler"] = None):
if crawler:
implementation = crawler.settings.get(
- 'REQUEST_FINGERPRINTER_IMPLEMENTATION')
+ "REQUEST_FINGERPRINTER_IMPLEMENTATION"
+ )
else:
- implementation = '2.6'
- if implementation == '2.6':
- message = """'2.6' is a deprecated value for the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting.
-
-It is also the default value. In other words, it is normal to get this warning if you have not defined a value for the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting. This is so for backward compatibility reasons, but it will change in a future version of Scrapy.
-
-See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation."""
- warnings.warn(message, category=ScrapyDeprecationWarning,
- stacklevel=2)
+ implementation = "2.6"
+ if implementation == "2.6":
+ message = (
+ "'2.6' is a deprecated value for the "
+ "'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting.\n"
+ "\n"
+ "It is also the default value. In other words, it is normal "
+ "to get this warning if you have not defined a value for the "
+ "'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting. This is so "
+ "for backward compatibility reasons, but it will change in a "
+ "future version of Scrapy.\n"
+ "\n"
+ "See the documentation of the "
+ "'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for "
+ "information on how to handle this deprecation."
+ )
+ warnings.warn(message, category=ScrapyDeprecationWarning, stacklevel=2)
self._fingerprint = _request_fingerprint_as_bytes
- elif implementation == '2.7':
+ elif implementation == "2.7":
self._fingerprint = fingerprint
else:
raise ValueError(
- f"Got an invalid value on setting 'REQUEST_FINGERPRINTER_IMPLEMENTATION': {implementation!r}. Valid values are '2.6' (deprecated) and '2.7'."
- )
+ f"Got an invalid value on setting "
+ f"'REQUEST_FINGERPRINTER_IMPLEMENTATION': "
+ f"{implementation!r}. Valid values are '2.6' (deprecated) "
+ f"and '2.7'."
+ )
+ def fingerprint(self, request: Request) -> bytes:
+ return self._fingerprint(request)
-def request_authenticate(request: Request, username: str, password: str
- ) ->None:
+
+def request_authenticate(
+ request: Request,
+ username: str,
+ password: str,
+) -> None:
"""Authenticate the given request (in place) using the HTTP basic access
authentication mechanism (RFC 2617) and the given username and password
"""
- pass
+ request.headers["Authorization"] = basic_auth_header(username, password)
-def request_httprepr(request: Request) ->bytes:
+def request_httprepr(request: Request) -> bytes:
"""Return the raw HTTP representation (as bytes) of the given request.
This is provided only for reference since it's not the actual stream of
     bytes that will be sent when performing the request (that's controlled
by Twisted).
"""
- pass
-
-
-def referer_str(request: Request) ->Optional[str]:
+ parsed = urlparse_cached(request)
+ path = urlunparse(("", "", parsed.path or "/", parsed.params, parsed.query, ""))
+ s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
+ s += b"Host: " + to_bytes(parsed.hostname or b"") + b"\r\n"
+ if request.headers:
+ s += request.headers.to_string() + b"\r\n"
+ s += b"\r\n"
+ s += request.body
+ return s
+
+
+def referer_str(request: Request) -> Optional[str]:
"""Return Referer HTTP header suitable for logging."""
- pass
+ referrer = request.headers.get("Referer")
+ if referrer is None:
+ return referrer
+ return to_unicode(referrer, errors="replace")
-def request_from_dict(d: dict, *, spider: Optional[Spider]=None) ->Request:
+def request_from_dict(d: dict, *, spider: Optional[Spider] = None) -> Request:
"""Create a :class:`~scrapy.Request` object from a dict.
If a spider is given, it will try to resolve the callbacks looking at the
spider for methods with the same name.
"""
- pass
+ request_cls: Type[Request] = load_object(d["_class"]) if "_class" in d else Request
+ kwargs = {key: value for key, value in d.items() if key in request_cls.attributes}
+ if d.get("callback") and spider:
+ kwargs["callback"] = _get_method(spider, d["callback"])
+ if d.get("errback") and spider:
+ kwargs["errback"] = _get_method(spider, d["errback"])
+ return request_cls(**kwargs)
-def _get_method(obj: Any, name: Any) ->Any:
+def _get_method(obj: Any, name: Any) -> Any:
"""Helper function for request_from_dict"""
- pass
+ name = str(name)
+ try:
+ return getattr(obj, name)
+ except AttributeError:
+ raise ValueError(f"Method {name!r} not found in: {obj}")
-def request_to_curl(request: Request) ->str:
+def request_to_curl(request: Request) -> str:
"""
Converts a :class:`~scrapy.Request` object to a curl command.
     :param request: :class:`~scrapy.Request` object to be converted
:return: string containing the curl command
"""
- pass
+ method = request.method
+
+ data = f"--data-raw '{request.body.decode('utf-8')}'" if request.body else ""
+
+ headers = " ".join(
+ f"-H '{k.decode()}: {v[0].decode()}'" for k, v in request.headers.items()
+ )
+
+ url = request.url
+ cookies = ""
+ if request.cookies:
+ if isinstance(request.cookies, dict):
+ cookie = "; ".join(f"{k}={v}" for k, v in request.cookies.items())
+ cookies = f"--cookie '{cookie}'"
+ elif isinstance(request.cookies, list):
+ cookie = "; ".join(
+ f"{list(c.keys())[0]}={list(c.values())[0]}" for c in request.cookies
+ )
+ cookies = f"--cookie '{cookie}'"
+
+ curl_cmd = f"curl -X {method} {url} {data} {headers} {cookies}".strip()
+ return " ".join(curl_cmd.split())
diff --git a/scrapy/utils/response.py b/scrapy/utils/response.py
index ce657fa90..fabfb1167 100644
--- a/scrapy/utils/response.py
+++ b/scrapy/utils/response.py
@@ -8,50 +8,91 @@ import tempfile
import webbrowser
from typing import Any, Callable, Iterable, Tuple, Union
from weakref import WeakKeyDictionary
+
from twisted.web import http
from w3lib import html
+
import scrapy
from scrapy.http.response import Response
from scrapy.utils.decorators import deprecated
from scrapy.utils.python import to_bytes, to_unicode
-_baseurl_cache: 'WeakKeyDictionary[Response, str]' = WeakKeyDictionary()
+
+_baseurl_cache: "WeakKeyDictionary[Response, str]" = WeakKeyDictionary()
-def get_base_url(response: 'scrapy.http.response.text.TextResponse') ->str:
+def get_base_url(response: "scrapy.http.response.text.TextResponse") -> str:
"""Return the base url of the given response, joined with the response url"""
- pass
+ if response not in _baseurl_cache:
+ text = response.text[0:4096]
+ _baseurl_cache[response] = html.get_base_url(
+ text, response.url, response.encoding
+ )
+ return _baseurl_cache[response]
-(_metaref_cache:
- 'WeakKeyDictionary[Response, Union[Tuple[None, None], Tuple[float, str]]]'
- ) = WeakKeyDictionary()
+_metaref_cache: "WeakKeyDictionary[Response, Union[Tuple[None, None], Tuple[float, str]]]" = (
+ WeakKeyDictionary()
+)
-def get_meta_refresh(response: 'scrapy.http.response.text.TextResponse',
- ignore_tags: Iterable[str]=('script', 'noscript')) ->Union[Tuple[None,
- None], Tuple[float, str]]:
+def get_meta_refresh(
+ response: "scrapy.http.response.text.TextResponse",
+ ignore_tags: Iterable[str] = ("script", "noscript"),
+) -> Union[Tuple[None, None], Tuple[float, str]]:
"""Parse the http-equiv refresh parameter from the given response"""
- pass
+ if response not in _metaref_cache:
+ text = response.text[0:4096]
+ _metaref_cache[response] = html.get_meta_refresh(
+ text, response.url, response.encoding, ignore_tags=ignore_tags
+ )
+ return _metaref_cache[response]
-def response_status_message(status: Union[bytes, float, int, str]) ->str:
+def response_status_message(status: Union[bytes, float, int, str]) -> str:
"""Return status code plus status text descriptive message"""
- pass
+ status_int = int(status)
+ message = http.RESPONSES.get(status_int, "Unknown Status")
+ return f"{status_int} {to_unicode(message)}"
@deprecated
-def response_httprepr(response: Response) ->bytes:
+def response_httprepr(response: Response) -> bytes:
"""Return raw HTTP representation (as bytes) of the given response. This
is provided only for reference, since it's not the exact stream of bytes
that was received (that's not exposed by Twisted).
"""
- pass
-
-
-def open_in_browser(response: Union[
- 'scrapy.http.response.html.HtmlResponse',
- 'scrapy.http.response.text.TextResponse'], _openfunc: Callable[[str],
- Any]=webbrowser.open) ->Any:
+ values = [
+ b"HTTP/1.1 ",
+ to_bytes(str(response.status)),
+ b" ",
+ to_bytes(http.RESPONSES.get(response.status, b"")),
+ b"\r\n",
+ ]
+ if response.headers:
+ values.extend([response.headers.to_string(), b"\r\n"])
+ values.extend([b"\r\n", response.body])
+ return b"".join(values)
+
+
+def _remove_html_comments(body):
+ start = body.find(b"<!--")
+ while start != -1:
+ end = body.find(b"-->", start + 1)
+ if end == -1:
+ return body[:start]
+ else:
+ body = body[:start] + body[end + 3 :]
+ start = body.find(b"<!--")
+ return body
+
+
+def open_in_browser(
+ response: Union[
+ "scrapy.http.response.html.HtmlResponse",
+ "scrapy.http.response.text.TextResponse",
+ ],
+ _openfunc: Callable[[str], Any] = webbrowser.open,
+) -> Any:
"""Open *response* in a local web browser, adjusting the `base tag`_ for
external links to work, e.g. so that images and styles are displayed.
@@ -68,4 +109,21 @@ def open_in_browser(response: Union[
if "item name" not in response.body:
open_in_browser(response)
"""
- pass
+ from scrapy.http import HtmlResponse, TextResponse
+
+ # XXX: this implementation is a bit dirty and could be improved
+ body = response.body
+ if isinstance(response, HtmlResponse):
+ if b"<base" not in body:
+ _remove_html_comments(body)
+ repl = rf'\0<base href="{response.url}">'
+ body = re.sub(rb"<head(?:[^<>]*?>)", to_bytes(repl), body, count=1)
+ ext = ".html"
+ elif isinstance(response, TextResponse):
+ ext = ".txt"
+ else:
+ raise TypeError("Unsupported response type: " f"{response.__class__.__name__}")
+ fd, fname = tempfile.mkstemp(ext)
+ os.write(fd, body)
+ os.close(fd)
+ return _openfunc(f"file://{fname}")
diff --git a/scrapy/utils/serialize.py b/scrapy/utils/serialize.py
index 349bc19f4..3b4f67f00 100644
--- a/scrapy/utils/serialize.py
+++ b/scrapy/utils/serialize.py
@@ -2,14 +2,37 @@ import datetime
import decimal
import json
from typing import Any
+
from itemadapter import ItemAdapter, is_item
from twisted.internet import defer
+
from scrapy.http import Request, Response
class ScrapyJSONEncoder(json.JSONEncoder):
- DATE_FORMAT = '%Y-%m-%d'
- TIME_FORMAT = '%H:%M:%S'
+ DATE_FORMAT = "%Y-%m-%d"
+ TIME_FORMAT = "%H:%M:%S"
+
+ def default(self, o: Any) -> Any:
+ if isinstance(o, set):
+ return list(o)
+ if isinstance(o, datetime.datetime):
+ return o.strftime(f"{self.DATE_FORMAT} {self.TIME_FORMAT}")
+ if isinstance(o, datetime.date):
+ return o.strftime(self.DATE_FORMAT)
+ if isinstance(o, datetime.time):
+ return o.strftime(self.TIME_FORMAT)
+ if isinstance(o, decimal.Decimal):
+ return str(o)
+ if isinstance(o, defer.Deferred):
+ return str(o)
+ if is_item(o):
+ return ItemAdapter(o).asdict()
+ if isinstance(o, Request):
+ return f"<{type(o).__name__} {o.method} {o.url}>"
+ if isinstance(o, Response):
+ return f"<{type(o).__name__} {o.status} {o.url}>"
+ return super().default(o)
class ScrapyJSONDecoder(json.JSONDecoder):
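
For reference, a small usage sketch of the encoder implemented above (the values are arbitrary): dates, Decimals and sets are converted to JSON-friendly types before falling back to the stock encoder.

```python
import datetime
import decimal

from scrapy.utils.serialize import ScrapyJSONEncoder

encoder = ScrapyJSONEncoder()
print(
    encoder.encode(
        {
            "when": datetime.datetime(2024, 1, 2, 3, 4, 5),  # "2024-01-02 03:04:05"
            "price": decimal.Decimal("9.99"),                # "9.99"
            "tags": {"news", "tech"},                        # set -> list
        }
    )
)
```
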
diff --git a/scrapy/utils/signal.py b/scrapy/utils/signal.py
index ba9aa6bf8..21a12a19e 100644
--- a/scrapy/utils/signal.py
+++ b/scrapy/utils/signal.py
@@ -3,36 +3,110 @@ import collections.abc
import logging
from typing import Any as TypingAny
from typing import List, Tuple
-from pydispatch.dispatcher import Anonymous, Any, disconnect, getAllReceivers, liveReceivers
+
+from pydispatch.dispatcher import (
+ Anonymous,
+ Any,
+ disconnect,
+ getAllReceivers,
+ liveReceivers,
+)
from pydispatch.robustapply import robustApply
from twisted.internet.defer import Deferred, DeferredList
from twisted.python.failure import Failure
+
from scrapy.exceptions import StopDownload
from scrapy.utils.defer import maybeDeferred_coro
from scrapy.utils.log import failure_to_exc_info
+
logger = logging.getLogger(__name__)
-def send_catch_log(signal: TypingAny=Any, sender: TypingAny=Anonymous, *
- arguments: TypingAny, **named: TypingAny) ->List[Tuple[TypingAny,
- TypingAny]]:
+def send_catch_log(
+ signal: TypingAny = Any,
+ sender: TypingAny = Anonymous,
+ *arguments: TypingAny,
+ **named: TypingAny
+) -> List[Tuple[TypingAny, TypingAny]]:
"""Like pydispatcher.robust.sendRobust but it also logs errors and returns
Failures instead of exceptions.
"""
- pass
+ dont_log = named.pop("dont_log", ())
+ dont_log = (
+ tuple(dont_log)
+ if isinstance(dont_log, collections.abc.Sequence)
+ else (dont_log,)
+ )
+ dont_log += (StopDownload,)
+ spider = named.get("spider", None)
+ responses: List[Tuple[TypingAny, TypingAny]] = []
+ for receiver in liveReceivers(getAllReceivers(sender, signal)):
+ result: TypingAny
+ try:
+ response = robustApply(
+ receiver, signal=signal, sender=sender, *arguments, **named
+ )
+ if isinstance(response, Deferred):
+ logger.error(
+ "Cannot return deferreds from signal handler: %(receiver)s",
+ {"receiver": receiver},
+ extra={"spider": spider},
+ )
+ except dont_log:
+ result = Failure()
+ except Exception:
+ result = Failure()
+ logger.error(
+ "Error caught on signal handler: %(receiver)s",
+ {"receiver": receiver},
+ exc_info=True,
+ extra={"spider": spider},
+ )
+ else:
+ result = response
+ responses.append((receiver, result))
+ return responses
-def send_catch_log_deferred(signal: TypingAny=Any, sender: TypingAny=
- Anonymous, *arguments: TypingAny, **named: TypingAny) ->Deferred:
+def send_catch_log_deferred(
+ signal: TypingAny = Any,
+ sender: TypingAny = Anonymous,
+ *arguments: TypingAny,
+ **named: TypingAny
+) -> Deferred:
"""Like send_catch_log but supports returning deferreds on signal handlers.
Returns a deferred that gets fired once all signal handlers deferreds were
fired.
"""
- pass
+
+ def logerror(failure: Failure, recv: Any) -> Failure:
+ if dont_log is None or not isinstance(failure.value, dont_log):
+ logger.error(
+ "Error caught on signal handler: %(receiver)s",
+ {"receiver": recv},
+ exc_info=failure_to_exc_info(failure),
+ extra={"spider": spider},
+ )
+ return failure
+
+ dont_log = named.pop("dont_log", None)
+ spider = named.get("spider", None)
+ dfds = []
+ for receiver in liveReceivers(getAllReceivers(sender, signal)):
+ d = maybeDeferred_coro(
+ robustApply, receiver, signal=signal, sender=sender, *arguments, **named
+ )
+ d.addErrback(logerror, receiver)
+        # bind the current receiver; the deferred may fire after the loop ends
+        d.addBoth(lambda result, receiver=receiver: (receiver, result))
+ dfds.append(d)
+ d = DeferredList(dfds)
+ d.addCallback(lambda out: [x[1] for x in out])
+ return d
-def disconnect_all(signal: TypingAny=Any, sender: TypingAny=Any) ->None:
+def disconnect_all(signal: TypingAny = Any, sender: TypingAny = Any) -> None:
"""Disconnect all signal handlers. Useful for cleaning up after running
tests
"""
- pass
+ for receiver in liveReceivers(getAllReceivers(sender, signal)):
+ disconnect(receiver, signal=signal, sender=sender)
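
A sketch of how `send_catch_log()` surfaces handler errors; the signal object and the failing handler below are made up for the example.

```python
from pydispatch import dispatcher
from twisted.python.failure import Failure

from scrapy.utils.signal import send_catch_log

my_signal = object()

def broken_handler(sender, **kwargs):
    raise ValueError("boom")

dispatcher.connect(broken_handler, signal=my_signal)

# Errors are logged and returned as Failure objects instead of propagating.
for receiver, result in send_catch_log(signal=my_signal, sender="example"):
    print(receiver.__name__, isinstance(result, Failure))  # broken_handler True
```
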
diff --git a/scrapy/utils/sitemap.py b/scrapy/utils/sitemap.py
index 2ee154109..759b1c1a9 100644
--- a/scrapy/utils/sitemap.py
+++ b/scrapy/utils/sitemap.py
@@ -6,7 +6,8 @@ SitemapSpider, its API is subject to change without notice.
"""
from typing import Any, Dict, Generator, Iterator, Optional
from urllib.parse import urljoin
-import lxml.etree
+
+import lxml.etree # nosec
class Sitemap:
@@ -14,30 +15,37 @@ class Sitemap:
(type=sitemapindex) files"""
def __init__(self, xmltext: str):
- xmlp = lxml.etree.XMLParser(recover=True, remove_comments=True,
- resolve_entities=False)
- self._root = lxml.etree.fromstring(xmltext, parser=xmlp)
+ xmlp = lxml.etree.XMLParser(
+ recover=True, remove_comments=True, resolve_entities=False
+ )
+ self._root = lxml.etree.fromstring(xmltext, parser=xmlp) # nosec
rt = self._root.tag
- self.type = self._root.tag.split('}', 1)[1] if '}' in rt else rt
+ self.type = self._root.tag.split("}", 1)[1] if "}" in rt else rt
- def __iter__(self) ->Iterator[Dict[str, Any]]:
+ def __iter__(self) -> Iterator[Dict[str, Any]]:
for elem in self._root.getchildren():
d: Dict[str, Any] = {}
for el in elem.getchildren():
tag = el.tag
- name = tag.split('}', 1)[1] if '}' in tag else tag
- if name == 'link':
- if 'href' in el.attrib:
- d.setdefault('alternate', []).append(el.get('href'))
+ name = tag.split("}", 1)[1] if "}" in tag else tag
+
+ if name == "link":
+ if "href" in el.attrib:
+ d.setdefault("alternate", []).append(el.get("href"))
else:
- d[name] = el.text.strip() if el.text else ''
- if 'loc' in d:
+ d[name] = el.text.strip() if el.text else ""
+
+ if "loc" in d:
yield d
-def sitemap_urls_from_robots(robots_text: str, base_url: Optional[str]=None
- ) ->Generator[str, Any, None]:
+def sitemap_urls_from_robots(
+ robots_text: str, base_url: Optional[str] = None
+) -> Generator[str, Any, None]:
"""Return an iterator over all sitemap urls contained in the given
robots.txt file
"""
- pass
+ for line in robots_text.splitlines():
+ if line.lstrip().lower().startswith("sitemap:"):
+ url = line.split(":", 1)[1].strip()
+ yield urljoin(base_url or "", url)
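
The robots.txt handling added above can be exercised with a throwaway string (the URLs are placeholders); relative sitemap entries are joined against `base_url`.

```python
from scrapy.utils.sitemap import sitemap_urls_from_robots

robots = """\
User-agent: *
Disallow: /private/
Sitemap: /sitemap.xml
Sitemap: https://example.com/sitemap-news.xml
"""

print(list(sitemap_urls_from_robots(robots, base_url="https://example.com")))
# ['https://example.com/sitemap.xml', 'https://example.com/sitemap-news.xml']
```
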
diff --git a/scrapy/utils/spider.py b/scrapy/utils/spider.py
index b93bbbad9..704df8657 100644
--- a/scrapy/utils/spider.py
+++ b/scrapy/utils/spider.py
@@ -1,30 +1,121 @@
from __future__ import annotations
+
import inspect
import logging
from types import CoroutineType, ModuleType
-from typing import TYPE_CHECKING, Any, AsyncGenerator, Generator, Iterable, Literal, Optional, Type, TypeVar, Union, overload
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ AsyncGenerator,
+ Generator,
+ Iterable,
+ Literal,
+ Optional,
+ Type,
+ TypeVar,
+ Union,
+ overload,
+)
+
from twisted.internet.defer import Deferred
+
from scrapy import Request
from scrapy.spiders import Spider
from scrapy.utils.defer import deferred_from_coro
from scrapy.utils.misc import arg_to_iter
+
if TYPE_CHECKING:
from scrapy.spiderloader import SpiderLoader
+
logger = logging.getLogger(__name__)
-_T = TypeVar('_T')
+
+_T = TypeVar("_T")
+
+
+# https://stackoverflow.com/questions/60222982
+@overload
+def iterate_spider_output(result: AsyncGenerator) -> AsyncGenerator: # type: ignore[misc]
+ ...
+
+
+@overload
+def iterate_spider_output(result: CoroutineType) -> Deferred:
+ ...
+
+
+@overload
+def iterate_spider_output(result: _T) -> Iterable:
+ ...
-def iter_spider_classes(module: ModuleType) ->Generator[Type[Spider], Any, None
- ]:
+def iterate_spider_output(result: Any) -> Union[Iterable, AsyncGenerator, Deferred]:
+ if inspect.isasyncgen(result):
+ return result
+ if inspect.iscoroutine(result):
+ d = deferred_from_coro(result)
+ d.addCallback(iterate_spider_output)
+ return d
+ return arg_to_iter(deferred_from_coro(result))
+
+
+def iter_spider_classes(module: ModuleType) -> Generator[Type[Spider], Any, None]:
"""Return an iterator over all spider classes defined in the given module
     that can be instantiated (i.e. which have a name)
"""
- pass
+    # this needs to be imported here until we get rid of the spider manager
+ # singleton in scrapy.spider.spiders
+ from scrapy.spiders import Spider
+
+ for obj in vars(module).values():
+ if (
+ inspect.isclass(obj)
+ and issubclass(obj, Spider)
+ and obj.__module__ == module.__name__
+ and getattr(obj, "name", None)
+ ):
+ yield obj
+
+
+@overload
+def spidercls_for_request(
+ spider_loader: SpiderLoader,
+ request: Request,
+ default_spidercls: Type[Spider],
+ log_none: bool = ...,
+ log_multiple: bool = ...,
+) -> Type[Spider]:
+ ...
+
+@overload
+def spidercls_for_request(
+ spider_loader: SpiderLoader,
+ request: Request,
+ default_spidercls: Literal[None],
+ log_none: bool = ...,
+ log_multiple: bool = ...,
+) -> Optional[Type[Spider]]:
+ ...
-def spidercls_for_request(spider_loader: SpiderLoader, request: Request,
- default_spidercls: Optional[Type[Spider]]=None, log_none: bool=False,
- log_multiple: bool=False) ->Optional[Type[Spider]]:
+
+@overload
+def spidercls_for_request(
+ spider_loader: SpiderLoader,
+ request: Request,
+ *,
+ log_none: bool = ...,
+ log_multiple: bool = ...,
+) -> Optional[Type[Spider]]:
+ ...
+
+
+def spidercls_for_request(
+ spider_loader: SpiderLoader,
+ request: Request,
+ default_spidercls: Optional[Type[Spider]] = None,
+ log_none: bool = False,
+ log_multiple: bool = False,
+) -> Optional[Type[Spider]]:
"""Return a spider class that handles the given Request.
This will look for the spiders that can handle the given request (using
@@ -35,8 +126,23 @@ def spidercls_for_request(spider_loader: SpiderLoader, request: Request,
default_spidercls passed. It can optionally log if multiple or no spiders
are found.
"""
- pass
+ snames = spider_loader.find_by_request(request)
+ if len(snames) == 1:
+ return spider_loader.load(snames[0])
+
+ if len(snames) > 1 and log_multiple:
+ logger.error(
+ "More than one spider can handle: %(request)s - %(snames)s",
+ {"request": request, "snames": ", ".join(snames)},
+ )
+
+ if len(snames) == 0 and log_none:
+ logger.error(
+ "Unable to find spider that handles: %(request)s", {"request": request}
+ )
+
+ return default_spidercls
class DefaultSpider(Spider):
- name = 'default'
+ name = "default"
diff --git a/scrapy/utils/ssl.py b/scrapy/utils/ssl.py
index 1588eed7a..d520ef809 100644
--- a/scrapy/utils/ssl.py
+++ b/scrapy/utils/ssl.py
@@ -1,6 +1,63 @@
from typing import Any, Optional
+
import OpenSSL._util as pyOpenSSLutil
import OpenSSL.SSL
import OpenSSL.version
from OpenSSL.crypto import X509Name
+
from scrapy.utils.python import to_unicode
+
+
+def ffi_buf_to_string(buf: Any) -> str:
+ return to_unicode(pyOpenSSLutil.ffi.string(buf))
+
+
+def x509name_to_string(x509name: X509Name) -> str:
+ # from OpenSSL.crypto.X509Name.__repr__
+ result_buffer: Any = pyOpenSSLutil.ffi.new("char[]", 512)
+ pyOpenSSLutil.lib.X509_NAME_oneline(
+ x509name._name, result_buffer, len(result_buffer) # type: ignore[attr-defined]
+ )
+
+ return ffi_buf_to_string(result_buffer)
+
+
+def get_temp_key_info(ssl_object: Any) -> Optional[str]:
+ # adapted from OpenSSL apps/s_cb.c::ssl_print_tmp_key()
+ if not hasattr(pyOpenSSLutil.lib, "SSL_get_server_tmp_key"):
+ # removed in cryptography 40.0.0
+ return None
+ temp_key_p = pyOpenSSLutil.ffi.new("EVP_PKEY **")
+ if not pyOpenSSLutil.lib.SSL_get_server_tmp_key(ssl_object, temp_key_p):
+ return None
+ temp_key = temp_key_p[0]
+ if temp_key == pyOpenSSLutil.ffi.NULL:
+ return None
+ temp_key = pyOpenSSLutil.ffi.gc(temp_key, pyOpenSSLutil.lib.EVP_PKEY_free)
+ key_info = []
+ key_type = pyOpenSSLutil.lib.EVP_PKEY_id(temp_key)
+ if key_type == pyOpenSSLutil.lib.EVP_PKEY_RSA:
+ key_info.append("RSA")
+ elif key_type == pyOpenSSLutil.lib.EVP_PKEY_DH:
+ key_info.append("DH")
+ elif key_type == pyOpenSSLutil.lib.EVP_PKEY_EC:
+ key_info.append("ECDH")
+ ec_key = pyOpenSSLutil.lib.EVP_PKEY_get1_EC_KEY(temp_key)
+ ec_key = pyOpenSSLutil.ffi.gc(ec_key, pyOpenSSLutil.lib.EC_KEY_free)
+ nid = pyOpenSSLutil.lib.EC_GROUP_get_curve_name(
+ pyOpenSSLutil.lib.EC_KEY_get0_group(ec_key)
+ )
+ cname = pyOpenSSLutil.lib.EC_curve_nid2nist(nid)
+ if cname == pyOpenSSLutil.ffi.NULL:
+ cname = pyOpenSSLutil.lib.OBJ_nid2sn(nid)
+ key_info.append(ffi_buf_to_string(cname))
+ else:
+ key_info.append(ffi_buf_to_string(pyOpenSSLutil.lib.OBJ_nid2sn(key_type)))
+ key_info.append(f"{pyOpenSSLutil.lib.EVP_PKEY_bits(temp_key)} bits")
+ return ", ".join(key_info)
+
+
+def get_openssl_version() -> str:
+ system_openssl_bytes = OpenSSL.SSL.SSLeay_version(OpenSSL.SSL.SSLEAY_VERSION)
+ system_openssl = system_openssl_bytes.decode("ascii", errors="replace")
+ return f"{OpenSSL.version.__version__} ({system_openssl})"
diff --git a/scrapy/utils/template.py b/scrapy/utils/template.py
index 705073b43..6b22f3bfa 100644
--- a/scrapy/utils/template.py
+++ b/scrapy/utils/template.py
@@ -1,13 +1,30 @@
"""Helper functions for working with templates"""
+
import re
import string
from os import PathLike
from pathlib import Path
from typing import Any, Union
-CAMELCASE_INVALID_CHARS = re.compile('[^a-zA-Z\\d]')
-def string_camelcase(string: str) ->str:
+def render_templatefile(path: Union[str, PathLike], **kwargs: Any) -> None:
+ path_obj = Path(path)
+ raw = path_obj.read_text("utf8")
+
+ content = string.Template(raw).substitute(**kwargs)
+
+ render_path = path_obj.with_suffix("") if path_obj.suffix == ".tmpl" else path_obj
+
+ if path_obj.suffix == ".tmpl":
+ path_obj.rename(render_path)
+
+ render_path.write_text(content, "utf8")
+
+
+CAMELCASE_INVALID_CHARS = re.compile(r"[^a-zA-Z\d]")
+
+
+def string_camelcase(string: str) -> str:
"""Convert a word to its CamelCase version and remove invalid chars
>>> string_camelcase('lost-pound')
@@ -17,4 +34,4 @@ def string_camelcase(string: str) ->str:
'MissingImages'
"""
- pass
+ return CAMELCASE_INVALID_CHARS.sub("", string.title())
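
A minimal sketch of `render_templatefile()` as implemented above, run against a throwaway temporary directory; the template text and substitution variables are invented.

```python
import tempfile
from pathlib import Path

from scrapy.utils.template import render_templatefile, string_camelcase

with tempfile.TemporaryDirectory() as tmp:
    tmpl = Path(tmp, "spider.py.tmpl")
    tmpl.write_text("class ${classname}(Spider):\n    name = '${name}'\n", "utf8")
    render_templatefile(
        tmpl, classname=string_camelcase("my-spider"), name="my-spider"
    )
    # The .tmpl suffix is dropped and the placeholders are substituted.
    print(Path(tmp, "spider.py").read_text("utf8"))
```
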
diff --git a/scrapy/utils/test.py b/scrapy/utils/test.py
index 88505e748..709e0b00d 100644
--- a/scrapy/utils/test.py
+++ b/scrapy/utils/test.py
@@ -1,6 +1,7 @@
"""
This module contains some assorted functions used in tests
"""
+
import asyncio
import os
from importlib import import_module
@@ -8,49 +9,144 @@ from pathlib import Path
from posixpath import split
from typing import Any, Coroutine, Dict, List, Optional, Tuple, Type
from unittest import TestCase, mock
+
from twisted.internet.defer import Deferred
from twisted.trial.unittest import SkipTest
+
from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.utils.boto import is_botocore_available
+def assert_gcs_environ() -> None:
+ if "GCS_PROJECT_ID" not in os.environ:
+ raise SkipTest("GCS_PROJECT_ID not found")
+
+
+def skip_if_no_boto() -> None:
+ if not is_botocore_available():
+ raise SkipTest("missing botocore library")
+
+
+def get_gcs_content_and_delete(
+ bucket: Any, path: str
+) -> Tuple[bytes, List[Dict[str, str]], Any]:
+ from google.cloud import storage
+
+ client = storage.Client(project=os.environ.get("GCS_PROJECT_ID"))
+ bucket = client.get_bucket(bucket)
+ blob = bucket.get_blob(path)
+ content = blob.download_as_string()
+    acl = list(blob.acl)  # load the ACL before the blob is deleted
+ bucket.delete_blob(path)
+ return content, acl, blob
+
+
+def get_ftp_content_and_delete(
+ path: str,
+ host: str,
+ port: int,
+ username: str,
+ password: str,
+ use_active_mode: bool = False,
+) -> bytes:
+ from ftplib import FTP
+
+ ftp = FTP()
+ ftp.connect(host, port)
+ ftp.login(username, password)
+ if use_active_mode:
+ ftp.set_pasv(False)
+ ftp_data: List[bytes] = []
+
+ def buffer_data(data: bytes) -> None:
+ ftp_data.append(data)
+
+ ftp.retrbinary(f"RETR {path}", buffer_data)
+ dirname, filename = split(path)
+ ftp.cwd(dirname)
+ ftp.delete(filename)
+ return b"".join(ftp_data)
+
+
class TestSpider(Spider):
- name = 'test'
+ name = "test"
-def get_crawler(spidercls: Optional[Type[Spider]]=None, settings_dict:
- Optional[Dict[str, Any]]=None, prevent_warnings: bool=True) ->Crawler:
+def get_crawler(
+ spidercls: Optional[Type[Spider]] = None,
+ settings_dict: Optional[Dict[str, Any]] = None,
+ prevent_warnings: bool = True,
+) -> Crawler:
"""Return an unconfigured Crawler object. If settings_dict is given, it
will be used to populate the crawler settings with a project level
priority.
"""
- pass
+ from scrapy.crawler import CrawlerRunner
+
+    # By default, set settings that prevent deprecation warnings.
+ settings: Dict[str, Any] = {}
+ if prevent_warnings:
+ settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7"
+ settings.update(settings_dict or {})
+ runner = CrawlerRunner(settings)
+ crawler = runner.create_crawler(spidercls or TestSpider)
+ crawler._apply_settings()
+ return crawler
-def get_pythonpath() ->str:
+def get_pythonpath() -> str:
"""Return a PYTHONPATH suitable to use in processes so that they find this
installation of Scrapy"""
- pass
+ scrapy_path = import_module("scrapy").__path__[0]
+ return str(Path(scrapy_path).parent) + os.pathsep + os.environ.get("PYTHONPATH", "")
-def get_testenv() ->Dict[str, str]:
+def get_testenv() -> Dict[str, str]:
"""Return a OS environment dict suitable to fork processes that need to import
this installation of Scrapy, instead of a system installed one.
"""
- pass
+ env = os.environ.copy()
+ env["PYTHONPATH"] = get_pythonpath()
+ return env
-def assert_samelines(testcase: TestCase, text1: str, text2: str, msg:
- Optional[str]=None) ->None:
+def assert_samelines(
+ testcase: TestCase, text1: str, text2: str, msg: Optional[str] = None
+) -> None:
"""Asserts text1 and text2 have the same lines, ignoring differences in
line endings between platforms
"""
- pass
+ testcase.assertEqual(text1.splitlines(), text2.splitlines(), msg)
-def mock_google_cloud_storage() ->Tuple[Any, Any, Any]:
+def get_from_asyncio_queue(value: Any) -> Coroutine:
+ q: asyncio.Queue = asyncio.Queue()
+ getter = q.get()
+ q.put_nowait(value)
+ return getter
+
+
+def mock_google_cloud_storage() -> Tuple[Any, Any, Any]:
"""Creates autospec mocks for google-cloud-storage Client, Bucket and Blob
     classes and sets their proper return values.
"""
- pass
+ from google.cloud.storage import Blob, Bucket, Client
+
+ client_mock = mock.create_autospec(Client)
+
+ bucket_mock = mock.create_autospec(Bucket)
+ client_mock.get_bucket.return_value = bucket_mock
+
+ blob_mock = mock.create_autospec(Blob)
+ bucket_mock.blob.return_value = blob_mock
+
+ return (client_mock, bucket_mock, blob_mock)
+
+
+def get_web_client_agent_req(url: str) -> Deferred:
+ from twisted.internet import reactor
+ from twisted.web.client import Agent # imports twisted.internet.reactor
+
+ agent = Agent(reactor)
+ return agent.request(b"GET", url.encode("utf-8"))
diff --git a/scrapy/utils/testproc.py b/scrapy/utils/testproc.py
index 4e09c46cc..0688e014b 100644
--- a/scrapy/utils/testproc.py
+++ b/scrapy/utils/testproc.py
@@ -1,7 +1,9 @@
from __future__ import annotations
+
import os
import sys
from typing import Iterable, List, Optional, Tuple, cast
+
from twisted.internet.defer import Deferred
from twisted.internet.error import ProcessTerminated
from twisted.internet.protocol import ProcessProtocol
@@ -10,14 +12,52 @@ from twisted.python.failure import Failure
class ProcessTest:
command = None
- prefix = [sys.executable, '-m', 'scrapy.cmdline']
- cwd = os.getcwd()
+ prefix = [sys.executable, "-m", "scrapy.cmdline"]
+ cwd = os.getcwd() # trial chdirs to temp dir
+ def execute(
+ self,
+ args: Iterable[str],
+ check_code: bool = True,
+ settings: Optional[str] = None,
+ ) -> Deferred:
+ from twisted.internet import reactor
+
+ env = os.environ.copy()
+ if settings is not None:
+ env["SCRAPY_SETTINGS_MODULE"] = settings
+ assert self.command
+ cmd = self.prefix + [self.command] + list(args)
+ pp = TestProcessProtocol()
+ pp.deferred.addCallback(self._process_finished, cmd, check_code)
+ reactor.spawnProcess(pp, cmd[0], cmd, env=env, path=self.cwd)
+ return pp.deferred
+
+ def _process_finished(
+ self, pp: TestProcessProtocol, cmd: List[str], check_code: bool
+ ) -> Tuple[int, bytes, bytes]:
+ if pp.exitcode and check_code:
+ msg = f"process {cmd} exit with code {pp.exitcode}"
+ msg += f"\n>>> stdout <<<\n{pp.out.decode()}"
+ msg += "\n"
+ msg += f"\n>>> stderr <<<\n{pp.err.decode()}"
+ raise RuntimeError(msg)
+ return cast(int, pp.exitcode), pp.out, pp.err
-class TestProcessProtocol(ProcessProtocol):
- def __init__(self) ->None:
+class TestProcessProtocol(ProcessProtocol):
+ def __init__(self) -> None:
self.deferred: Deferred = Deferred()
- self.out: bytes = b''
- self.err: bytes = b''
+ self.out: bytes = b""
+ self.err: bytes = b""
self.exitcode: Optional[int] = None
+
+ def outReceived(self, data: bytes) -> None:
+ self.out += data
+
+ def errReceived(self, data: bytes) -> None:
+ self.err += data
+
+ def processEnded(self, status: Failure) -> None:
+ self.exitcode = cast(ProcessTerminated, status.value).exitCode
+ self.deferred.callback(self)
diff --git a/scrapy/utils/testsite.py b/scrapy/utils/testsite.py
index c91969a13..de9ce992a 100644
--- a/scrapy/utils/testsite.py
+++ b/scrapy/utils/testsite.py
@@ -1,17 +1,55 @@
from urllib.parse import urljoin
+
from twisted.web import resource, server, static, util
class SiteTest:
- pass
+ def setUp(self):
+ from twisted.internet import reactor
+
+ super().setUp()
+ self.site = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
+ self.baseurl = f"http://localhost:{self.site.getHost().port}/"
+
+ def tearDown(self):
+ super().tearDown()
+ self.site.stopListening()
+
+ def url(self, path):
+ return urljoin(self.baseurl, path)
class NoMetaRefreshRedirect(util.Redirect):
- pass
+ def render(self, request):
+ content = util.Redirect.render(self, request)
+ return content.replace(
+ b'http-equiv="refresh"', b'http-no-equiv="do-not-refresh-me"'
+ )
-if __name__ == '__main__':
+def test_site():
+ r = resource.Resource()
+ r.putChild(b"text", static.Data(b"Works", "text/plain"))
+ r.putChild(
+ b"html",
+ static.Data(
+ b"<body><p class='one'>Works</p><p class='two'>World</p></body>",
+ "text/html",
+ ),
+ )
+ r.putChild(
+ b"enc-gb18030",
+ static.Data(b"<p>gb18030 encoding</p>", "text/html; charset=gb18030"),
+ )
+ r.putChild(b"redirect", util.Redirect(b"/redirected"))
+ r.putChild(b"redirect-no-meta-refresh", NoMetaRefreshRedirect(b"/redirected"))
+ r.putChild(b"redirected", static.Data(b"Redirected here", "text/plain"))
+ return server.Site(r)
+
+
+if __name__ == "__main__":
from twisted.internet import reactor
- port = reactor.listenTCP(0, test_site(), interface='127.0.0.1')
- print(f'http://localhost:{port.getHost().port}/')
+
+ port = reactor.listenTCP(0, test_site(), interface="127.0.0.1")
+ print(f"http://localhost:{port.getHost().port}/")
reactor.run()
diff --git a/scrapy/utils/trackref.py b/scrapy/utils/trackref.py
index 42dd22dd7..9ff9a273f 100644
--- a/scrapy/utils/trackref.py
+++ b/scrapy/utils/trackref.py
@@ -8,43 +8,65 @@ About performance: This library has a minimal performance impact when enabled,
and no performance penalty at all when disabled (as object_ref becomes just an
alias to object in that case).
"""
+
from collections import defaultdict
from operator import itemgetter
from time import time
from typing import TYPE_CHECKING, Any, DefaultDict, Iterable
from weakref import WeakKeyDictionary
+
if TYPE_CHECKING:
+ # typing.Self requires Python 3.11
from typing_extensions import Self
+
+
NoneType = type(None)
-live_refs: DefaultDict[type, WeakKeyDictionary] = defaultdict(WeakKeyDictionary
- )
+live_refs: DefaultDict[type, WeakKeyDictionary] = defaultdict(WeakKeyDictionary)
class object_ref:
"""Inherit from this class to a keep a record of live instances"""
+
__slots__ = ()
- def __new__(cls, *args: Any, **kwargs: Any) ->'Self':
+ def __new__(cls, *args: Any, **kwargs: Any) -> "Self":
obj = object.__new__(cls)
live_refs[cls][obj] = time()
return obj
-def format_live_refs(ignore: Any=NoneType) ->str:
+# using Any as it's hard to type type(None)
+def format_live_refs(ignore: Any = NoneType) -> str:
"""Return a tabular representation of tracked objects"""
- pass
+ s = "Live References\n\n"
+ now = time()
+ for cls, wdict in sorted(live_refs.items(), key=lambda x: x[0].__name__):
+ if not wdict:
+ continue
+ if issubclass(cls, ignore):
+ continue
+ oldest = min(wdict.values())
+ s += f"{cls.__name__:<30} {len(wdict):6} oldest: {int(now - oldest)}s ago\n"
+ return s
-def print_live_refs(*a: Any, **kw: Any) ->None:
+def print_live_refs(*a: Any, **kw: Any) -> None:
"""Print tracked objects"""
- pass
+ print(format_live_refs(*a, **kw))
-def get_oldest(class_name: str) ->Any:
+def get_oldest(class_name: str) -> Any:
"""Get the oldest object for a specific class name"""
- pass
+ for cls, wdict in live_refs.items():
+ if cls.__name__ == class_name:
+ if not wdict:
+ break
+ return min(wdict.items(), key=itemgetter(1))[0]
-def iter_all(class_name: str) ->Iterable[Any]:
+def iter_all(class_name: str) -> Iterable[Any]:
"""Iterate over all objects of the same class by its class name"""
- pass
+ for cls, wdict in live_refs.items():
+ if cls.__name__ == class_name:
+ return wdict.keys()
+ return []
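
A small sketch of the trackref helpers completed above; `TrackedThing` is a made-up class used only to populate `live_refs`.

```python
from scrapy.utils.trackref import format_live_refs, get_oldest, object_ref

class TrackedThing(object_ref):
    pass

things = [TrackedThing() for _ in range(3)]

print(format_live_refs())          # one row per tracked class still alive
print(get_oldest("TrackedThing"))  # the instance with the earliest timestamp
```
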
diff --git a/scrapy/utils/url.py b/scrapy/utils/url.py
index 31325513d..22b4197f9 100644
--- a/scrapy/utils/url.py
+++ b/scrapy/utils/url.py
@@ -8,37 +8,53 @@ to the w3lib.url module. Always import those from there instead.
import re
from typing import TYPE_CHECKING, Iterable, Optional, Type, Union, cast
from urllib.parse import ParseResult, urldefrag, urlparse, urlunparse
+
+# scrapy.utils.url was moved to w3lib.url and import * ensures this
+# move doesn't break old code
from w3lib.url import *
-from w3lib.url import _safe_chars, _unquotepath
+from w3lib.url import _safe_chars, _unquotepath # noqa: F401
+
from scrapy.utils.python import to_unicode
+
if TYPE_CHECKING:
from scrapy import Spider
+
+
UrlT = Union[str, bytes, ParseResult]
-def url_is_from_any_domain(url: UrlT, domains: Iterable[str]) ->bool:
+def url_is_from_any_domain(url: UrlT, domains: Iterable[str]) -> bool:
"""Return True if the url belongs to any of the given domains"""
- pass
+ host = parse_url(url).netloc.lower()
+ if not host:
+ return False
+ domains = [d.lower() for d in domains]
+ return any((host == d) or (host.endswith(f".{d}")) for d in domains)
-def url_is_from_spider(url: UrlT, spider: Type['Spider']) ->bool:
+def url_is_from_spider(url: UrlT, spider: Type["Spider"]) -> bool:
"""Return True if the url belongs to the given spider"""
- pass
+ return url_is_from_any_domain(
+ url, [spider.name] + list(getattr(spider, "allowed_domains", []))
+ )
-def url_has_any_extension(url: UrlT, extensions: Iterable[str]) ->bool:
+def url_has_any_extension(url: UrlT, extensions: Iterable[str]) -> bool:
"""Return True if the url ends with one of the extensions provided"""
- pass
+ lowercase_path = parse_url(url).path.lower()
+ return any(lowercase_path.endswith(ext) for ext in extensions)
-def parse_url(url: UrlT, encoding: Optional[str]=None) ->ParseResult:
+def parse_url(url: UrlT, encoding: Optional[str] = None) -> ParseResult:
"""Return urlparsed url from the given argument (which could be an already
parsed url)
"""
- pass
+ if isinstance(url, ParseResult):
+ return url
+ return cast(ParseResult, urlparse(to_unicode(url, encoding)))
-def escape_ajax(url: str) ->str:
+def escape_ajax(url: str) -> str:
"""
Return the crawlable url according to:
https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
@@ -61,22 +77,80 @@ def escape_ajax(url: str) ->str:
>>> escape_ajax("www.example.com/ajax.html")
'www.example.com/ajax.html'
"""
- pass
+ defrag, frag = urldefrag(url)
+ if not frag.startswith("!"):
+ return url
+ return add_or_replace_parameter(defrag, "_escaped_fragment_", frag[1:])
-def add_http_if_no_scheme(url: str) ->str:
+def add_http_if_no_scheme(url: str) -> str:
"""Add http as the default scheme if it is missing from the url."""
- pass
-
-
-def guess_scheme(url: str) ->str:
+ match = re.match(r"^\w+://", url, flags=re.I)
+ if not match:
+ parts = urlparse(url)
+ scheme = "http:" if parts.netloc else "http://"
+ url = scheme + url
+
+ return url
+
+
+def _is_posix_path(string: str) -> bool:
+ return bool(
+ re.match(
+ r"""
+ ^ # start with...
+ (
+ \. # ...a single dot,
+ (
+ \. | [^/\.]+ # optionally followed by
+ )? # either a second dot or some characters
+ |
+ ~ # $HOME
+ )? # optional match of ".", ".." or ".blabla"
+ / # at least one "/" for a file path,
+ . # and something after the "/"
+ """,
+ string,
+ flags=re.VERBOSE,
+ )
+ )
+
+
+def _is_windows_path(string: str) -> bool:
+ return bool(
+ re.match(
+ r"""
+ ^
+ (
+ [a-z]:\\
+ | \\\\
+ )
+ """,
+ string,
+ flags=re.IGNORECASE | re.VERBOSE,
+ )
+ )
+
+
+def _is_filesystem_path(string: str) -> bool:
+ return _is_posix_path(string) or _is_windows_path(string)
+
+
+def guess_scheme(url: str) -> str:
"""Add an URL scheme if missing: file:// for filepath-like input or
http:// otherwise."""
- pass
-
-
-def strip_url(url: str, strip_credentials: bool=True, strip_default_port:
- bool=True, origin_only: bool=False, strip_fragment: bool=True) ->str:
+ if _is_filesystem_path(url):
+ return any_to_uri(url)
+ return add_http_if_no_scheme(url)
+
+
+def strip_url(
+ url: str,
+ strip_credentials: bool = True,
+ strip_default_port: bool = True,
+ origin_only: bool = False,
+ strip_fragment: bool = True,
+) -> str:
"""Strip URL string from some of its components:
- ``strip_credentials`` removes "user:password@"
@@ -86,4 +160,27 @@ def strip_url(url: str, strip_credentials: bool=True, strip_default_port:
query and fragment components ; it also strips credentials
- ``strip_fragment`` drops any #fragment component
"""
- pass
+
+ parsed_url = urlparse(url)
+ netloc = parsed_url.netloc
+ if (strip_credentials or origin_only) and (
+ parsed_url.username or parsed_url.password
+ ):
+ netloc = netloc.split("@")[-1]
+ if strip_default_port and parsed_url.port:
+ if (parsed_url.scheme, parsed_url.port) in (
+ ("http", 80),
+ ("https", 443),
+ ("ftp", 21),
+ ):
+ netloc = netloc.replace(f":{parsed_url.port}", "")
+ return urlunparse(
+ (
+ parsed_url.scheme,
+ netloc,
+ "/" if origin_only else parsed_url.path,
+ "" if origin_only else parsed_url.params,
+ "" if origin_only else parsed_url.query,
+ "" if strip_fragment else parsed_url.fragment,
+ )
+ )
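
The URL helpers above can be checked quickly with placeholder URLs: `strip_url()` with default arguments drops credentials, default ports and fragments while keeping the path and query.

```python
from scrapy.utils.url import (
    add_http_if_no_scheme,
    strip_url,
    url_is_from_any_domain,
)

print(add_http_if_no_scheme("www.example.com/page"))
# http://www.example.com/page

print(url_is_from_any_domain("https://shop.example.com/x", ["example.com"]))
# True (subdomains match)

print(strip_url("https://user:secret@example.com:443/path?q=1#frag"))
# https://example.com/path?q=1
```
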
diff --git a/scrapy/utils/versions.py b/scrapy/utils/versions.py
index b49afb199..42e5e9be4 100644
--- a/scrapy/utils/versions.py
+++ b/scrapy/utils/versions.py
@@ -1,11 +1,32 @@
import platform
import sys
from typing import List, Tuple
+
import cryptography
import cssselect
-import lxml.etree
+import lxml.etree # nosec
import parsel
import twisted
import w3lib
+
import scrapy
from scrapy.utils.ssl import get_openssl_version
+
+
+def scrapy_components_versions() -> List[Tuple[str, str]]:
+ lxml_version = ".".join(map(str, lxml.etree.LXML_VERSION))
+ libxml2_version = ".".join(map(str, lxml.etree.LIBXML_VERSION))
+
+ return [
+ ("Scrapy", scrapy.__version__),
+ ("lxml", lxml_version),
+ ("libxml2", libxml2_version),
+ ("cssselect", cssselect.__version__),
+ ("parsel", parsel.__version__),
+ ("w3lib", w3lib.__version__),
+ ("Twisted", twisted.version.short()),
+ ("Python", sys.version.replace("\n", "- ")),
+ ("pyOpenSSL", get_openssl_version()),
+ ("cryptography", cryptography.__version__),
+ ("Platform", platform.platform()),
+ ]
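
Finally, a trivial sketch that prints the table produced by `scrapy_components_versions()`:

```python
from scrapy.utils.versions import scrapy_components_versions

for name, version in scrapy_components_versions():
    print(f"{name:>12}: {version}")
```
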