diff --git a/GoogleScraper/__pycache__/__init__.cpython-34.pyc b/GoogleScraper/__pycache__/__init__.cpython-34.pyc deleted file mode 100644 index 5c27134e..00000000 Binary files a/GoogleScraper/__pycache__/__init__.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/caching.cpython-34.pyc b/GoogleScraper/__pycache__/caching.cpython-34.pyc deleted file mode 100644 index 26f4cf24..00000000 Binary files a/GoogleScraper/__pycache__/caching.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/commandline.cpython-34.pyc b/GoogleScraper/__pycache__/commandline.cpython-34.pyc deleted file mode 100644 index b6b515c4..00000000 Binary files a/GoogleScraper/__pycache__/commandline.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/config.cpython-34.pyc b/GoogleScraper/__pycache__/config.cpython-34.pyc deleted file mode 100644 index c4582a4d..00000000 Binary files a/GoogleScraper/__pycache__/config.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/core.cpython-34.pyc b/GoogleScraper/__pycache__/core.cpython-34.pyc deleted file mode 100644 index ab3a41c9..00000000 Binary files a/GoogleScraper/__pycache__/core.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/database.cpython-34.pyc b/GoogleScraper/__pycache__/database.cpython-34.pyc deleted file mode 100644 index 68bc2e89..00000000 Binary files a/GoogleScraper/__pycache__/database.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/http_mode.cpython-34.pyc b/GoogleScraper/__pycache__/http_mode.cpython-34.pyc deleted file mode 100644 index 2774016a..00000000 Binary files a/GoogleScraper/__pycache__/http_mode.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/log.cpython-34.pyc b/GoogleScraper/__pycache__/log.cpython-34.pyc deleted file mode 100644 index b8c0ed1f..00000000 Binary files a/GoogleScraper/__pycache__/log.cpython-34.pyc and /dev/null differ diff --git 
a/GoogleScraper/__pycache__/output_converter.cpython-34.pyc b/GoogleScraper/__pycache__/output_converter.cpython-34.pyc deleted file mode 100644 index 452c3b8d..00000000 Binary files a/GoogleScraper/__pycache__/output_converter.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/parsing.cpython-34.pyc b/GoogleScraper/__pycache__/parsing.cpython-34.pyc deleted file mode 100644 index d80a12e9..00000000 Binary files a/GoogleScraper/__pycache__/parsing.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/proxies.cpython-34.pyc b/GoogleScraper/__pycache__/proxies.cpython-34.pyc deleted file mode 100644 index 3d69d4ba..00000000 Binary files a/GoogleScraper/__pycache__/proxies.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/scrape_jobs.cpython-34.pyc b/GoogleScraper/__pycache__/scrape_jobs.cpython-34.pyc deleted file mode 100644 index 87c9112c..00000000 Binary files a/GoogleScraper/__pycache__/scrape_jobs.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/scraping.cpython-34.pyc b/GoogleScraper/__pycache__/scraping.cpython-34.pyc deleted file mode 100644 index 05fbfdf3..00000000 Binary files a/GoogleScraper/__pycache__/scraping.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/selenium_mode.cpython-34.pyc b/GoogleScraper/__pycache__/selenium_mode.cpython-34.pyc deleted file mode 100644 index b8ea5577..00000000 Binary files a/GoogleScraper/__pycache__/selenium_mode.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/socks.cpython-34.pyc b/GoogleScraper/__pycache__/socks.cpython-34.pyc deleted file mode 100644 index ca864664..00000000 Binary files a/GoogleScraper/__pycache__/socks.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/__pycache__/utils.cpython-34.pyc b/GoogleScraper/__pycache__/utils.cpython-34.pyc deleted file mode 100644 index 3d9a5dc2..00000000 Binary files a/GoogleScraper/__pycache__/utils.cpython-34.pyc 
and /dev/null differ diff --git a/GoogleScraper/__pycache__/version.cpython-34.pyc b/GoogleScraper/__pycache__/version.cpython-34.pyc deleted file mode 100644 index bbac7613..00000000 Binary files a/GoogleScraper/__pycache__/version.cpython-34.pyc and /dev/null differ diff --git a/GoogleScraper/config.cfg b/GoogleScraper/config.cfg index 323a7016..d711a517 100644 --- a/GoogleScraper/config.cfg +++ b/GoogleScraper/config.cfg @@ -106,6 +106,11 @@ check_proxies: True ; response when something fails. raise_exceptions_while_scraping: False +; The following two options only make sense when search_engine is set to "googleimg" +; do NOT use them unless you are sure what you are going to do +image_type: None +image_size: None + ; Global configuration parameters that apply on all modes. [GLOBAL] ; The proxy file. If this is a valid file path, each line will represent a proxy. @@ -236,6 +241,12 @@ sel_browser: Chrome ; with the current proxy is discarded. manual_captcha_solving: False +; Xvfb display option +; You have to start xvfb on your own +; Format: [hostname]:displaynumber[.screennumber], see the X(7) manual page for details +; The environment variable $DISPLAY will be set to this value +xvfb_display: None + ; All settings that target the raw http packet scraping mode. [HTTP] diff --git a/GoogleScraper/parsing.py b/GoogleScraper/parsing.py index 58e62f59..bfa2a2b9 100644 --- a/GoogleScraper/parsing.py +++ b/GoogleScraper/parsing.py @@ -953,7 +953,7 @@ def get_parser_by_search_engine(search_engine): Raises: NoParserForSearchEngineException if no parser could be found for the name. 
""" - if search_engine == 'google': + if search_engine == 'google' or search_engine == 'googleimg': return GoogleParser elif search_engine == 'yandex': return YandexParser @@ -961,7 +961,7 @@ def get_parser_by_search_engine(search_engine): return BingParser elif search_engine == 'yahoo': return YahooParser - elif search_engine == 'baidu': + elif search_engine == 'baidu' or search_engine == 'baiduimg': return BaiduParser elif search_engine == 'duckduckgo': return DuckduckgoParser diff --git a/GoogleScraper/selenium_mode.py b/GoogleScraper/selenium_mode.py index 9e580610..1180d851 100644 --- a/GoogleScraper/selenium_mode.py +++ b/GoogleScraper/selenium_mode.py @@ -9,6 +9,7 @@ import math import re import sys +import os try: from selenium import webdriver @@ -61,7 +62,9 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'baidu': '.n', 'ask': '#paging div a.txt3.l_nu', 'blekko': '', - 'duckduckgo': '' + 'duckduckgo': '', + 'googleimg': '#pnnext', + 'baiduimg': '.n', } input_field_selectors = { @@ -73,6 +76,23 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'duckduckgo': (By.NAME, 'q'), 'ask': (By.NAME, 'q'), 'blekko': (By.NAME, 'q'), + 'google': (By.NAME, 'q'), + 'googleimg': (By.NAME, 'as_q'), + 'baiduimg': (By.NAME, 'word'), + } + + param_field_selectors = { + 'googleimg': { + 'image_type': (By.ID, 'imgtype_input'), + 'image_size': (By.ID, 'imgsz_input'), + }, + } + + search_params = { + 'googleimg': { + 'image_type': None, + 'image_size': None, + }, } normal_search_locations = { @@ -83,7 +103,7 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'baidu': 'http://baidu.com/', 'duckduckgo': 'https://duckduckgo.com/', 'ask': 'http://ask.com/', - 'blekko': 'http://blekko.com/' + 'blekko': 'http://blekko.com/', } image_search_locations = { @@ -95,6 +115,8 @@ class SelScrape(SearchEngineScrape, threading.Thread): 'duckduckgo': None, # duckduckgo doesnt't support direct image search 'ask': 'http://www.ask.com/pictures/', 'blekko': None, + 
'googleimg':'https://www.google.com/advanced_image_search', + 'baiduimg': 'http://image.baidu.com/', } def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs): @@ -115,6 +137,10 @@ def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs): self.captcha_lock = captcha_lock self.scrape_method = 'selenium' + self.xvfb_display = Config['SELENIUM'].get('xvfb_display', None) + + self.search_param_values = self._get_search_param_values() + # get the base search url based on the search engine. self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, self.scrape_method) super().instance_creation_info(self.__class__.__name__) @@ -153,6 +179,11 @@ def proxy_check(self): return online + def _set_xvfb_display(self): + # TODO: should we check the format of the config? + if self.xvfb_display: + os.environ['DISPLAY'] = self.xvfb_display + def _get_webdriver(self): """Return a webdriver instance and set it up with the according profile/ proxies. @@ -292,6 +323,15 @@ def build_search(self): self.webdriver.get(self.starting_point) + def _get_search_param_values(self): + search_param_values = {} + if self.search_engine_name in self.search_params: + for param_key in self.search_params[self.search_engine_name]: + cfg = Config['SCRAPING'].get(param_key, None) + if cfg: + search_param_values[param_key] = cfg + return search_param_values + def _get_search_input_field(self): """Get the search input field for the current search_engine. 
@@ -300,6 +340,12 @@ def _get_search_input_field(self): """ return self.input_field_selectors[self.search_engine_name] + def _get_search_param_fields(self): + if self.search_engine_name in self.param_field_selectors: + return self.param_field_selectors[self.search_engine_name] + else: + return {} + def _wait_until_search_input_field_appears(self, max_wait=5): """Waits until the search input field can be located for the current search engine @@ -321,6 +367,20 @@ def find_visible_search_input(driver): logger.error('{}: TimeoutException waiting for search input field: {}'.format(self.name, e)) return False + def _wait_until_search_param_fields_appears(self, max_wait=5): + def find_visible_search_param(driver): + for param, field in self._get_search_param_fields().items(): + input_field = driver.find_element(*field) + if not input_field: + return False + return True + + try: + fields = WebDriverWait(self.webdriver, max_wait).until(find_visible_search_param) + return fields + except TimeoutException as e: + logger.error('{}: TimeoutException waiting for search param field: {}'.format(self.name, e)) + return False def _wait_until_search_input_field_contains_query(self, max_wait=5): """Waits until the search input field contains the query. 
@@ -439,6 +499,28 @@ def search(self): self.search_input.clear() time.sleep(.25) + self.search_param_fields = self._get_search_param_fields() + + if self.search_param_fields: + wait_res = self._wait_until_search_param_fields_appears() + if wait_res is False: + raise Exception('Waiting search param input fields time exceeds') + for param, field in self.search_param_fields.items(): + if field[0] == By.ID: + js_tpl = ''' + var field = document.getElementById("%s"); + field.setAttribute("value", "%s"); + ''' + elif field[0] == By.NAME: + js_tpl = ''' + var fields = document.getElementsByName("%s"); + for (var f in fields) { + f.setAttribute("value", "%s"); + } + ''' + js_str = js_tpl % (field[1], self.search_param_values[param]) + self.webdriver.execute_script(js_str) + try: self.search_input.send_keys(self.query + Keys.ENTER) except ElementNotVisibleException as e: @@ -495,6 +577,8 @@ def page_down(self): def run(self): """Run the SelScraper.""" + self._set_xvfb_display() + if not self._get_webdriver(): raise_or_log('{}: Aborting due to no available selenium webdriver.'.format(self.name), exception_obj=SeleniumMisconfigurationError)