Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed GoogleScraper/__pycache__/__init__.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/caching.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/commandline.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/config.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/core.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/database.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/http_mode.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/log.cpython-34.pyc
Binary file not shown.
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/parsing.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/proxies.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/scrape_jobs.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/scraping.cpython-34.pyc
Binary file not shown.
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/socks.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/utils.cpython-34.pyc
Binary file not shown.
Binary file removed GoogleScraper/__pycache__/version.cpython-34.pyc
Binary file not shown.
11 changes: 11 additions & 0 deletions GoogleScraper/config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,11 @@ check_proxies: True
; response when something fails.
raise_exceptions_while_scraping: False

; The following two options only make sense when search_engine is set to "googleimg"
; Do NOT use them unless you are sure what you are going to do.
image_type: None
image_size: None

; Global configuration parameters that apply on all modes.
[GLOBAL]
; The proxy file. If this is a valid file path, each line will represent a proxy.
Expand Down Expand Up @@ -236,6 +241,12 @@ sel_browser: Chrome
; with the current proxy is discarded.
manual_captcha_solving: False

; Xvfb display option
; You have to start Xvfb on your own.
; Format: [hostname]:displaynumber[.screennumber], see the X(7) manual page for details.
; The environment variable $DISPLAY will be set to this value.
xvfb_display: None

; All settings that target the raw http packet scraping mode.
[HTTP]

Expand Down
4 changes: 2 additions & 2 deletions GoogleScraper/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,15 +953,15 @@ def get_parser_by_search_engine(search_engine):
Raises:
NoParserForSearchEngineException if no parser could be found for the name.
"""
if search_engine == 'google':
if search_engine == 'google' or search_engine == 'googleimg':
return GoogleParser
elif search_engine == 'yandex':
return YandexParser
elif search_engine == 'bing':
return BingParser
elif search_engine == 'yahoo':
return YahooParser
elif search_engine == 'baidu':
elif search_engine == 'baidu' or search_engine == 'baiduimg':
return BaiduParser
elif search_engine == 'duckduckgo':
return DuckduckgoParser
Expand Down
88 changes: 86 additions & 2 deletions GoogleScraper/selenium_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import math
import re
import sys
import os

try:
from selenium import webdriver
Expand Down Expand Up @@ -61,7 +62,9 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': '.n',
'ask': '#paging div a.txt3.l_nu',
'blekko': '',
'duckduckgo': ''
'duckduckgo': '',
'googleimg': '#pnnext',
'baiduimg': '.n',
}

input_field_selectors = {
Expand All @@ -73,6 +76,23 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'duckduckgo': (By.NAME, 'q'),
'ask': (By.NAME, 'q'),
'blekko': (By.NAME, 'q'),
'google': (By.NAME, 'q'),
'googleimg': (By.NAME, 'as_q'),
'baiduimg': (By.NAME, 'word'),
}

param_field_selectors = {
'googleimg': {
'image_type': (By.ID, 'imgtype_input'),
'image_size': (By.ID, 'imgsz_input'),
},
}

search_params = {
'googleimg': {
'image_type': None,
'image_size': None,
},
}

normal_search_locations = {
Expand All @@ -83,7 +103,7 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'baidu': 'http://baidu.com/',
'duckduckgo': 'https://duckduckgo.com/',
'ask': 'http://ask.com/',
'blekko': 'http://blekko.com/'
'blekko': 'http://blekko.com/',
}

image_search_locations = {
Expand All @@ -95,6 +115,8 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'duckduckgo': None, # duckduckgo doesnt't support direct image search
'ask': 'http://www.ask.com/pictures/',
'blekko': None,
'googleimg':'https://www.google.com/advanced_image_search',
'baiduimg': 'http://image.baidu.com/',
}

def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):
Expand All @@ -115,6 +137,10 @@ def __init__(self, *args, captcha_lock=None, browser_num=1, **kwargs):
self.captcha_lock = captcha_lock
self.scrape_method = 'selenium'

self.xvfb_display = Config['SELENIUM'].get('xvfb_display', None)

self.search_param_values = self._get_search_param_values()

# get the base search url based on the search engine.
self.base_search_url = get_base_search_url_by_search_engine(self.search_engine_name, self.scrape_method)
super().instance_creation_info(self.__class__.__name__)
Expand Down Expand Up @@ -153,6 +179,11 @@ def proxy_check(self):

return online

def _set_xvfb_display(self):
# TODO: should we check the format of the config?
if self.xvfb_display:
os.environ['DISPLAY'] = self.xvfb_display

def _get_webdriver(self):
"""Return a webdriver instance and set it up with the according profile/ proxies.

Expand Down Expand Up @@ -292,6 +323,15 @@ def build_search(self):

self.webdriver.get(self.starting_point)

def _get_search_param_values(self):
search_param_values = {}
if self.search_engine_name in self.search_params:
for param_key in self.search_params[self.search_engine_name]:
cfg = Config['SCRAPING'].get(param_key, None)
if cfg:
search_param_values[param_key] = cfg
return search_param_values

def _get_search_input_field(self):
"""Get the search input field for the current search_engine.

Expand All @@ -300,6 +340,12 @@ def _get_search_input_field(self):
"""
return self.input_field_selectors[self.search_engine_name]

def _get_search_param_fields(self):
if self.search_engine_name in self.param_field_selectors:
return self.param_field_selectors[self.search_engine_name]
else:
return {}

def _wait_until_search_input_field_appears(self, max_wait=5):
"""Waits until the search input field can be located for the current search engine

Expand All @@ -321,6 +367,20 @@ def find_visible_search_input(driver):
logger.error('{}: TimeoutException waiting for search input field: {}'.format(self.name, e))
return False

def _wait_until_search_param_fields_appears(self, max_wait=5):
    """Wait until every extra search parameter field can be located.

    Args:
        max_wait: Timeout value handed to ``WebDriverWait``.

    Returns:
        The truthy result of the wait once all fields returned by
        ``_get_search_param_fields()`` were found, or False when a
        ``TimeoutException`` was raised (which is also logged).
    """
    def find_visible_search_param(driver):
        # One lookup per configured locator; any falsy lookup fails the poll.
        locators = self._get_search_param_fields().values()
        return all(driver.find_element(*locator) for locator in locators)

    try:
        waiter = WebDriverWait(self.webdriver, max_wait)
        return waiter.until(find_visible_search_param)
    except TimeoutException as e:
        logger.error('{}: TimeoutException waiting for search param field: {}'.format(self.name, e))
        return False

def _wait_until_search_input_field_contains_query(self, max_wait=5):
"""Waits until the search input field contains the query.
Expand Down Expand Up @@ -439,6 +499,28 @@ def search(self):
self.search_input.clear()
time.sleep(.25)

self.search_param_fields = self._get_search_param_fields()

if self.search_param_fields:
wait_res = self._wait_until_search_param_fields_appears()
if wait_res is False:
raise Exception('Waiting search param input fields time exceeds')
for param, field in self.search_param_fields.items():
if field[0] == By.ID:
js_tpl = '''
var field = document.getElementById("%s");
field.setAttribute("value", "%s");
'''
elif field[0] == By.NAME:
js_tpl = '''
var fields = document.getElementsByName("%s");
for (var f in fields) {
f.setAttribute("value", "%s");
}
'''
js_str = js_tpl % (field[1], self.search_param_values[param])
self.webdriver.execute_script(js_str)

try:
self.search_input.send_keys(self.query + Keys.ENTER)
except ElementNotVisibleException as e:
Expand Down Expand Up @@ -495,6 +577,8 @@ def page_down(self):
def run(self):
"""Run the SelScraper."""

self._set_xvfb_display()

if not self._get_webdriver():
raise_or_log('{}: Aborting due to no available selenium webdriver.'.format(self.name), exception_obj=SeleniumMisconfigurationError)

Expand Down