NikolaiT · leadscloud · Jan 12, 2015 · Jan 12, 2015 · Jan 12, 2015
diff --git a/GoogleScraper/config.cfg b/GoogleScraper/config.cfg
@@ -99,6 +99,9 @@ use_own_ip: True
 ; Whether to check proxies before starting the scrape
 check_proxies: True
 
+; Number of seconds to wait for an HTTP response before the request is aborted (values below 10 are raised to 10)
+timeout: 10
+
 ; Global configuration parameters that apply on all modes.
 [GLOBAL]
 ; The proxy file. If this is a valid file path, each line will represent a proxy.

diff --git a/GoogleScraper/core.py b/GoogleScraper/core.py
@@ -259,6 +259,10 @@ def main(return_results=False, parse_cmd_line=True):
 
     if Config['SCRAPING'].getboolean('use_own_ip'):
         proxies.append(None)
+
+    request_timeout = Config['SCRAPING'].getint('timeout', 10)
+    if request_timeout < 10:
+        request_timeout = 10
 
     if not proxies:
         raise InvalidConfigurationException("No proxies available and using own IP is prohibited by configuration. Turning down.")
@@ -398,6 +402,7 @@ def main(return_results=False, parse_cmd_line=True):
                                 db_lock=db_lock,
                                 proxy=proxy_to_use,
                                 progress_queue=q,
+                                request_timeout=request_timeout
                             )
                         )
 

diff --git a/GoogleScraper/http.py b/GoogleScraper/http.py
@@ -250,7 +250,8 @@ def search(self, *args, rand=False, **kwargs):
             super().detection_prevention_sleep()
             super().keyword_info()
 
-            request = self.requests.get(self.base_search_url + urlencode(self.search_params), headers=self.headers, timeout=5)
+            request = self.requests.get(self.base_search_url + urlencode(self.search_params), headers=self.headers,
+                                        timeout=self.request_timeout)
 
             self.current_request_time = datetime.datetime.utcnow()
             self.html = request.text

diff --git a/GoogleScraper/scraping.py b/GoogleScraper/scraping.py
@@ -136,7 +136,8 @@ class SearchEngineScrape(metaclass=abc.ABCMeta):
     }
 
     def __init__(self, keywords=None, scraper_search=None, session=None, db_lock=None, cache_lock=None,
-                 start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None):
+                 start_page_pos=1, search_engine=None, search_type=None, proxy=None, progress_queue=None,
+                 request_timeout=10):
         """Instantiate an SearchEngineScrape object.
 
         Args:
@@ -240,6 +241,9 @@ def __init__(self, keywords=None, scraper_search=None, session=None, db_lock=Non
         # the default timeout
         self.timeout = 5
 
+        # timeout (in seconds) applied to each outgoing HTTP request
+        self.request_timeout = request_timeout
+
 
 
     @abc.abstractmethod
@@ -275,7 +279,7 @@ def blocking_search(self, callback, *args, **kwargs):
                     # Leave search when search engines detected us
                     # add the rest of the keywords as missed one
                     logger.critical(e)
-                    self.missed_keywords.add(self.keywords[i:])
+                    self.missed_keywords.add(self.keywords[i])
                     continue
 
     @abc.abstractmethod

diff --git a/GoogleScraper/selenium.py b/GoogleScraper/selenium.py
@@ -364,7 +364,10 @@ def search(self):
             if self.search_input:
                 self.search_input.clear()
                 time.sleep(.25)
-                self.search_input.send_keys(self.current_keyword + Keys.ENTER)
+                self.search_input.send_keys(self.current_keyword)
+                if self.browser_type == 'phantomjs':
+                    time.sleep(1)  # PhantomJS is much faster than Firefox or Chrome
+                self.search_input.send_keys(Keys.ENTER)
                 self.current_request_time = datetime.datetime.utcnow()
             else:
                 logger.warning('Cannot get handle to the input form for keyword {}.'.format(self.current_keyword))