From bcbb1b5f98e11c76cab540d070ffcfa052067b8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oskar=20S=C3=B6derbom?=
Date: Wed, 24 Jul 2024 21:33:05 +0200
Subject: [PATCH 1/5] start of search results is now an argument

---
 googlesearch/__init__.py | 43 ++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index ab38640..5d41c86 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -1,4 +1,5 @@
 """googlesearch is a Python library for searching Google, easily."""
+
 from time import sleep
 from bs4 import BeautifulSoup
 from requests import get
@@ -8,9 +9,7 @@
 def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify):
     resp = get(
         url="https://www.google.com/search",
-        headers={
-            "User-Agent": get_useragent()
-        },
+        headers={"User-Agent": get_useragent()},
         params={
             "q": term,
             "num": results + 2,  # Prevents multiple requests
@@ -35,20 +34,44 @@ def __init__(self, url, title, description):
     def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
 
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None):
+
+def search(
+    term,
+    num_results=10,
+    lang="en",
+    proxy=None,
+    advanced=False,
+    sleep_interval=0,
+    timeout=5,
+    safe="active",
+    ssl_verify=None,
+    start_num=0,
+):
     """Search the Google search engine"""
 
     escaped_term = term.replace(" ", "+")
 
     # Proxy setup
-    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+    proxies = (
+        {"https": proxy, "http": proxy}
+        if proxy and (proxy.startswith("https") or proxy.startswith("http"))
+        else None
+    )
 
-    start = 0
+    start = start_num
     fetched_results = 0  # Keep track of the total fetched results
 
     while fetched_results < num_results:
         # Send request
-        resp = _req(escaped_term, num_results - start,
-                    lang, start, proxies, timeout, safe, ssl_verify)
+        resp = _req(
+            escaped_term,
+            num_results - start,
+            lang,
+            start,
+            proxies,
+            timeout,
+            safe,
+            ssl_verify,
+        )
 
         # Parse
         soup = BeautifulSoup(resp.text, "html.parser")
@@ -74,8 +97,8 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
             break  # Stop if we have fetched the desired number of results
 
         if new_results == 0:
-            #If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
-            #print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
+            # If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
+            # print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
             break  # Break the loop if no new results were found in this iteration
 
         start += 10  # Prepare for the next set of results

From 6acc5bb58064ff13ea0d01cad3499334c797e24c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oskar=20S=C3=B6derbom?=
Date: Wed, 24 Jul 2024 21:34:55 +0200
Subject: [PATCH 2/5] start of search is now an argument, not hardcoded

---
 googlesearch/__init__.py | 41 +++++++++------------------------------
 1 file changed, 9 insertions(+), 32 deletions(-)

diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 5d41c86..0144090 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -1,5 +1,4 @@
 """googlesearch is a Python library for searching Google, easily."""
-
 from time import sleep
 from bs4 import BeautifulSoup
 from requests import get
@@ -9,7 +8,9 @@
 def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify):
     resp = get(
         url="https://www.google.com/search",
-        headers={"User-Agent": get_useragent()},
+        headers={
+            "User-Agent": get_useragent()
+        },
         params={
             "q": term,
             "num": results + 2,  # Prevents multiple requests
@@ -34,44 +35,20 @@ def __init__(self, url, title, description):
     def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
 
-
-def search(
-    term,
-    num_results=10,
-    lang="en",
-    proxy=None,
-    advanced=False,
-    sleep_interval=0,
-    timeout=5,
-    safe="active",
-    ssl_verify=None,
-    start_num=0,
-):
+def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, start_num=0):
     """Search the Google search engine"""
 
     escaped_term = term.replace(" ", "+")
 
     # Proxy setup
-    proxies = (
-        {"https": proxy, "http": proxy}
-        if proxy and (proxy.startswith("https") or proxy.startswith("http"))
-        else None
-    )
+    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
 
     start = start_num
     fetched_results = 0  # Keep track of the total fetched results
 
     while fetched_results < num_results:
         # Send request
-        resp = _req(
-            escaped_term,
-            num_results - start,
-            lang,
-            start,
-            proxies,
-            timeout,
-            safe,
-            ssl_verify,
-        )
+        resp = _req(escaped_term, num_results - start,
+                    lang, start, proxies, timeout, safe, ssl_verify)
 
         # Parse
         soup = BeautifulSoup(resp.text, "html.parser")
@@ -97,8 +74,8 @@ def search(
             break  # Stop if we have fetched the desired number of results
 
         if new_results == 0:
-            # If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
-            # print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
+            #If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
+            #print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
             break  # Break the loop if no new results were found in this iteration
 
         start += 10  # Prepare for the next set of results

From e7caf8b69eb54706368e2840acec1ec392c7af05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oskar=20S=C3=B6derbom?=
Date: Sun, 4 Aug 2024 19:50:48 +0200
Subject: [PATCH 3/5] updated README with info about start_num

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index f101888..529cc3a 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,14 @@
 from googlesearch import search
 search("Google", sleep_interval=5, num_results=200)
 ```
+
+Requesting more than 10 results, but want to manage the batching yourself?
+Use `start_num` to specify the start number of the results you want to get:
+```python
+from googlesearch import search
+search("Google", sleep_interval=5, num_results=200, start_num=10)
+```
+
 If you are using a HTTP Rotating Proxy which requires you to install their CA Certificate, you can simply add `ssl_verify=False` in the `search()` method to avoid SSL Verification.
 ```python
 from googlesearch import search

From 855525a341fe30197d5353c9fcb76ed36697006b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oskar=20S=C3=B6derbom?=
Date: Mon, 5 Aug 2024 21:14:48 +0200
Subject: [PATCH 4/5] Resolved merge conflicts and integrated changes from upstream

---
 README.md | 5 +++++
 setup.py  | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 529cc3a..310f2a7 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,11 @@
 from googlesearch import search
 search("Google", lang="fr")
 ```
+You can also specify the region ([Country Codes](https://developers.google.com/custom-search/docs/json_api_reference#countryCodes)) for your search results. For example, to get results specifically from the US run the following program:
+```python
+from googlesearch import search
+search("Google", region="us")
+```
 If you want to turn off the safe search function (this function is on by default), you can do this:
 ```python
 from googlesearch import search

diff --git a/setup.py b/setup.py
index 039adc9..1cb05c7 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 setup(
     name="googlesearch-python",
-    version="1.2.4",
+    version="1.2.5",
     author="Nishant Vikramaditya",
     author_email="junk4Nv7@gmail.com",
     description="A Python library for scraping the Google search engine.",

From 4e63229f97e85e07eae17bc84bc9219214f9dd8a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Oskar=20S=C3=B6derbom?=
Date: Mon, 5 Aug 2024 21:14:52 +0200
Subject: [PATCH 5/5] Resolved merge conflicts

---
 googlesearch/__init__.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 0144090..4ace792 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -5,7 +5,7 @@
 from .user_agents import get_useragent
 
 
-def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify):
+def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
     resp = get(
         url="https://www.google.com/search",
         headers={
@@ -17,6 +17,7 @@ def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify):
             "hl": lang,
             "start": start,
             "safe": safe,
+            "gl": region,
         },
         proxies=proxies,
         timeout=timeout,
@@ -35,9 +36,8 @@ def __init__(self, url, title, description):
     def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
 
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, start_num=0):
+def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0):
     """Search the Google search engine"""
-    escaped_term = term.replace(" ", "+")
 
     # Proxy setup
     proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
@@ -47,8 +47,8 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
 
     while fetched_results < num_results:
         # Send request
-        resp = _req(escaped_term, num_results - start,
-                    lang, start, proxies, timeout, safe, ssl_verify)
+        resp = _req(term, num_results - start,
+                    lang, start, proxies, timeout, safe, ssl_verify, region)
 
         # Parse
         soup = BeautifulSoup(resp.text, "html.parser")
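
Taken together, the series adds two keyword arguments to `search()`: `start_num` (PATCH 1-3), which seeds the internal `start` offset so callers can manage result batching themselves, and `region` (PATCH 4-5), which is forwarded to Google as the `gl` query parameter. Below is a minimal usage sketch, assuming the patched `googlesearch-python` build is installed and that `search()` still yields result URLs as in upstream:

```python
# Minimal sketch combining the two parameters added in this series.
# Assumes the patched googlesearch-python is installed; upstream search()
# is a generator yielding URLs (SearchResult objects when advanced=True).
from googlesearch import search

# Start at result offset 10, restrict results to the US region (gl=us),
# and sleep 5 seconds between the batched requests the library issues.
for url in search("Google", num_results=20, start_num=10, region="us", sleep_interval=5):
    print(url)
```

Note that the loop still runs while `fetched_results < num_results`, so `num_results` bounds the total number of results yielded regardless of `start_num`; pick it accordingly (the README example above pairs `num_results=200` with `start_num=10`).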