
Commit 084bfd4

Merge pull request #59 from opsdisk/ghdb_scraper-versioning
Fixed trailing tabs and added versioning to ghdb_scraper.py
2 parents 4a85d62 + 90efc65 commit 084bfd4

File tree

1 file changed: 12 additions, 3 deletions


ghdb_scraper.py

Lines changed: 12 additions & 3 deletions
@@ -10,6 +10,9 @@
 
 # Custom Python libraries.
 
+
+__version__ = "1.0.0"
+
 """
 Dork dictionary example:
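
As a quick aside, a hedged sketch of how the new module-level version could be read from other code. It assumes ghdb_scraper.py is importable from the current directory and that its CLI logic only runs when the script is executed directly; neither assumption is shown in this diff.

    # Assumes ghdb_scraper.py is on the import path and does not execute its
    # argparse/CLI code at import time.
    import ghdb_scraper

    print(ghdb_scraper.__version__)  # 1.0.0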
@@ -85,7 +88,8 @@ def retrieve_google_dorks(
 # Extract dork from <a href> using BeautifulSoup.
 # "<a href=\"/ghdb/5052\">inurl:_cpanel/forgotpwd</a>"
 soup = BeautifulSoup(dork["url_title"], "html.parser")
-extracted_dork = soup.find("a").contents[0]
+# Some of the URL titles have trailing tabs, remove them.
+extracted_dork = soup.find("a").contents[0].strip()
 extracted_dorks.append(extracted_dork)
 
 # For individual categories.
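
To see what this strip() fix handles, here is a small illustrative snippet. The sample url_title string echoes the example quoted later in this commit; only the BeautifulSoup calls mirror the code above.

    from bs4 import BeautifulSoup

    # Sample url_title with a trailing tab inside the anchor text, similar to
    # the problematic GHDB entries mentioned in the diff.
    url_title = '<a href="/ghdb/2696">"Powered by Rock Band CMS 0.10"\t</a>'

    soup = BeautifulSoup(url_title, "html.parser")
    raw = soup.find("a").contents[0]   # '"Powered by Rock Band CMS 0.10"\t' (tab survives extraction)
    clean = raw.strip()                # '"Powered by Rock Band CMS 0.10"'   (tab stripped)

    print(repr(raw))
    print(repr(clean))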
@@ -98,6 +102,10 @@ def retrieve_google_dorks(
 
 category_dict[numeric_category_id] = {"category_name": category_name, "dorks": []}
 
+# Some of the URL titles have trailing tabs, use replace() to remove it in place. The strip() method cannot be
+# used because the tab is not at the end of the string, but between the <a> tags instead:
+# <a href="/ghdb/2696">"Powered by Rock Band CMS 0.10" </a>
+dork["url_title"] = dork["url_title"].replace("\t", "")
 category_dict[numeric_category_id]["dorks"].append(dork)
 
 # If requested, break up dorks into individual files based off category.
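
A minimal sketch of the behavior the comment above describes: str.strip() cannot reach a tab sandwiched between the anchor text and the closing tag, while str.replace() removes it wherever it sits. Plain Python, no project code assumed.

    # Raw url_title as stored in the dork entry: the tab sits before </a>,
    # i.e. in the middle of the string, so strip() finds nothing at the ends.
    url_title = '<a href="/ghdb/2696">"Powered by Rock Band CMS 0.10"\t</a>'

    print("\t" in url_title.strip())            # True, strip() leaves the inner tab alone
    print("\t" in url_title.replace("\t", ""))  # False, replace() removes it everywhere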
@@ -121,7 +129,8 @@ def retrieve_google_dorks(
 # Extract dork from <a href> using BeautifulSoup.
 # "<a href=\"/ghdb/5052\">inurl:_cpanel/forgotpwd</a>"
 soup = BeautifulSoup(dork["url_title"], "html.parser")
-extracted_dork = soup.find("a").contents[0]
+# Some of the URL titles have trailing tabs, remove them.
+extracted_dork = soup.find("a").contents[0].strip()
 fh.write(f"{extracted_dork}\n")
 
 # Save GHDB json object to all_google_dorks.json.
@@ -177,7 +186,7 @@ def retrieve_google_dorks(
 parser = argparse.ArgumentParser(
 formatter_class=argparse.RawDescriptionHelpFormatter,
 description=(
-"GHDB Scraper - Retrieve the Google Hacking Database dorks from "
+f"GHDB Scraper v{__version__} - Retrieve Google Hacking Database dorks from "
 "https://www.exploit-db.com/google-hacking-database."
 ),
 epilog=epilog,
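
For illustration, a self-contained sketch of how the interpolated version shows up in the help text. It reuses the ArgumentParser keywords visible in the hunk above but omits the epilog, whose value is not part of this diff.

    import argparse

    __version__ = "1.0.0"

    # Stand-alone reproduction of the pattern above: the module version is
    # interpolated into the parser description via an f-string.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=(
            f"GHDB Scraper v{__version__} - Retrieve Google Hacking Database dorks from "
            "https://www.exploit-db.com/google-hacking-database."
        ),
    )

    parser.print_help()  # description line reads: GHDB Scraper v1.0.0 - Retrieve ...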
