-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtest_scraping.py
105 lines (78 loc) · 4.46 KB
/
test_scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import requests
from html import escape
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
from lxml import html
# AppCenter public distribution page for the Telegram beta APK.
# NOTE(review): this page is rendered client-side (JS), which is why the
# selenium path in main() exists alongside the static-HTML old() attempt.
URL = 'https://install.appcenter.ms/users/drklo-2kb-ghpo/apps/telegram-beta-2/distribution_groups/all-users-of-telegram-beta-2'
def old():
    """Fetch the AppCenter page as static HTML and print the download link and version.

    Kept for reference: AppCenter renders its content client-side, so this
    static-HTML approach only yields results if the server still serves the
    legacy markup these absolute XPaths were recorded against.
    """
    response = requests.get(URL)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    tree = html.fromstring(response.content)
    # Absolute XPaths recorded from the browser inspector:
    #   download <a>: /html/body/div[1]/div[2]/div/div[1]/div/div[3]/div[6]/div[1]/a
    #   version h3:   /html/body/div[1]/div[2]/div/div[1]/div/div[3]/div[6]/h3
    download_url = tree.xpath('/html/body/div[1]/div[2]/div/div[1]/div/div[3]/div[6]/div[1]/a')
    if download_url:
        # values() lists the <a> element's attribute values; index 1 held the href.
        print(download_url[0].values()[1])
    else:
        # Guard: the xpath legitimately matches nothing when the layout changes.
        print('download link not found (page layout may have changed)')
    version = tree.xpath('/html/body/div[1]/div[2]/div/div[1]/div/div[3]/div[6]/h3')
    if version:
        print(version[0].text)
    else:
        print('version heading not found (page layout may have changed)')
def _make_driver(selected_webdriver):
    """Create a headless selenium driver for the given backend key.

    Keys: 'ff' -> Firefox (geckodriver), 'chr' -> Chrome (chromedriver),
    anything else -> PhantomJS (deprecated upstream, but it renders JS where
    headless Firefox did not work for this page).
    """
    if selected_webdriver == 'ff':
        print('FIREFOX DRIVER')
        # https://github.com/mozilla/geckodriver/releases
        # https://stackoverflow.com/a/40931903
        firefox_capabilities = DesiredCapabilities.FIREFOX
        firefox_capabilities['marionette'] = True
        # https://towardsdatascience.com/data-science-skills-web-scraping-javascript-using-python-97a29738353f
        # FIX: this Options() object was misleadingly named chrome_options.
        firefox_options = Options()
        firefox_options.headless = True
        # https://stackoverflow.com/a/42122284
        return webdriver.Firefox(
            executable_path=r'./browser_drivers/geckodriver_0.26.0.exe',
            capabilities=firefox_capabilities,
            firefox_options=firefox_options)
    if selected_webdriver == 'chr':
        # https://chromedriver.storage.googleapis.com/index.html
        print('CHROME DRIVER')
        # https://selenium-python.readthedocs.io/api.html#module-selenium.webdriver.chrome.webdriver
        chrome_options = webdriver.ChromeOptions()
        chrome_options.headless = True
        return webdriver.Chrome(
            executable_path='./browser_drivers/chromedriver_81.0.4044.69.exe',
            chrome_options=chrome_options)
    # phantomJS is deprecated, but it works (headless firefox doesn't)
    # http://phantomjs.org/download.html
    print('PHANTOMJS DRIVER')
    # https://towardsdatascience.com/data-science-skills-web-scraping-javascript-using-python-97a29738353f
    return webdriver.PhantomJS(executable_path='./browser_drivers/phantomjs_2.1.1.exe')


def main(selected_webdriver='pjs'):
    """Load the JS-rendered AppCenter page in a browser and print candidate version elements.

    selected_webdriver: 'ff' for Firefox, 'chr' for Chrome, anything else
    (default 'pjs') for PhantomJS.
    """
    # AppCenter requires JS enabled so we have to use selenium and simulate a browser request
    version_xpath_chromium = '//*[@id="app"]/div/div/div/div[2]/div[2]/div[2]/div/div[1]/div[1]/div[2]/div/div/div[1]/div[1]/text()[2]'
    version_class_name = '_1qjHBJYex _19ynaVUpx _634PiScnx _1vGSY6Fax l1LdMrFBx _74rbA585x _1id30tvgx'
    # BUG FIX: the old replace(' _', '._') left a bare space before 'l1LdMrFBx'
    # (and no leading dot on it), yielding an invalid selector. Join every
    # class with a leading dot instead.
    version_css_selector = '.' + '.'.join(version_class_name.split())
    driver = _make_driver(selected_webdriver)
    try:
        driver.get(URL)
        by_xpath = driver.find_elements_by_xpath(version_xpath_chromium)
        print('by xpath:\n', by_xpath)
        by_tag_name = driver.find_elements_by_tag_name('div')
        print('by tag name:\n', by_tag_name)
        try:
            print('\n'.join(dir(by_tag_name[0])))
            print([e.text for e in by_tag_name])
        except Exception as e:
            # Best-effort debug dump; by_tag_name may be empty.
            print('exception:', str(e))
        # BUG FIX: a multi-class dotted selector cannot be passed to
        # find_elements_by_class_name (that API takes one bare class name);
        # use a CSS selector lookup instead.
        by_class = driver.find_elements_by_css_selector(version_css_selector)
        print('by class:\n', by_class)
    finally:
        # BUG FIX: the original never quit the driver, leaking a headless
        # browser process on every run.
        driver.quit()
if __name__ == '__main__':
    # Runs with the default PhantomJS backend; pass 'ff' or 'chr' to main()
    # to try Firefox or Chrome instead.
    main()