-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprices.py
More file actions
74 lines (61 loc) · 3.61 KB
/
prices.py
File metadata and controls
74 lines (61 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from scrapy import Spider
from scrapy.http import Request
from scrapy.crawler import CrawlerProcess
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from time import sleep
from urllib.parse import urljoin
import sqlite3
class PricesSpider(Spider):
name = 'prices'
allowed_domains = ["mall.industry.siemens.com"]
def start_requests(self):
# 'login_url' is a page where spiders stars and logs in to get prices
# 'absolute_part_url' is a generic url to access particular part web page,
# a part number is added at the end of the url to do that
login_url = 'http://mall.industry.siemens.com/regpublic/Login.aspx?regionkey=GB&lang=en&app=MALL&ret=https' \
'%3a%2f%2fmall.industry.siemens.com%2fgoos%2fWelcomePage.aspx%3fregionUrl%3d%252fuk&login=&pwd= '
absolute_part_url = 'https://mall.industry.siemens.com/mall/en/uk/Catalog/Product/'
# Chrome web driver opens login page(login_url), inserts a login and a password and then clicks the login button
self.driver = webdriver.Chrome()
self.driver.get(login_url)
self.driver.find_element_by_id("ContentPlaceHolder1_TextSiemensLogin").send_keys('USER_LOGIN')
self.driver.find_element_by_id("ContentPlaceHolder1_TextPassword").send_keys('USER_PASSWORD')
self.driver.find_element_by_id("ContentPlaceHolder1_LoginUserNamePasswordButton").click()
# 'sleep' have to be there, Selenium works too slow and code executes faster than Selenium refreshes
# Seems like a typical problem as Selenium is not really crated for scrapping
sleep(3)
# SQLite connects to the DB
con = sqlite3.connect('test.db')
with con:
cur = con.cursor() # Setting up the cursor
cur.execute("SELECT Part_no FROM Parts") # Getting parts number from the part table
parts = cur.fetchall() # Creates list out of
# Iterating through the parts number in the table
for part in parts:
# Creating a part specific url by adding the a number and the end of the absolute url
# Variable 'part' is a tuple with a part number at [0] position
part_url = urljoin(absolute_part_url, part[0])
self.driver.get(part_url)
try:
# Webdiver gets prices form a part web pages
# and then removes EUR symbols and thousands separators
price = self.driver.find_element_by_id("CustomerPriceCell").text.replace('EUR', '').replace('.', '')
list_price = self.driver.find_element_by_id("ListPriceCell").text.replace('EUR', '').replace('.', '')
# Updates the table's row with new prices
cur.execute("UPDATE Parts SET Our_price=?, List_price=? WHERE Part_no=?",
(price, list_price, part[0]))
con.commit()
# When a part number is wrong or there is no such part, 'error' will be inserted to the table
except NoSuchElementException:
cur.execute("UPDATE Parts SET Our_price=?, List_price=? WHERE Part_no=?",
('error', 'error', part[0]))
con.commit()
yield Request(part_url, callback=self.parse_price)
def parse_price(self, response):
pass
# Pieces to use spider as a script
if __name__ == "__main__":
process = CrawlerProcess()
process.crawl(PricesSpider)
process.start() # The script will block here until the crawling is finished