Crawler.py
'''
Basic web crawler.
Jordi Walter Hoock Castro.
'''
import random
import re
import urllib.request as url
class Crawler:
    """
    Crawler class.
    """
    def __init__(self, page):
        """
        -Var page: The page to start crawling from.
        """
        # Instance attributes; the original defined these as mutable class
        # attributes, which would be shared by every Crawler instance.
        self.page = page
        self.sites = []
        self.validUrls = []
        self.visitedSites = []
        self.actualSite = 0
        self.logLevel = 0
        c = 0
        while True:
            self.parseSiteText()
            self.parseUrls()
            self.checkUrls()
            self.changeSite()
            print("Iteration: %i, Site: %s" % (c, self.page))
            c += 1
    def parseSiteText(self):
        """
        Download the site and keep its text.
        """
        # The original read from `site` even when urlopen() failed and then
        # restarted the crawler via __init__(); instead, pick another site
        # and retry the download.
        try:
            site = url.urlopen(self.page)
        except IOError:
            self.changeSite()
            self.parseSiteText()
            return
        self.page = site.read().decode("utf-8", errors="replace")
    def parseUrls(self):
        """
        Find all URLs in the site and store them as (href, link text) tuples.
        """
        # Pattern adapted from a Stack Overflow answer.
        self.sites = re.findall(r"<a.*?\s*href=\"(.*?)\".*?>(.*?)</a>", self.page)
    def checkUrls(self):
        """
        Check the sanity of the URLs and keep only the valid ones.
        """
        # Pattern adapted from a Stack Overflow answer; a raw string plus
        # re.IGNORECASE replaces the invalid escape sequences and duplicated
        # upper-case TLDs of the original.
        checkPattern = r'^https?://[a-zA-Z0-9.-]+\.(com|org|net|mil|edu|es)([/a-zA-Z0-9]+?\.?(htm|php|asp|html)?)?$'
        for link in self.sites:
            if re.search(checkPattern, link[0], re.IGNORECASE):
                self.validUrls.append(link[0])
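
    # Illustrative examples of what the sanity pattern above accepts
    # (assumed examples, not from the original source):
    #   "http://example.com"           -> kept
    #   "https://sub.example.org/page" -> kept
    #   "mailto:user@example.com"      -> dropped (not http/https)
    #   "/relative/path.html"          -> dropped (no scheme or host)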
    def changeSite(self):
        """
        Pick a random, not yet visited site from our list of valid URLs.
        """
        # Guard against an empty or exhausted list, which would make the
        # random draw below fail or loop forever.
        if len(self.visitedSites) >= len(self.validUrls):
            raise RuntimeError("No unvisited URLs left to crawl.")
        x = random.randint(0, len(self.validUrls) - 1)
        while x in self.visitedSites:
            x = random.randint(0, len(self.validUrls) - 1)
        self.actualSite = x
        self.visitedSites.append(x)
        self.page = self.validUrls[x]
    def l(self, level):
        """
        Log what is happening.
        -Var level:
            1: Print all.
            2: Print target site.
            3: Write sites to an external file [APPEND].
        """
        if level == 1:
            pass  # Not implemented in the original.
        elif level == 2:
            print(self.page)
        else:
            # Use a context manager so the log file is always closed.
            with open('log', 'a') as f:
                f.write("\n" + self.page)
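

# A minimal usage sketch, not part of the original file: the start URL is an
# illustrative placeholder, and constructing the Crawler starts the crawl
# loop immediately (it runs until no unvisited valid URLs remain or the
# process is interrupted).
if __name__ == "__main__":
    Crawler("http://example.com")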