-import urllib
-import urlparse
+from urllib.request import urlopen, urlretrieve
+from urllib.parse import urlparse, urlunparse
 import re
 import os
 from time import sleep
 
+
 class MoinSpider:
-    def __init__(self,site='fox.vincefn.net',
-                 exclude=["RecentChanges","action=",
-                          "FindPage","TitleIndex","WordIndex",
-                          "Help","template","Template","MoinMoin",
-                          "UserPreferences","WikiSandBox",
-                          "ScriptAlias","ScriptAlias"]):
-        self.u=urllib.URLopener()
-        #self.u.addheader(('USER_AGENT', 'Mozilla/4.0'))
-        self.base='href=\"/'
-        self.suffix="?action=print"
-        self.site=site
-        self.pages=[] # list of pairs [relative URL, page content]
-        self.d={} # dictionnary with keys=relative URL, value= short filename for the downloaded page
-        self.exclude=exclude
-        self.nbFail=0 # pages which failed to load
-        self.img=set()
-    def Weave(self, lnk='/Fox/FoxWiki',nbtry=3):
-        """ Download recursively all pages, starting from one relative URL.
-        """
-        if self.d.has_key(lnk): # we already got that page !
-            return
-        self.d[lnk]="wiki_%i.html"%(1000+len(self.d))
-        url="http://"+self.site+lnk+self.suffix #:TODO: use urlparse !
-        print()
-        print("Getting page: %s"%url)
-        print(" -> %s"%(self.d[lnk]))
-        nb=nbtry
-        cont=True
-        while(nb>0):
-            try:
-                p=self.u.open(url)
-                page=p.read()
-                nb=-1
-            except IOError:
-                nb-=1
-                print("IOError..... retry #%i"%(nbtry-nb))
-                sleep(1)
-        if nb==0:
-            print("Failed to load page, after %i trials:"%nbtry,lnk)
-            self.nbFail+=1
-            return
-        if re.search("This page does not exist yet",page)!=None:
-            print(" -> Page has not been written yet !")
-            self.d[lnk]="http://"+self.site+lnk # Link directly to site
-            return
-        self.pages.append([lnk,page])
-        for m in re.finditer(r"href\=\"(.*?)\"",page):
-            newlink=m.group()
-            if len(newlink)>=len(self.base):
-                if newlink[:len(self.base)]==self.base:
-                    keep=True
-                    for x in self.exclude:
-                        if re.search(x,newlink)!= None:
-                            keep=False
-                            break
-                    if keep:
-                        #print(" ->%s"%newlink)
-                        newlink=newlink[6:-1]# [6:-1] -> exlude ' href=" ' and the end ' " '
-                        newlink=re.split('#',newlink)[0] # exclude anchors
-                        self.Weave(newlink)
-                    #else:
-                    #    print(" ->%s ? NO"%newlink)
+    def __init__(self, site='fox.vincefn.net',
+                 exclude=["RecentChanges", "action=",
+                          "FindPage", "TitleIndex", "WordIndex",
+                          "Help", "template", "Template", "MoinMoin",
+                          "UserPreferences", "WikiSandBox",
+                          "ScriptAlias", "ScriptAlias"]):
+        # self.u.addheader(('USER_AGENT', 'Mozilla/4.0'))
+        self.base = 'href=\"/'
+        self.suffix = "?action=print"
+        self.site = site
+        self.pages = []  # list of pairs [relative URL, page content]
+        self.d = {}  # dictionary with keys=relative URL, value= short filename for the downloaded page
+        self.exclude = exclude
+        self.nbFail = 0  # pages which failed to load
+        self.img = set()
 
-    def WeaveStatic(self, pagelist,nbtry=3):
-        """ Alternative to weave: download a pre-selected list of pages
-        """
-        for lnk in pagelist:
-            self.d[lnk]="wiki_%i.html"%(1000+len(self.d))
-            url="http://"+self.site+lnk+self.suffix #:TODO: use urlparse !
-            print("Getting page: %s -> %s"%(url,self.d[lnk]))
-            nb=nbtry
-            cont=True
-            while(nb>0):
+    def Weave(self, lnk='/Fox/FoxWiki', nbtry=3):
+        """ Download recursively all pages, starting from one relative URL.
+        """
+        if lnk in self.d:  # we already got that page !
+            return
+        self.d[lnk] = "wiki_%i.html" % (1000 + len(self.d))
+        url = "http://" + self.site + lnk + self.suffix  #:TODO: use urlparse !
+        print()
+        print("Getting page: %s" % url)
+        print(" -> %s" % (self.d[lnk]))
+        nb = nbtry
+        cont = True
+        while (nb > 0):
             try:
-                print(url)
-                p=self.u.open(url)
-                page=p.read()
-                nb=-1
+                p = urlopen(url)
+                page = p.read().decode('utf-8')
+                nb = -1
             except IOError:
-                nb-=1
-                print("IOError..... retry #%i"%(nbtry-nb))
-                sleep(1)
-            if nb==0:
-                print("Failed to load page, after %i trials:"%nbtry,lnk)
-            if re.search("This page does not exist yet",page)!=None:
-                print(" -> Page has not been written yet !")
-                self.d[lnk]="http://"+self.site+lnk # Link directly to site
-                nb=0
-            else:
-                self.pages.append([lnk,page])
+                nb -= 1
+                print("IOError..... retry #%i" % (nbtry - nb))
+                sleep(1)
+        if nb == 0:
+            print("Failed to load page, after %i trials:" % nbtry, lnk)
+            self.nbFail += 1
+            return
+        if re.search("This page does not exist yet", page) is not None:
+            print(" -> Page has not been written yet !")
+            self.d[lnk] = "http://" + self.site + lnk  # Link directly to site
+            return
+        self.pages.append([lnk, page])
+        for m in re.finditer(r"href\=\"(.*?)\"", page):
+            newlink = m.group()
+            if len(newlink) >= len(self.base):
+                if newlink[:len(self.base)] == self.base:
+                    keep = True
+                    for x in self.exclude:
+                        if re.search(x, newlink) != None:
+                            keep = False
+                            break
+                    if keep:
+                        # print(" ->%s"%newlink)
+                        newlink = newlink[6:-1]  # [6:-1] -> exclude ' href=" ' and the end ' " '
+                        newlink = re.split('#', newlink)[0]  # exclude anchors
+                        self.Weave(newlink)
+                    # else:
+                    #     print(" ->%s ? NO"%newlink)
+
+    def WeaveStatic(self, pagelist, nbtry=3):
+        """ Alternative to weave: download a pre-selected list of pages
+        """
+        for lnk in pagelist:
+            self.d[lnk] = "wiki_%i.html" % (1000 + len(self.d))
+            url = "http://" + self.site + lnk + self.suffix  #:TODO: use urlparse !
+            print("Getting page: %s -> %s" % (url, self.d[lnk]))
+            nb = nbtry
+            cont = True
+            while (nb > 0):
+                try:
+                    print(url)
+                    p = urlopen(url)
+                    page = p.read().decode('utf-8')
+                    nb = -1
+                except IOError:
+                    nb -= 1
+                    print("IOError..... retry #%i" % (nbtry - nb))
+                    sleep(1)
+            if nb == 0:
+                print("Failed to load page, after %i trials:" % nbtry, lnk)
+            if re.search("This page does not exist yet", page) is not None:
+                print(" -> Page has not been written yet !")
+                self.d[lnk] = "http://" + self.site + lnk  # Link directly to site
+                nb = 0
+            else:
+                self.pages.append([lnk, page])
+
+    def Pages2Html(self, d="wikihtml"):
+        # TODO : remove links to non-written pages
+        if not os.path.exists(d):
+            os.mkdir(d)
+        # this is necessary so that urls that contain other (smaller) urls
+        # are replaced first
+        ks = list(self.d.keys())
+        ks.sort(reverse=True)
+        for p in self.pages:
+            for m in re.finditer(r"img .*? src\=\"(.*?)\"", p[1]):
+                print(re.findall(r"src\=\"(.*?)\"", m.group()))
+                url = re.findall(r"src\=\"(.*?)\"", m.group())[0]
+                up = urlparse(url)
+                print(url)
+                up0, up1, up2, up3, up4, up5 = up[0], up[1], up[2], up[3], up[4], up[5]
+                if up4 != '':
+                    name = re.split('=', up4).pop()
+                else:
+                    name = re.split('/', up2).pop()
+                if name not in self.img:  # download image once
+                    self.img.add(name)
+                    if up0 == '':
+                        up0 = 'http'
+                    if up1 == '':
+                        up1 = self.site
+                    urlimg = urlunparse((up0, up1, up2, up3, up4, up5)).replace('&amp;', '&')
+                    print("    %s -> %s" % (urlimg, name))
+                    nbTry = 3
+                    nb = nbTry
+                    while nb > 0:
+                        try:
+                            urlretrieve(urlimg, d + "/" + name)
+                            nb = -1
+                        except IOError:
+                            nb -= 1
+                            print("IOError..... retry #%i to get %s" % (nbTry - nb, name))
+                            sleep(1)
+                    if nb == 0:
| 138 | + print("Failed to load image, after %i trials: %s" % (nbtry, name)) |
+                    else:  # KLUDGE png->jpg because htmldoc chokes on these...
+                        if name[-4:] == ".png":
+                            print("convert %s %s" % (d + "/" + name, d + "/" + name[:-3] + "jpg"))
+                            os.system("convert %s %s" % (d + "/" + name, d + "/" + name[:-3] + "jpg"))
+                            os.system("rm -f %s" % (d + "/" + name))
+                p[1] = p[1].replace(url, name)
+            for k in ks:  # change to local url
+                if k != self.d[k]:
+                    p[1] = p[1].replace(k, self.d[k])
+                # Change src field of img from "wiki_1002.html?action=AttachFile&do=get&target=toto.jpg"
+                # to "toto.jpg"
+                p[1] = p[1].replace("%s?action=AttachFile&do=get&target=" % k, "")
+            p[1] = p[1].replace(".png", ".jpg")
+            f = open(d + "/" + self.d[p[0]], 'w')
+            f.write(p[1])
 
-    def Pages2Html(self,d="wikihtml"):
-        #TODO : remove links to non-written pages
-        if not os.path.exists(d):
-            os.mkdir(d)
-        #this is necessary so that urls that contain other (smaller) urls
-        #are replaced first
-        ks=self.d.keys()
-        ks.sort(reverse=True)
-        for p in self.pages:
-            for m in re.finditer(r"img .*? src\=\"(.*?)\"",p[1]):
-                print(re.findall(r"src\=\"(.*?)\"",m.group()))
-                url=re.findall(r"src\=\"(.*?)\"",m.group())[0]
-                up=urlparse.urlparse(url)
-                print(url)
-                up0,up1,up2,up3,up4,up5=up[0],up[1],up[2],up[3],up[4],up[5]
-                if up4 != '':
-                    name=re.split('=',up4).pop()
-                else:
-                    name=re.split('/',up2).pop()
-                if name not in self.img:#download image once
-                    self.img.add(name)
-                    if up0=='':
-                        up0='http'
-                    if up1=='':
-                        up1=self.site
-                    urlimg=urlparse.urlunparse((up0,up1,up2,up3,up4,up5)).replace('&amp;','&')
-                    print("    %s -> %s"%(urlimg,name))
-                    nbTry=3
-                    nb=nbTry
-                    while nb>0:
-                        try:
-                            urllib.urlretrieve(urlimg,d+"/"+name)
-                            nb=-1
-                        except IOError:
-                            nb-=1
-                            print("IOError..... retry #%i to get %s"%(nbTry-nb,name))
-                            sleep(1)
-                    if nb==0:
-                        print("Failed to load image, after %i trials: %s"%(nbtry,name))
-                    else: # KLUDGE png->png cause htmldoc chokes on these...
-                        if name[-4:]==".png":
-                            print("convert %s %s"%(d+"/"+name,d+"/"+name[:-3]+"jpg"))
-                            os.system("convert %s %s"%(d+"/"+name,d+"/"+name[:-3]+"jpg"))
-                            os.system("rm -f %s"%(d+"/"+name))
-                p[1]=p[1].replace(url,name)
-            for k in ks:# change to local url
-                if k!=self.d[k]:
-                    p[1]=p[1].replace(k,self.d[k])
-                # Change src field of img from "wiki_1002.html?action=AttachFile&do=get&target=toto.jpg" to "toto.jpg"
-                p[1]=p[1].replace("%s?action=AttachFile&do=get&target="%k,"")
-            p[1]=p[1].replace(".png",".jpg")
-            f=open(d+"/"+self.d[p[0]],'w')
-            f.write(p[1])
-    def Html2pdf(self,d="wikihtml"):
-        os.system("mogrify -resize '600x>' wikihtml/*.jpg")
-        #os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue -f wiki.pdf"%d)
-        os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue --size a4 --format pdf14 --links --book --toclevels 3 --left 1.5cm --right 1.5cm --top 1.5cm --bottom 1.5cm --footer Dc1 -f FoxManual.pdf"%d)
-        #os.system("rm -f wikihtml/*")
+    def Html2pdf(self, d="wikihtml"):
+        os.system("mogrify -resize '600x>' wikihtml/*.jpg")
+        # os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue -f wiki.pdf"%d)
+        os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue --size a4 --format pdf14 "
+                  "--links --book --toclevels 3 --left 1.5cm --right 1.5cm --top 1.5cm --bottom 1.5cm "
+                  "--footer Dc1 -f FoxManual.pdf" % d)
+        # os.system("rm -f wikihtml/*")
 
-#m=MoinSpider(site="objcryst.sourceforge.net")
-m=MoinSpider(site="fox.vincefn.net")
+
+# m=MoinSpider(site="objcryst.sourceforge.net")
+m = MoinSpider(site="fox.vincefn.net")
 
 m.WeaveStatic(["/FoxWiki",
                "/BiblioReferences",
@@ -185,11 +191,10 @@ def Html2pdf(self,d="wikihtml"):
                "/FoxCompile",
                "/Compile/Linux",
                "/Compile/MacOSX",
-               #"/Compile/Windows"
-               #"/BiblioStructures",
-               #"/VincentFavreNicolin"
+               # "/Compile/Windows"
+               # "/BiblioStructures",
+               # "/VincentFavreNicolin"
                ])
 
-
 m.Pages2Html()
 m.Html2pdf()
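
Note on the port: the core of this change is moving from Python 2's urllib.URLopener().open() and urllib.urlretrieve to urllib.request.urlopen / urlretrieve, plus decoding the response bytes to str. A minimal standalone sketch of the fetch-with-retry pattern the new code uses is shown below; the helper name fetch and the direct call at the end are illustrative only, not part of the script, which wraps the same logic inside Weave and WeaveStatic.

from time import sleep
from urllib.request import urlopen


def fetch(url, nbtry=3):
    """Return the decoded page body, or None after nbtry failed attempts."""
    for attempt in range(1, nbtry + 1):
        try:
            # urlopen() replaces the Python 2 urllib.URLopener().open() call;
            # read() returns bytes in Python 3, hence the explicit decode.
            return urlopen(url).read().decode('utf-8')
        except IOError:
            print("IOError..... retry #%i" % attempt)
            sleep(1)
    return None


# Illustrative usage: the script builds its URLs the same way, from site + link + "?action=print".
page = fetch("http://fox.vincefn.net/Fox/FoxWiki?action=print")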