
Commit 9e3da6a

Update wiki2pdf.py for python 3
1 parent 1b40348 commit 9e3da6a

File tree

1 file changed: +158 -153 lines changed


Fox/wiki2pdf.py

Lines changed: 158 additions & 153 deletions
@@ -1,162 +1,168 @@
-import urllib
-import urlparse
+from urllib.request import urlopen, urlretrieve
+from urllib.parse import urlparse, urlunparse
 import re
 import os
 from time import sleep
 
+
 class MoinSpider:
-   def __init__(self,site='fox.vincefn.net',
-                exclude=["RecentChanges","action=",
-                         "FindPage","TitleIndex","WordIndex",
-                         "Help","template","Template","MoinMoin",
-                         "UserPreferences","WikiSandBox",
-                         "ScriptAlias","ScriptAlias"]):
-      self.u=urllib.URLopener()
-      #self.u.addheader(('USER_AGENT', 'Mozilla/4.0'))
-      self.base='href=\"/'
-      self.suffix="?action=print"
-      self.site=site
-      self.pages=[] # list of pairs [relative URL, page content]
-      self.d={} # dictionnary with keys=relative URL, value= short filename for the downloaded page
-      self.exclude=exclude
-      self.nbFail=0 # pages which failed to load
-      self.img=set()
-   def Weave(self, lnk='/Fox/FoxWiki',nbtry=3):
-      """ Download recursively all pages, starting from one relative URL.
-      """
-      if self.d.has_key(lnk): # we already got that page !
-         return
-      self.d[lnk]="wiki_%i.html"%(1000+len(self.d))
-      url="http://"+self.site+lnk+self.suffix #:TODO: use urlparse !
-      print()
-      print("Getting page: %s"%url)
-      print(" -> %s"%(self.d[lnk]))
-      nb=nbtry
-      cont=True
-      while(nb>0):
-         try:
-            p=self.u.open(url)
-            page=p.read()
-            nb=-1
-         except IOError:
-            nb-=1
-            print("IOError..... retry #%i"%(nbtry-nb))
-            sleep(1)
-      if nb==0:
-         print("Failed to load page, after %i trials:"%nbtry,lnk)
-         self.nbFail+=1
-         return
-      if re.search("This page does not exist yet",page)!=None:
-         print(" -> Page has not been written yet !")
-         self.d[lnk]="http://"+self.site+lnk # Link directly to site
-         return
-      self.pages.append([lnk,page])
-      for m in re.finditer(r"href\=\"(.*?)\"",page):
-         newlink=m.group()
-         if len(newlink)>=len(self.base):
-            if newlink[:len(self.base)]==self.base:
-               keep=True
-               for x in self.exclude:
-                  if re.search(x,newlink)!= None:
-                     keep=False
-                     break
-               if keep:
-                  #print(" ->%s"%newlink)
-                  newlink=newlink[6:-1]# [6:-1] -> exlude ' href=" ' and the end ' " '
-                  newlink=re.split('#',newlink)[0] # exclude anchors
-                  self.Weave(newlink)
-               #else:
-               #   print(" ->%s ? NO"%newlink)
+    def __init__(self, site='fox.vincefn.net',
+                 exclude=["RecentChanges", "action=",
+                          "FindPage", "TitleIndex", "WordIndex",
+                          "Help", "template", "Template", "MoinMoin",
+                          "UserPreferences", "WikiSandBox",
+                          "ScriptAlias", "ScriptAlias"]):
+        # self.u.addheader(('USER_AGENT', 'Mozilla/4.0'))
+        self.base = 'href=\"/'
+        self.suffix = "?action=print"
+        self.site = site
+        self.pages = [] # list of pairs [relative URL, page content]
+        self.d = {} # dictionnary with keys=relative URL, value= short filename for the downloaded page
+        self.exclude = exclude
+        self.nbFail = 0 # pages which failed to load
+        self.img = set()
 
-   def WeaveStatic(self, pagelist,nbtry=3):
-      """ Alternative to weave: download a pre-selected list of pages
-      """
-      for lnk in pagelist:
-         self.d[lnk]="wiki_%i.html"%(1000+len(self.d))
-         url="http://"+self.site+lnk+self.suffix #:TODO: use urlparse !
-         print("Getting page: %s -> %s"%(url,self.d[lnk]))
-         nb=nbtry
-         cont=True
-         while(nb>0):
+    def Weave(self, lnk='/Fox/FoxWiki', nbtry=3):
+        """ Download recursively all pages, starting from one relative URL.
+        """
+        if self.d.has_key(lnk): # we already got that page !
+            return
+        self.d[lnk] = "wiki_%i.html" % (1000 + len(self.d))
+        url = "http://" + self.site + lnk + self.suffix #:TODO: use urlparse !
+        print()
+        print("Getting page: %s" % url)
+        print(" -> %s" % (self.d[lnk]))
+        nb = nbtry
+        cont = True
+        while (nb > 0):
             try:
-               print(url)
-               p=self.u.open(url)
-               page=p.read()
-               nb=-1
+                p = urlopen(url)
+                page = p.read().decode('utf-8')
+                nb = -1
            except IOError:
-               nb-=1
-               print("IOError..... retry #%i"%(nbtry-nb))
-               sleep(1)
-         if nb==0:
-            print("Failed to load page, after %i trials:"%nbtry,lnk)
-         if re.search("This page does not exist yet",page)!=None:
-            print(" -> Page has not been written yet !")
-            self.d[lnk]="http://"+self.site+lnk # Link directly to site
-            nb=0
-         else:
-            self.pages.append([lnk,page])
+                nb -= 1
+                print("IOError..... retry #%i" % (nbtry - nb))
+                sleep(1)
+        if nb == 0:
+            print("Failed to load page, after %i trials:" % nbtry, lnk)
+            self.nbFail += 1
+            return
+        if re.search("This page does not exist yet", page) is not None:
+            print(" -> Page has not been written yet !")
+            self.d[lnk] = "http://" + self.site + lnk # Link directly to site
+            return
+        self.pages.append([lnk, page])
+        for m in re.finditer(r"href\=\"(.*?)\"", page):
+            newlink = m.group()
+            if len(newlink) >= len(self.base):
+                if newlink[:len(self.base)] == self.base:
+                    keep = True
+                    for x in self.exclude:
+                        if re.search(x, newlink) != None:
+                            keep = False
+                            break
+                    if keep:
+                        # print(" ->%s"%newlink)
+                        newlink = newlink[6:-1] # [6:-1] -> exlude ' href=" ' and the end ' " '
+                        newlink = re.split('#', newlink)[0] # exclude anchors
+                        self.Weave(newlink)
+                    # else:
+                    # print(" ->%s ? NO"%newlink)
+
+    def WeaveStatic(self, pagelist, nbtry=3):
+        """ Alternative to weave: download a pre-selected list of pages
+        """
+        for lnk in pagelist:
+            self.d[lnk] = "wiki_%i.html" % (1000 + len(self.d))
+            url = "http://" + self.site + lnk + self.suffix #:TODO: use urlparse !
+            print("Getting page: %s -> %s" % (url, self.d[lnk]))
+            nb = nbtry
+            cont = True
+            while (nb > 0):
+                try:
+                    print(url)
+                    p = urlopen(url)
+                    page = p.read().decode('utf-8')
+                    nb = -1
+                except IOError:
+                    nb -= 1
+                    print("IOError..... retry #%i" % (nbtry - nb))
+                    sleep(1)
+            if nb == 0:
+                print("Failed to load page, after %i trials:" % nbtry, lnk)
+            if re.search("This page does not exist yet", page) is not None:
+                print(" -> Page has not been written yet !")
+                self.d[lnk] = "http://" + self.site + lnk # Link directly to site
+                nb = 0
+            else:
+                self.pages.append([lnk, page])
+
+    def Pages2Html(self, d="wikihtml"):
+        # TODO : remove links to non-written pages
+        if not os.path.exists(d):
+            os.mkdir(d)
+        # this is necessary so that urls that contain other (smaller) urls
+        # are replaced first
+        ks = list(self.d.keys())
+        ks.sort(reverse=True)
+        for p in self.pages:
+            for m in re.finditer(r"img .*? src\=\"(.*?)\"", p[1]):
+                print(re.findall(r"src\=\"(.*?)\"", m.group()))
+                url = re.findall(r"src\=\"(.*?)\"", m.group())[0]
+                up = urlparse(url)
+                print(url)
+                up0, up1, up2, up3, up4, up5 = up[0], up[1], up[2], up[3], up[4], up[5]
+                if up4 != '':
+                    name = re.split('=', up4).pop()
+                else:
+                    name = re.split('/', up2).pop()
+                if name not in self.img: # download image once
+                    self.img.add(name)
+                    if up0 == '':
+                        up0 = 'http'
+                    if up1 == '':
+                        up1 = self.site
+                    urlimg = urlunparse((up0, up1, up2, up3, up4, up5)).replace('&amp;', '&')
+                    print(" %s -> %s" % (urlimg, name))
+                    nbTry = 3
+                    nb = nbTry
+                    while nb > 0:
+                        try:
+                            urlretrieve(urlimg, d + "/" + name)
+                            nb = -1
+                        except IOError:
+                            nb -= 1
+                            print("IOError..... retry #%i to get %s" % (nbTry - nb, name))
+                            sleep(1)
+                    if nb == 0:
+                        print("Failed to load image, after %i trials: %s" % (nbtry, name))
+                    else: # KLUDGE png->png cause htmldoc chokes on these...
+                        if name[-4:] == ".png":
+                            print("convert %s %s" % (d + "/" + name, d + "/" + name[:-3] + "jpg"))
+                            os.system("convert %s %s" % (d + "/" + name, d + "/" + name[:-3] + "jpg"))
+                            os.system("rm -f %s" % (d + "/" + name))
+                p[1] = p[1].replace(url, name)
+            for k in ks: # change to local url
+                if k != self.d[k]:
+                    p[1] = p[1].replace(k, self.d[k])
+                    # Change src field of img from "wiki_1002.html?action=AttachFile&do=get&target=toto.jpg"
+                    # to "toto.jpg"
+                    p[1] = p[1].replace("%s?action=AttachFile&do=get&target=" % k, "")
+            p[1] = p[1].replace(".png", ".jpg")
+            f = open(d + "/" + self.d[p[0]], 'w')
+            f.write(p[1])
 
-   def Pages2Html(self,d="wikihtml"):
-      #TODO : remove links to non-written pages
-      if not os.path.exists(d):
-         os.mkdir(d)
-      #this is necessary so that urls that contain other (smaller) urls
-      #are replaced first
-      ks=self.d.keys()
-      ks.sort(reverse=True)
-      for p in self.pages:
-         for m in re.finditer(r"img .*? src\=\"(.*?)\"",p[1]):
-            print(re.findall(r"src\=\"(.*?)\"",m.group()))
-            url=re.findall(r"src\=\"(.*?)\"",m.group())[0]
-            up=urlparse.urlparse(url)
-            print(url)
-            up0,up1,up2,up3,up4,up5=up[0],up[1],up[2],up[3],up[4],up[5]
-            if up4 != '':
-               name=re.split('=',up4).pop()
-            else:
-               name=re.split('/',up2).pop()
-            if name not in self.img:#download image once
-               self.img.add(name)
-               if up0=='':
-                  up0='http'
-               if up1=='':
-                  up1=self.site
-               urlimg=urlparse.urlunparse((up0,up1,up2,up3,up4,up5)).replace('&amp;','&')
-               print(" %s -> %s"%(urlimg,name))
-               nbTry=3
-               nb=nbTry
-               while nb>0:
-                  try:
-                     urllib.urlretrieve(urlimg,d+"/"+name)
-                     nb=-1
-                  except IOError:
-                     nb-=1
-                     print("IOError..... retry #%i to get %s"%(nbTry-nb,name))
-                     sleep(1)
-               if nb==0:
-                  print("Failed to load image, after %i trials: %s"%(nbtry,name))
-               else: # KLUDGE png->png cause htmldoc chokes on these...
-                  if name[-4:]==".png":
-                     print("convert %s %s"%(d+"/"+name,d+"/"+name[:-3]+"jpg"))
-                     os.system("convert %s %s"%(d+"/"+name,d+"/"+name[:-3]+"jpg"))
-                     os.system("rm -f %s"%(d+"/"+name))
-            p[1]=p[1].replace(url,name)
-         for k in ks:# change to local url
-            if k!=self.d[k]:
-               p[1]=p[1].replace(k,self.d[k])
-               # Change src field of img from "wiki_1002.html?action=AttachFile&do=get&target=toto.jpg" to "toto.jpg"
-               p[1]=p[1].replace("%s?action=AttachFile&do=get&target="%k,"")
-         p[1]=p[1].replace(".png",".jpg")
-         f=open(d+"/"+self.d[p[0]],'w')
-         f.write(p[1])
-   def Html2pdf(self,d="wikihtml"):
-      os.system("mogrify -resize '600x>' wikihtml/*.jpg")
-      #os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue -f wiki.pdf"%d)
-      os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue --size a4 --format pdf14 --links --book --toclevels 3 --left 1.5cm --right 1.5cm --top 1.5cm --bottom 1.5cm --footer Dc1 -f FoxManual.pdf"%d)
-      #os.system("rm -f wikihtml/*")
+    def Html2pdf(self, d="wikihtml"):
+        os.system("mogrify -resize '600x>' wikihtml/*.jpg")
+        # os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue -f wiki.pdf"%d)
+        os.system("htmldoc --jpeg=85 --webpage %s/*.html --linkcolor blue --size a4 --format pdf14 "
+                  "--links --book --toclevels 3 --left 1.5cm --right 1.5cm --top 1.5cm --bottom 1.5cm "
+                  "--footer Dc1 -f FoxManual.pdf" % d)
+        # os.system("rm -f wikihtml/*")
 
-#m=MoinSpider(site="objcryst.sourceforge.net")
-m=MoinSpider(site="fox.vincefn.net")
+
+# m=MoinSpider(site="objcryst.sourceforge.net")
+m = MoinSpider(site="fox.vincefn.net")
 
 m.WeaveStatic(["/FoxWiki",
                "/BiblioReferences",
@@ -185,11 +191,10 @@ def Html2pdf(self,d="wikihtml"):
                "/FoxCompile",
                "/Compile/Linux",
                "/Compile/MacOSX",
-               #"/Compile/Windows"
-               #"/BiblioStructures",
-               #"/VincentFavreNicolin"
+               # "/Compile/Windows"
+               # "/BiblioStructures",
+               # "/VincentFavreNicolin"
                ])
 
-
 m.Pages2Html()
 m.Html2pdf()
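
For reference, the heart of this port is the move from the Python 2 urllib / urlparse modules to urllib.request and urllib.parse, together with decoding the bytes that urlopen() now returns. Note also that dict.has_key() (still called in the updated Weave()) was removed in Python 3, where membership tests use the in operator. The sketch below is not part of the commit, only an illustration of that migration pattern; the URL is just the wiki front page already used by the script.

# Minimal sketch of the Python 2 -> 3 urllib changes this commit applies
# (illustrative only, not part of the diff above).
from urllib.request import urlopen, urlretrieve  # replaces urllib.URLopener / urllib.urlretrieve
from urllib.parse import urlparse, urlunparse    # replaces the urlparse module

url = "http://fox.vincefn.net/Fox/FoxWiki?action=print"

# urlopen() returns bytes in Python 3, so the body must be decoded
# before applying string regexes to it.
page = urlopen(url).read().decode('utf-8')

# urlparse()/urlunparse() keep the same 6-tuple interface; only the module moved.
parts = urlparse(url)
print(urlunparse(parts))

# dict.has_key() is gone in Python 3; membership tests use "in" instead.
d = {}
if url not in d:
    d[url] = "wiki_1000.html"
    # urlretrieve() still exists in urllib.request (as a legacy interface):
    # urlretrieve(url, d[url])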
