Commit 201fcac: many bug fix

Signed-off-by: wwqgtxx <[email protected]>
wwqgtxx committed Jun 15, 2016
1 parent 104faac
Showing 6 changed files with 51 additions and 33 deletions.
16 changes: 12 additions & 4 deletions wwqLyParse/common.py
@@ -4,15 +4,19 @@
 import urllib.request,io,os,sys,json,re,gzip,time
 
 urlcache = {}
+URLCACHE_MAX = 1000
 def getUrl(oUrl, encoding = 'utf-8' , headers = {}, data = None, method = None) :
+    global urlcache
+    global URLCACHE_MAX
+    urlcache_temp = urlcache
     url_json = {"oUrl":oUrl,"encoding":encoding,"headers":headers,"data":data,"method":method}
     url_json = json.dumps(url_json,sort_keys=True, ensure_ascii=False)
-    if url_json in urlcache:
-        item = urlcache[url_json]
+    if url_json in urlcache_temp:
+        item = urlcache_temp[url_json]
         html_text = item["html_text"]
         item["lasttimestap"] = int(time.time())
         print("cache get:"+url_json)
-        if (len(urlcache)>100):
+        if (len(urlcache_temp)>URLCACHE_MAX):
             cleanUrlcache()
         return html_text
     print("normal get:"+url_json)
@@ -34,8 +38,12 @@ def getUrl(oUrl, encoding = 'utf-8' , headers = {}, data = None, method = None)
     return html_text
 
 def cleanUrlcache():
+    global urlcache
+    global URLCACHE_MAX
     sortedDict = sorted(urlcache.items(), key=lambda d: d[1]["lasttimestap"], reverse=True)
-    newDict = sortedDict[:100] # take entries from index start through end-1
+    newDict = {}
+    for (k, v) in sortedDict[:int(URLCACHE_MAX - URLCACHE_MAX/10)]: # take entries from index start through end-1
+        newDict[k] = v
     urlcache = newDict
     print("urlcache has been cleaned")
 
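The net effect in common.py: getUrl's cache now holds up to URLCACHE_MAX entries, each stamped with its last access time, and cleanUrlcache keeps only the most recently used 90%. It also fixes a latent bug: the old code assigned the sorted list of (key, value) pairs straight back to urlcache instead of rebuilding a dict. A minimal standalone sketch of the eviction policy, assuming the same 10% ratio (evict_oldest and cache_get are hypothetical names; the timestamp key keeps the diff's spelling):

import time

URLCACHE_MAX = 1000

def evict_oldest(cache, max_size=URLCACHE_MAX):
    """Keep the most recently used ~90% of entries, drop the rest."""
    by_recency = sorted(cache.items(),
                        key=lambda kv: kv[1]["lasttimestap"],  # spelling as in the diff
                        reverse=True)
    keep = int(max_size - max_size / 10)  # 900 entries when max_size is 1000
    return dict(by_recency[:keep])

def cache_get(cache, key):
    """Return a cached item and refresh its recency stamp, or None on a miss."""
    item = cache.get(key)
    if item is not None:
        item["lasttimestap"] = int(time.time())
    return item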
2 changes: 1 addition & 1 deletion wwqLyParse/main.py
@@ -37,7 +37,7 @@
 version = {
     'port_version' : "0.5.0",
     'type' : 'parse',
-    'version' : '0.2.3',
+    'version' : '0.2.4',
     'uuid' : '{C35B9DFC-559F-49E2-B80B-79B66EC77471}',
     'filter' : [],
     'name' : 'WWQ猎影解析插件',
12 changes: 10 additions & 2 deletions wwqLyParse/parsers/anypageparser.py
@@ -23,6 +23,7 @@ class AnyPageParser(common.Parser):
     TWICE_PARSE_TIMEOUT = 5
 
     def Parse(self,input_text,types=None):
+        global TWICE_PARSE_TIMEOUT
         if (types is not None) and ("collection" not in types):
             return
         html = PyQuery(common.getUrl(input_text))
@@ -52,7 +53,7 @@ def Parse(self,input_text,types=None):
                 url = 'direct:'+url
             if not re.match('(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)',url):
                 continue
-            if re.search('(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',url):
+            if re.search('[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',url):
                 continue
             if re.search('(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))',no):
                 continue
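The added [^\?] requires some character other than '?' immediately before a blacklist keyword, apparently so that a keyword appearing only after a query '?' no longer disqualifies the URL. A quick illustration with an abbreviated pattern (the URLs are made up):

import re

BLACKLIST = r'[^\?](list|mall|vip)'  # shortened form of the pattern in the diff

print(re.search(BLACKLIST, 'http://example.com/list/1.html'))     # matches: '/' precedes 'list'
print(re.search(BLACKLIST, 'http://example.com/play?list=2018'))  # None: 'list' only follows '?'

Note that [^\?] consumes a character, so a keyword at position 0 could never match either; that is harmless here since every tested URL starts with a scheme.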
@@ -114,7 +115,7 @@ def runlist_parser(queue,parser,url):
         for parser_thread in parser_threads:
             parser_thread.start()
         for parser_thread in parser_threads:
-            parser_thread.join(TWICE_PARSE_TIMEOUT)
+            parser_thread.join(self.TWICE_PARSE_TIMEOUT)
         while not q_results.empty():
             t_results.append(q_results.get())
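TWICE_PARSE_TIMEOUT is a class attribute, so the global TWICE_PARSE_TIMEOUT declaration added in the first hunk cannot make it visible inside the method; reading it through self is what actually resolves the name. The surrounding fan-out pattern, sketched standalone (fan_out and parse are hypothetical names; the queue drain mirrors the diff):

import queue
import threading

def fan_out(urls, parse, timeout=5):
    """Run parse(url) in one thread per URL; keep only results ready in time."""
    results = queue.Queue()
    threads = [threading.Thread(target=lambda u=u: results.put(parse(u)), daemon=True)
               for u in urls]
    for t in threads:
        t.start()
    for t in threads:
        t.join(timeout)  # per-thread cap, like self.TWICE_PARSE_TIMEOUT
    collected = []
    while not results.empty():  # threads still running simply leave no result behind
        collected.append(results.get())
    return collected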

@@ -129,6 +130,13 @@ def runlist_parser(queue,parser,url):
             if ddata["url"] not in parse_urls:
                 #print(ddata["url"])
                 data["data"].append(ddata)
+        oldddata = data["data"]
+        data["data"] = []
+        parsed_urls = []
+        for ddata in oldddata:
+            if ddata["url"] not in parsed_urls:
+                data["data"].append(ddata)
+                parsed_urls.append(ddata["url"])
         data["total"] = len(data["data"])
         data["caption"] = "全页地址列表"
         return data
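The added block is a second de-duplication pass: it rebuilds data["data"] keeping only the first entry per URL, in input order. List membership is a linear scan, so the same pass with a set is O(n) instead of O(n^2); a sketch (dedupe_by_url is a hypothetical helper):

def dedupe_by_url(items):
    """Keep the first item seen for each URL, preserving input order."""
    seen = set()  # O(1) membership instead of scanning a list each time
    unique = []
    for item in items:
        if item["url"] not in seen:
            unique.append(item)
            seen.add(item["url"])
    return unique

# data["data"] = dedupe_by_url(data["data"])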
4 changes: 2 additions & 2 deletions wwqLyParse/parsers/listparser.py
@@ -241,7 +241,7 @@ def run(queue,get_list_info,html_text):
         except Exception as e:
             #import traceback
             #traceback.print_exc()
-            print(e)
+            print(str(get_list_info)+str(e))
     html_text = common.getUrl(input_text)
     html = PyQuery(html_text)
     title = html('h1.main_title').children('a').text()
@@ -276,7 +276,7 @@ def run(queue,get_list_info,html_text):
         except Exception as e:
             #import traceback
             #traceback.print_exc()
-            print(e)
+            print(str(get_list_info_html)+e)
 
     data["total"] = len(data["data"])

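Both hunks tag the failing parser in the message instead of printing the bare exception. The second hunk concatenates a str with the exception object itself, which raises a TypeError in Python 3 when the handler fires, so the first hunk's str(e) form is the safe one. A sketch of the same idea, re-enabling the traceback the diff keeps commented out (the stand-in parser is hypothetical):

import traceback

def get_list_info(html_text):  # stand-in for the real parser callable
    raise ValueError("unexpected page layout")

try:
    info = get_list_info("<html></html>")
except Exception as e:
    print(str(get_list_info) + str(e))  # name the failing parser, then the error
    traceback.print_exc()               # the full stack says far more than the message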
43 changes: 22 additions & 21 deletions wwqLyParse/parsers/lyppvparser.py
@@ -35,27 +35,28 @@ def Parse(self,url,types=None):
             return []
         print("call lyp_pv.run.Parse("+url+")")
         out = run.Parse(url)
-        for data in out['data']:
-            data['label'] = re.compile('\(\d\)\s*').sub('',str(data['label']))
-            parts = data['label'].split('_')
-            num = int(parts[0])
-            if num == -3:
-                parts[0] = "0"
-            elif num == -1:
-                parts[0] = "1"
-            elif num == 0:
-                parts[0] = "2"
-            elif num == 2:
-                parts[0] = "3"
-            elif num == 4:
-                parts[0] = "4"
-            elif num == 7:
-                parts[0] = "5"
-            data['label']=('_').join(parts)
-            data['label'] = data['label'] + ("@lyppv")
-        out["caption"]= "负锐解析"
-        out.pop("icon")
-        out.pop("warning")
+        if "data" in out:
+            for data in out['data']:
+                data['label'] = re.compile('\(\d\)\s*').sub('',str(data['label']))
+                parts = data['label'].split('_')
+                num = int(parts[0])
+                if num == -3:
+                    parts[0] = "0"
+                elif num == -1:
+                    parts[0] = "1"
+                elif num == 0:
+                    parts[0] = "2"
+                elif num == 2:
+                    parts[0] = "3"
+                elif num == 4:
+                    parts[0] = "4"
+                elif num == 7:
+                    parts[0] = "5"
+                data['label']=('_').join(parts)
+                data['label'] = data['label'] + ("@lyppv")
+            out["caption"]= "负锐解析"
+            out.pop("icon")
+            out.pop("warning")
         return out
 
     def ParseURL(self,url,label,min=None,max=None):
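The new guard handles run.Parse results that come back without a "data" key, which previously raised a KeyError. The elif ladder itself is a fixed lookup from the raw leading number to a display rank, so a table-driven version is equivalent; a sketch (relabel and RANK are hypothetical names):

import re

RANK = {-3: "0", -1: "1", 0: "2", 2: "3", 4: "4", 7: "5"}

def relabel(label):
    """Strip a '(d) ' prefix, remap the leading number, tag the source parser."""
    label = re.sub(r'\(\d\)\s*', '', str(label))
    parts = label.split('_')
    parts[0] = RANK.get(int(parts[0]), parts[0])  # unknown numbers pass through
    return '_'.join(parts) + '@lyppv'

# relabel('(1) 2_flv_high')  ->  '3_flv_high@lyppv'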
7 changes: 4 additions & 3 deletions wwqLyParse/run.py
@@ -120,7 +120,7 @@ def main():
     #debug(Parse('http://www.iqiyi.com/lib/m_209445514.html?src=search'))
     #debug(Parse('http://www.iqiyi.com/a_19rrhacdwt.html#vfrm=2-4-0-1'))
     #debug(Parse('http://www.iqiyi.com/a_19rrhaare5.html'))
-    debug(Parse('http://www.iqiyi.com/a_19rrhbhf6d.html#vfrm=2-3-0-1'))
+    #debug(Parse('http://www.iqiyi.com/a_19rrhbhf6d.html#vfrm=2-3-0-1'))
     #debug(Parse('http://www.le.com'))
     #debug(Parse('http://www.letv.com/comic/10010294.html'))
     #debug(Parse('http://www.mgtv.com/v/1/1/'))
@@ -132,6 +132,7 @@ def main():
     #debug(Parse('http://v.qq.com/tv/'))
     #debug(Parse('http://www.pptv.com/'))
     #debug(Parse('http://yyfm.xyz/video/album/1300046802.html'))
+    debug(Parse('http://www.iqiyi.com/playlist392712002.html',"collection"))
     #debug(Parse('http://list.iqiyi.com/www/2/----------------iqiyi--.html'))
     #debug(Parse('http://www.iqiyi.com/a_19rrhb8fjp.html',"list"))
     #debug(Parse('http://www.iqiyi.com/v_19rrl8pmn8.html#vfrm=2-3-0-1'))
@@ -143,8 +144,8 @@ def main():
     #debug(Parse('http://v.pptv.com/show/NWR29Yzj2hh7ibWE.html?rcc_src=S1'))
     #debug(Parse('http://www.bilibili.com/video/av2557971/')) #don't support
     #debug(Parse('http://v.baidu.com/link?url=dm_10tBNoD-LLAMb79CB_p0kxozuoJcW0SiN3eycdo6CdO3GZgQm26uOzZh9fqcNSWZmz9aU9YYCCfT0NmZoGfEMoznyHhz3st-QvlOeyArYdIbhzBbdIrmntA4h1HsSampAs4Z3c17r_exztVgUuHZqChPeZZQ4tlmM5&page=tvplaydetail&vfm=bdvtx&frp=v.baidu.com%2Ftv_intro%2F&bl=jp_video',"formats"))
-    debug(Parse('http://www.hunantv.com/v/1/291976/c/3137384.html'))
-    #debug(ParseURL('http://www.mgtv.com/v/1/291976/c/3137384.html',"1",parsers = [mvtvparser.MgTVParser()]))
+    #debug(Parse('http://www.hunantv.com/v/1/291976/c/3137384.html'))
+    #debug(ParseURL('http://www.mgtv.com/v/1/291976/c/3137384.html',"1"))
 
 
 if __name__ == '__main__':
