diff --git a/wwqLyParse/common.py b/wwqLyParse/common.py
index 6945435..50578f4 100644
--- a/wwqLyParse/common.py
+++ b/wwqLyParse/common.py
@@ -4,15 +4,19 @@ import urllib.request,io,os,sys,json,re,gzip,time
 
 urlcache = {}
+URLCACHE_MAX = 1000
 
 def getUrl(oUrl, encoding = 'utf-8' , headers = {}, data = None, method = None) :
+    global urlcache
+    global URLCACHE_MAX
+    urlcache_temp = urlcache
     url_json = {"oUrl":oUrl,"encoding":encoding,"headers":headers,"data":data,"method":method}
     url_json = json.dumps(url_json,sort_keys=True, ensure_ascii=False)
-    if url_json in urlcache:
-        item = urlcache[url_json]
+    if url_json in urlcache_temp:
+        item = urlcache_temp[url_json]
         html_text = item["html_text"]
         item["lasttimestap"] = int(time.time())
         print("cache get:"+url_json)
-        if (len(urlcache)>100):
+        if (len(urlcache_temp)>URLCACHE_MAX):
            cleanUrlcache()
         return html_text
     print("normal get:"+url_json)
@@ -34,8 +38,12 @@ def getUrl(oUrl, encoding = 'utf-8' , headers = {}, data = None, method = None)
     return html_text
 
 def cleanUrlcache():
+    global urlcache
+    global URLCACHE_MAX
     sortedDict = sorted(urlcache.items(), key=lambda d: d[1]["lasttimestap"], reverse=True)
-    newDict = sortedDict[:100] # 从数组中取索引start开始到end-1的记录
+    newDict = {}
+    for (k, v) in sortedDict[:int(URLCACHE_MAX - URLCACHE_MAX/10)]: # keep the newest entries, dropping the oldest tenth
+        newDict[k] = v
     urlcache = newDict
     print("urlcache has been cleaned")
 
diff --git a/wwqLyParse/main.py b/wwqLyParse/main.py
index dc836fe..5b874e9 100644
--- a/wwqLyParse/main.py
+++ b/wwqLyParse/main.py
@@ -37,7 +37,7 @@
 version = {
     'port_version' : "0.5.0",
     'type' : 'parse',
-    'version' : '0.2.3',
+    'version' : '0.2.4',
     'uuid' : '{C35B9DFC-559F-49E2-B80B-79B66EC77471}',
     'filter' : [],
     'name' : 'WWQ猎影解析插件',
diff --git a/wwqLyParse/parsers/anypageparser.py b/wwqLyParse/parsers/anypageparser.py
index cfc60ca..4c6b174 100644
--- a/wwqLyParse/parsers/anypageparser.py
+++ b/wwqLyParse/parsers/anypageparser.py
@@ -23,6 +23,7 @@ class AnyPageParser(common.Parser):
     TWICE_PARSE_TIMEOUT = 5
 
     def Parse(self,input_text,types=None):
+        global TWICE_PARSE_TIMEOUT
         if (types is not None) and ("collection" not in types):
             return
         html = PyQuery(common.getUrl(input_text))
@@ -52,7 +53,7 @@ def Parse(self,input_text,types=None):
                 url = 'direct:'+url
             if not re.match('(^(http|https)://.+\.(shtml|html|mp4|mkv|ts|avi))|(^(http|https)://.+/video/)',url):
                 continue
-            if re.search('(list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',url):
+            if re.search('[^\?](list|mall|about|help|shop|map|vip|faq|support|download|copyright|contract|product|tencent|upload|common|index.html|v.qq.com/u/|open.baidu.com|www.iqiyi.com/lib/s_|www.iqiyi.com/dv/|top.iqiyi.com)',url):
                 continue
             if re.search('(下载|播 放|播放|投诉|评论|(\d{1,2}:\d{1,2}))',no):
                 continue
@@ -114,7 +115,7 @@ def runlist_parser(queue,parser,url):
         for parser_thread in parser_threads:
             parser_thread.start()
         for parser_thread in parser_threads:
-            parser_thread.join(TWICE_PARSE_TIMEOUT)
+            parser_thread.join(self.TWICE_PARSE_TIMEOUT)
         while not q_results.empty():
             t_results.append(q_results.get())
 
@@ -129,6 +130,13 @@ def runlist_parser(queue,parser,url):
             if ddata["url"] not in parse_urls:
                 #print(ddata["url"])
                 data["data"].append(ddata)
+        oldddata = data["data"]
+        data["data"] = []
+        parsed_urls = []
+        for ddata in oldddata:
+            if ddata["url"] not in parsed_urls:
+                data["data"].append(ddata)
+                parsed_urls.append(ddata["url"])
         data["total"] = len(data["data"])
len(data["data"]) data["caption"] = "全页地址列表" return data diff --git a/wwqLyParse/parsers/listparser.py b/wwqLyParse/parsers/listparser.py index 7be4ab8..7b63743 100644 --- a/wwqLyParse/parsers/listparser.py +++ b/wwqLyParse/parsers/listparser.py @@ -241,7 +241,7 @@ def run(queue,get_list_info,html_text): except Exception as e: #import traceback #traceback.print_exc() - print(e) + print(str(get_list_info)+str(e)) html_text = common.getUrl(input_text) html = PyQuery(html_text) title = html('h1.main_title').children('a').text() @@ -276,7 +276,7 @@ def run(queue,get_list_info,html_text): except Exception as e: #import traceback #traceback.print_exc() - print(e) + print(str(get_list_info_html)+e) data["total"] = len(data["data"]) diff --git a/wwqLyParse/parsers/lyppvparser.py b/wwqLyParse/parsers/lyppvparser.py index 3cf46b2..4bcb86e 100644 --- a/wwqLyParse/parsers/lyppvparser.py +++ b/wwqLyParse/parsers/lyppvparser.py @@ -35,27 +35,28 @@ def Parse(self,url,types=None): return [] print("call lyp_pv.run.Parse("+url+")") out = run.Parse(url) - for data in out['data']: - data['label'] = re.compile('\(\d\)\s*').sub('',str(data['label'])) - parts = data['label'].split('_') - num = int(parts[0]) - if num == -3: - parts[0] = "0" - elif num == -1: - parts[0] = "1" - elif num == 0: - parts[0] = "2" - elif num == 2: - parts[0] = "3" - elif num == 4: - parts[0] = "4" - elif num == 7: - parts[0] = "5" - data['label']=('_').join(parts) - data['label'] = data['label'] + ("@lyppv") - out["caption"]= "负锐解析" - out.pop("icon") - out.pop("warning") + if "data" in out: + for data in out['data']: + data['label'] = re.compile('\(\d\)\s*').sub('',str(data['label'])) + parts = data['label'].split('_') + num = int(parts[0]) + if num == -3: + parts[0] = "0" + elif num == -1: + parts[0] = "1" + elif num == 0: + parts[0] = "2" + elif num == 2: + parts[0] = "3" + elif num == 4: + parts[0] = "4" + elif num == 7: + parts[0] = "5" + data['label']=('_').join(parts) + data['label'] = data['label'] + ("@lyppv") + out["caption"]= "负锐解析" + out.pop("icon") + out.pop("warning") return out def ParseURL(self,url,label,min=None,max=None): diff --git a/wwqLyParse/run.py b/wwqLyParse/run.py index 759f2d6..46a8fc3 100644 --- a/wwqLyParse/run.py +++ b/wwqLyParse/run.py @@ -120,7 +120,7 @@ def main(): #debug(Parse('http://www.iqiyi.com/lib/m_209445514.html?src=search')) #debug(Parse('http://www.iqiyi.com/a_19rrhacdwt.html#vfrm=2-4-0-1')) #debug(Parse('http://www.iqiyi.com/a_19rrhaare5.html')) - debug(Parse('http://www.iqiyi.com/a_19rrhbhf6d.html#vfrm=2-3-0-1')) + #debug(Parse('http://www.iqiyi.com/a_19rrhbhf6d.html#vfrm=2-3-0-1')) #debug(Parse('http://www.le.com')) #debug(Parse('http://www.letv.com/comic/10010294.html')) #debug(Parse('http://www.mgtv.com/v/1/1/')) @@ -132,6 +132,7 @@ def main(): #debug(Parse('http://v.qq.com/tv/')) #debug(Parse('http://www.pptv.com/')) #debug(Parse('http://yyfm.xyz/video/album/1300046802.html')) + debug(Parse('http://www.iqiyi.com/playlist392712002.html',"collection")) #debug(Parse('http://list.iqiyi.com/www/2/----------------iqiyi--.html')) #debug(Parse('http://www.iqiyi.com/a_19rrhb8fjp.html',"list")) #debug(Parse('http://www.iqiyi.com/v_19rrl8pmn8.html#vfrm=2-3-0-1')) @@ -143,8 +144,8 @@ def main(): #debug(Parse('http://v.pptv.com/show/NWR29Yzj2hh7ibWE.html?rcc_src=S1')) #debug(Parse('http://www.bilibili.com/video/av2557971/')) #don't support 
     #debug(Parse('http://v.baidu.com/link?url=dm_10tBNoD-LLAMb79CB_p0kxozuoJcW0SiN3eycdo6CdO3GZgQm26uOzZh9fqcNSWZmz9aU9YYCCfT0NmZoGfEMoznyHhz3st-QvlOeyArYdIbhzBbdIrmntA4h1HsSampAs4Z3c17r_exztVgUuHZqChPeZZQ4tlmM5&page=tvplaydetail&vfm=bdvtx&frp=v.baidu.com%2Ftv_intro%2F&bl=jp_video',"formats"))
-    debug(Parse('http://www.hunantv.com/v/1/291976/c/3137384.html'))
-    #debug(ParseURL('http://www.mgtv.com/v/1/291976/c/3137384.html',"1",parsers = [mvtvparser.MgTVParser()]))
+    #debug(Parse('http://www.hunantv.com/v/1/291976/c/3137384.html'))
+    #debug(ParseURL('http://www.mgtv.com/v/1/291976/c/3137384.html',"1"))
 
 
 if __name__ == '__main__':
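
Review notes:

common.py: before this patch, cleanUrlcache rebound urlcache without a global declaration, so the assignment only created a local name and the cache never actually shrank; sortedDict[:100] was also a list of key/value pairs rather than a dict. The rewritten loop fixes both. A minimal standalone sketch of the same eviction policy, keeping the most recently used 90% of entries when the cache overflows (clean_cache and its free-standing form are hypothetical; the "lasttimestap" field name mirrors the patch):

    URLCACHE_MAX = 1000

    def clean_cache(cache):
        # Rank entries newest-first by their last-access timestamp...
        ranked = sorted(cache.items(),
                        key=lambda kv: kv[1]["lasttimestap"],
                        reverse=True)
        # ...then keep the newest URLCACHE_MAX - URLCACHE_MAX/10 entries.
        return dict(ranked[:int(URLCACHE_MAX - URLCACHE_MAX / 10)])

Passing the sliced pair list to dict() collapses the patch's explicit loop into one step with identical behavior. Note that "global URLCACHE_MAX" is only needed when a function assigns to the name; reading a module-level constant requires no declaration.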
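anypageparser.py: TWICE_PARSE_TIMEOUT is a class attribute of AnyPageParser, so the bare name in parser_thread.join(TWICE_PARSE_TIMEOUT) could not be resolved from the surrounding scope; self.TWICE_PARSE_TIMEOUT looks it up through the instance, which also leaves the added "global TWICE_PARSE_TIMEOUT" declaration without effect. The new pass over data["data"] drops duplicate URLs while keeping first-seen order. A sketch of the same transform using a set for membership tests, which avoids the O(n) list lookup the patch performs per item; dedup_by_url is a hypothetical name, and items are assumed to be dicts carrying a "url" key as in the patch:

    def dedup_by_url(items):
        # Keep the first item seen for each URL, preserving input order.
        seen = set()
        result = []
        for ddata in items:
            if ddata["url"] not in seen:
                seen.add(ddata["url"])
                result.append(ddata)
        return result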
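lyppvparser.py: guarding the relabeling behind "data" in out means a result without a "data" key is now returned untouched instead of raising KeyError (out.pop("icon", None) would likewise tolerate a missing key, if that can occur). The if/elif ladder maps lyp_pv's numeric label prefixes (-3, -1, 0, 2, 4, 7) onto this plugin's "0" through "5" ordering, leaving unknown prefixes unchanged. A table-driven sketch of the same transform, with relabel as a hypothetical helper:

    import re

    # Prefix values taken from the if/elif chain in the patch; anything
    # else falls through unchanged, matching the original ladder.
    PREFIX_MAP = {-3: "0", -1: "1", 0: "2", 2: "3", 4: "4", 7: "5"}

    def relabel(label):
        # Strip a leading "(n) " marker, remap the numeric prefix, tag the source.
        label = re.sub(r'\(\d\)\s*', '', str(label))
        parts = label.split('_')
        parts[0] = PREFIX_MAP.get(int(parts[0]), parts[0])
        return '_'.join(parts) + "@lyppv"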