diff --git a/performers/MatureFetishPerformer.py b/performers/MatureFetishPerformer.py
new file mode 100644
index 00000000..a898f034
--- /dev/null
+++ b/performers/MatureFetishPerformer.py
@@ -0,0 +1,49 @@
+from tpdb.BasePerformerScraper import BasePerformerScraper
+from tpdb.items import PerformerItem
+
+
+class SiteMatureFetishPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '/en/models/%s?sort=&q=&sex=female',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'MatureFetishPerformer'
+
+    start_urls = [
+        'https://maturefetish.com',
+    ]
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[@class="grid-tile-model"]')
+        for performer in performers:
+            item = PerformerItem()
+
+            item['name'] = self.cleanup_title(performer.xpath('./div/a/text()').get())
+            image = performer.xpath('.//img/@data-src')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            else:
+                item['image'] = ""
+                item['image_blob'] = ""
+            item['bio'] = ''
+            item['gender'] = 'Female'
+            item['astrology'] = ''
+            item['birthday'] = ''
+            item['birthplace'] = ''
+            item['cupsize'] = ''
+            item['ethnicity'] = ''
+            item['eyecolor'] = ''
+            item['fakeboobs'] = ''
+            item['haircolor'] = ''
+            item['height'] = ''
+            item['measurements'] = ''
+            item['nationality'] = ''
+            item['piercings'] = ''
+            item['tattoos'] = ''
+            item['weight'] = ''
+            item['network'] = 'Mature NL'
+            item['url'] = self.format_link(response, performer.xpath('./a/@href').get())
+
+            yield item
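Review note: every listing-only spider in this diff (this one, siteAsianSexDiaryPerformer.py, siteLadyFyrePerformer.py, the MilfCandy pair, and others below) hand-fills a dozen empty PerformerItem fields. siteIFeelMyselfPerformer.py below already calls self.init_performer(), which suggests the base scraper may have a helper for this; if it does not, a minimal sketch (helper name hypothetical):

```python
from tpdb.items import PerformerItem

EMPTY_FIELDS = ('bio', 'astrology', 'birthday', 'birthplace', 'cupsize',
                'ethnicity', 'eyecolor', 'fakeboobs', 'haircolor', 'height',
                'measurements', 'nationality', 'piercings', 'tattoos', 'weight')

def blank_performer(network, gender='Female'):
    # Pre-populate every optional field with '' so each spider only has to
    # set the values it actually scrapes.
    item = PerformerItem()
    for field in EMPTY_FIELDS:
        item[field] = ''
    item['gender'] = gender
    item['network'] = network
    return item
```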
diff --git a/performers/javJAVCTPerformer.py b/performers/javJAVCTPerformer.py
new file mode 100644
index 00000000..89565fd7
--- /dev/null
+++ b/performers/javJAVCTPerformer.py
@@ -0,0 +1,89 @@
+import re
+import string
+import scrapy
+
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class JavJAVCTPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'name': '//li[contains(@class, "breadcrumb__item--active")]/text()',
+        'image': '//div[@class="stats"]/img/@data-src',
+        'image_blob': True,
+        'birthday': '//div[@class="stats"]/ul/li[contains(./text(), "Born:")]/text()',
+        're_birthday': r': (.*)',
+        'height': '//div[@class="stats"]/ul/li[contains(./text(), "Height:")]/text()',
+        're_height': r': (.*)',
+        'measurements': 'InCode',
+
+        'pagination': '/models/pg-%s',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'JAVJAVCTPerformer'
+    network = 'R18'
+
+    start_urls = [
+        'https://javct.net',
+    ]
+
+    def get_gender(self, response):
+        return 'Female'
+
+    def get_performers(self, response):
+        performers = response.xpath('//h3[contains(@class,"card__title")]/a/@href').getall()
+        for performer in performers:
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers)
+
+    def get_measurements(self, response):
+        bust = response.xpath('//div[@class="stats"]/ul/li[contains(./text(), "Breast:")]/text()')
+        waist = response.xpath('//div[@class="stats"]/ul/li[contains(./text(), "Waist:")]/text()')
+        hips = response.xpath('//div[@class="stats"]/ul/li[contains(./text(), "Hips:")]/text()')
+
+        if bust and waist and hips:
+            bust = re.search(r'(\d+)', bust.get())
+            if bust:
+                bust = bust.group(1)
+            waist = re.search(r'(\d+)', waist.get())
+            if waist:
+                waist = waist.group(1)
+            hips = re.search(r'(\d+)', hips.get())
+            if hips:
+                hips = hips.group(1)
+            if bust and waist and hips:
+                # Site lists sizes in centimetres; convert to inches
+                if bust:
+                    bust = round(int(bust) / 2.54)
+                if waist:
+                    waist = round(int(waist) / 2.54)
+                if hips:
+                    hips = round(int(hips) / 2.54)
+
+        if bust and waist and hips:
+            measurements = str(bust) + "-" + str(waist) + "-" + str(hips)
+            return measurements.strip()
+        return ''
+
+    def get_cupsize(self, response):
+        bust = response.xpath('//div[@class="stats"]/ul/li[contains(./text(), "Breast:")]/text()')
+        if bust:
+            bust = re.search(r'(\d+)', bust.get())
+        if bust:
+            bust = bust.group(1)
+        if bust:
+            bust = round(int(bust) / 2.54)
+        if bust:
+            return str(bust)
+        return ''
+
+    def get_name(self, response):
+        name = super().get_name(response)
+        if "(" in name:
+            name = re.search(r'(.*?)\(', name).group(1)
+        return string.capwords(name.strip())
+
+    def get_height(self, response):
+        height = super().get_height(response)
+        if height:
+            return height.replace(" ", "")
+        return ''
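Review note: get_measurements and get_cupsize above are repeated almost verbatim in javSEXTBPerformerPlaywright.py below; both sites list sizes in centimetres and the spiders convert to inches. A shared helper could collapse the duplicated regex/convert/guard chains; a sketch (helper names hypothetical):

```python
import re

def cm_text_to_inches(text):
    # Pull the first integer out of e.g. 'Breast: 88cm' and convert cm -> inches.
    match = re.search(r'(\d+)', text or '')
    return round(int(match.group(1)) / 2.54) if match else None

def build_measurements(bust_text, waist_text, hips_text):
    parts = [cm_text_to_inches(t) for t in (bust_text, waist_text, hips_text)]
    if all(p is not None for p in parts):
        return '-'.join(str(p) for p in parts)
    return ''

# build_measurements('Breast: 88cm', 'Waist: 59cm', 'Hips: 86cm') -> '35-23-34'
```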
diff --git a/performers/javSEXTBPerformerPlaywright.py b/performers/javSEXTBPerformerPlaywright.py
new file mode 100644
index 00000000..5965e4ac
--- /dev/null
+++ b/performers/javSEXTBPerformerPlaywright.py
@@ -0,0 +1,132 @@
+import re
+import string
+import scrapy
+
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class JavSEXTBPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'name': '//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-user")]/following-sibling::span/a/text()',
+        'image': '//section[@class="tray all"]/div/img/@data-src',
+        'image_blob': True,
+        'birthday': '//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-birthday")]/following-sibling::span/text()',
+        'height': '//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-arrows-alt") and contains(following-sibling::text(), "Height")]/following-sibling::span/text()',
+        'measurements': 'InCode',
+
+        'pagination': '/list-actress/pg-%s',
+        'external_id': r'model/(.*)/'
+    }
+
+    custom_scraper_settings = {
+        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',
+        'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
+        'AUTOTHROTTLE_ENABLED': True,
+        'USE_PROXY': False,
+        'AUTOTHROTTLE_START_DELAY': 1,
+        'AUTOTHROTTLE_MAX_DELAY': 60,
+        'CONCURRENT_REQUESTS': 1,
+        'DOWNLOAD_DELAY': 2,
+        'DOWNLOADER_MIDDLEWARES': {
+            # 'tpdb.helpers.scrapy_flare.FlareMiddleware': 542,
+            'tpdb.middlewares.TpdbSceneDownloaderMiddleware': 543,
+            'tpdb.custommiddlewares.CustomProxyMiddleware': 350,
+            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
+            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
+            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
+            'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
+        },
+        'DOWNLOAD_HANDLERS': {
+            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+        }
+    }
+
+    name = 'JAVSEXTBPerformerPlaywright'
+    network = 'R18'
+
+    start_urls = [
+        'https://sextb.net',
+    ]
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+        meta['playwright'] = True
+
+        for link in self.start_urls:
+            yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def parse(self, response, **kwargs):
+        performers = self.get_performers(response)
+        count = 0
+        for performer in performers:
+            count += 1
+            yield performer
+
+        if 'page' in response.meta and response.meta['page'] < self.limit_pages:
+            meta = response.meta
+            meta['page'] = meta['page'] + 1
+            print('NEXT PAGE: ' + str(meta['page']))
+            yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta)
+
+    def get_gender(self, response):
+        return 'Female'
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[@class="tray-item-actress"]/a/@href').getall()
+        for performer in performers:
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer)
+
+    def get_measurements(self, response):
+        bust = response.xpath('//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-arrows-alt") and contains(following-sibling::text(), "Breast")]/following-sibling::span/text()')
+        waist = response.xpath('//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-arrows-alt") and contains(following-sibling::text(), "Waist")]/following-sibling::span/text()')
+        hips = response.xpath('//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-arrows-alt") and contains(following-sibling::text(), "Hips")]/following-sibling::span/text()')
+
+        if bust and waist and hips:
+            bust = re.search(r'(\d+)', bust.get())
+            if bust:
+                bust = bust.group(1)
+            waist = re.search(r'(\d+)', waist.get())
+            if waist:
+                waist = waist.group(1)
+            hips = re.search(r'(\d+)', hips.get())
+            if hips:
+                hips = hips.group(1)
+            if bust and waist and hips:
+                # Site lists sizes in centimetres; convert to inches
+                if bust:
+                    bust = round(int(bust) / 2.54)
+                if waist:
+                    waist = round(int(waist) / 2.54)
+                if hips:
+                    hips = round(int(hips) / 2.54)
+
+        if bust and waist and hips:
+            measurements = str(bust) + "-" + str(waist) + "-" + str(hips)
+            return measurements.strip()
+        return ''
+
+    def get_cupsize(self, response):
+        bust = response.xpath('//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-arrows-alt") and contains(following-sibling::text(), "Breast")]/following-sibling::span/text()')
+        if bust:
+            bust = re.search(r'(\d+)', bust.get())
+        if bust:
+            bust = bust.group(1)
+        if bust:
+            bust = round(int(bust) / 2.54)
+        if bust:
+            return str(bust)
+        return ''
+
+    def get_name(self, response):
+        name = super().get_name(response)
+        if "(" in name:
+            name = re.search(r'(.*?)\(', name).group(1)
+        return string.capwords(name.strip())
+
+    def get_height(self, response):
+        height = super().get_height(response)
+        if height:
+            return height.replace(" ", "")
+        return ''
diff --git a/performers/networkAdultEmpireCashPerformer.py b/performers/networkAdultEmpireCashPerformer.py
index 0ab9986d..997f4a5e 100644
--- a/performers/networkAdultEmpireCashPerformer.py
+++ b/performers/networkAdultEmpireCashPerformer.py
@@ -11,7 +11,7 @@ def match_path(argument):
         'www.filthykings.com': "/filthy-kings-porn-stars.html?page=%s&hybridview=member",
         'thirdworldxxx.com': "/third-world-media-porn-stars.html?page=%s&hybridview=member",
         'latinoguysporn.com': "/latino-guys-porn-stars.html?page=%s&hybridview=member",
-        'www.lethalhardcore.com': "/lethal-hardcore-porn-stars.html?page=%s&hybridview=member",
+        'www.lethalhardcore.com': "/porn-stars.html?sort=ag_added&page=%s&hybridview=member",
         'www.wcpclub.com': "/west-coast-productions-porn-stars.html?page=%s&hybridview=member",
     }
     return match.get(argument, "")
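Review note: the lethalhardcore.com entry above swaps a site-specific listing path for the generic sorted one. For reference, the match_path templates are plain %s-format strings, so page substitution is just:

```python
template = "/porn-stars.html?sort=ag_added&page=%s&hybridview=member"
print(template % 2)  # /porn-stars.html?sort=ag_added&page=2&hybridview=member
```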
diff --git a/performers/networkAdulttimeAPIPerformers.py b/performers/networkAdulttimeAPIPerformers.py
index 4e5c7c03..e40982c7 100644
--- a/performers/networkAdulttimeAPIPerformers.py
+++ b/performers/networkAdulttimeAPIPerformers.py
@@ -12,6 +12,14 @@ class AdultTimeAPISpiderPerformers(BasePerformerScraper):
 
     start_urls = [
         'https://www.21sextreme.com',
+        'https://www.devilsfilm.com',
+        'https://www.eroticax.com',
+        'https://www.falconstudios.com',
+        'https://www.genderxfilms.com',
+        'https://www.girlfriendsfilms.com',
+        'https://www.ragingstallion.com',
+        'https://www.whiteghetto.com',
+        'https://www.zerotolerancefilms.com',
     ]
 
     image_sizes = [
@@ -83,7 +91,7 @@ def get_scenes(self, response):
 
             item['gender'] = string.capwords(performer['gender'])
             if item['gender'] == 'Shemale':
-                item['gender'] = 'Trans'
+                item['gender'] = 'Transgender Female'
             item['birthday'] = ''
             item['astrology'] = ''
             item['birthplace'] = ''
@@ -110,21 +118,22 @@ def get_scenes(self, response):
             else:
                 item['eyecolor'] = ''
 
-            if 'weight' in performer['attributes']:
+            if 'weight' in performer['attributes'] and performer['attributes']['weight']:
                 item['weight'] = performer['attributes']['weight']
+                if int(float(item['weight'])) > 70:
+                    item['weight'] = str(int(int(float(item['weight'])) * .453592)) + "kg"
             else:
                 item['weight'] = ''
 
-            if 'height' in performer['attributes']:
-                item['height'] = performer['attributes']['height']
+            if 'height' in performer['attributes'] and performer['attributes']['height']:
+                item['height'] = self.conv_height(performer['attributes']['height'])
             else:
                 item['height'] = ''
 
-            if 'endowment' in performer['attributes']:
-                item['height'] = performer['attributes']['endowment']
-            else:
-                item['fakeboobs'] = ''
-
+            # ~ if 'endowment' in performer['attributes']:
+            # ~     item['height'] = performer['attributes']['endowment']
+            # ~ else:
+            # ~     item['fakeboobs'] = ''
             yield item
@@ -134,8 +143,33 @@ def call_algolia(self, page, token, referrer):
             'Content-Type': 'application/json',
             'Referer': self.get_next_page_url(referrer, page)
         }
-        # ~ jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=50&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3A%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&facets=%5B%5D&tagFilters=&facetFilters=%5B%5B%22content_tags%3Afemale%22%5D%2C%5B%22availableOnSite%3A%22%5D%5D"}]}'
-        jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=84&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Atwentyonesextury%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&facets=%5B%22availableOnSite%22%5D&tagFilters=&facetFilters=%5B%5B%22availableOnSite%3A%22%5D%5D"},{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=1&maxValuesPerFacet=10&page=0&analytics=false&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Atwentyonesextury%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=false&attributesToHighlight=%5B%5D&attributesToSnippet=%5B%5D&tagFilters=&facets=availableOnSite"}]}'
+
+        if '21sextreme' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=84&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Atwentyonesextury%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&facets=%5B%22availableOnSite%22%5D&tagFilters=&facetFilters=%5B%5B%22availableOnSite%3A%22%5D%5D"},{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=1&maxValuesPerFacet=10&page=0&analytics=false&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Atwentyonesextury%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=false&attributesToHighlight=%5B%5D&attributesToSnippet=%5B%5D&tagFilters=&facets=availableOnSite"}]}'
+
+        if 'devilsfilm' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=84&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Adevilsfilm%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=&facets=%5B%22availableOnSite%22%5D&tagFilters=&facetFilters=%5B%5B%22availableOnSite%3Adevilsfilm%22%2C%22availableOnSite%3Asquirtalicious%22%2C%22availableOnSite%3Ahairyundies%22%2C%22availableOnSite%3Alesbianfactor%22%2C%22availableOnSite%3Adevilsfilmparodies%22%2C%22availableOnSite%3Agivemeteens%22%2C%22availableOnSite%3Aoutofthefamily%22%2C%22availableOnSite%3Adevilsgangbangs%22%2C%22availableOnSite%3AJaneDoePictures%22%2C%22availableOnSite%3Adevilstgirls%22%5D%5D"}]}'
+
+        if 'eroticax' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=60&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aeroticax%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=&facets=%5B%22availableOnSite%22%2C%22male%22%5D&tagFilters=&facetFilters=%5B%5B%22male%3A0%22%5D%2C%5B%22availableOnSite%3Aeroticax%22%5D%5D"}]}'
+
+        if 'falconstudios' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=60&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Afalconstudios%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=&facets=%5B%22availableOnSite%22%5D&tagFilters=&facetFilters=%5B%5B%22availableOnSite%3Afalconstudios%22%2C%22availableOnSite%3Ahothouse%22%2C%22availableOnSite%3Afalconstudiospartners%22%5D%5D"}]}'
+
+        if 'genderxfilms' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=60&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Agenderxfilms%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=(NOT%20availableOnSite%3A\'genderxpartners\'%20AND%20NOT%20availableOnSite%3A\'evilangelpartners\'%20AND%20NOT%20availableOnSite%3A\'evilangelpartners\')&facets=%5B%22male%22%2C%22shemale%22%5D&tagFilters=&facetFilters=%5B%5B%22shemale%3A1%22%5D%2C%5B%22male%3A0%22%5D%5D"}]}'
+
+        if 'girlfriendsfilms' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=60&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Agirlfriendsfilms%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=&facets=%5B%22male%22%2C%22shemale%22%5D&tagFilters=&facetFilters=%5B%5B%22shemale%3A0%22%5D%2C%5B%22male%3A0%22%5D%5D"}]}'
+
+        if 'ragingstallion' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=60&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aragingstallion%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=(availableOnSite%3A\'ragingstallion\')&facets=%5B%5D&tagFilters="}]}'
+
+        if 'whiteghetto' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=84&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Awhiteghetto%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=&facets=%5B%22availableOnSite%22%5D&tagFilters=&facetFilters=%5B%5B%22availableOnSite%3Awhiteghetto%22%5D%5D"}]}'
+
+        if 'zerotolerance' in referrer:
+            jbody = '{"requests":[{"indexName":"all_actors_latest_desc","params":"query=&hitsPerPage=60&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Azerotolerancefilms%22%2C%22context%3Apornstars%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22actor_id%22%2C%22name%22%2C%22pictures%22%2C%22gender%22%2C%22sitename%22%2C%22url_name%22%2C%22last_release_date%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22description%22%2C%22attributes%22%2C%22objectID%22%2C%22shemale%22%2C%22male%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=&facets=%5B%22has_pictures%22%2C%22male%22%5D&tagFilters=&facetFilters=%5B%5B%22male%3A0%22%5D%2C%5B%22has_pictures%3A1%22%5D%5D"}]}'
 
         return scrapy.Request(
             url=algolia_url,
@@ -145,3 +179,20 @@ def call_algolia(self, page, token, referrer):
             callback=self.parse,
             headers=headers
         )
+
+    def conv_height(self, height):
+        if height:
+            tot_inches = 0
+            if re.search(r'(\d+)[\'\"]', height):
+                feet = re.search(r'(\d+)\'', height)
+                if feet:
+                    feet = feet.group(1)
+                    tot_inches = tot_inches + (int(feet) * 12)
+                inches = re.search(r'\d+?\'(\d+)', height)
+                if inches:
+                    inches = inches.group(1)
+                    inches = int(inches)
+                    tot_inches = tot_inches + inches
+                height = str(int(tot_inches * 2.54)) + "cm"
+                return height
+        return None
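Review note: the nine per-site branches above differ only in hitsPerPage, the site analytics tag, and the facet/filter clauses; everything else in the Algolia request body is identical. Also worth flagging: the new weight handling treats any value above 70 as pounds and converts with 0.453592. A table-driven sketch of the request builder (simplified; the real bodies above carry extra analytics and highlight parameters that would need to be kept, and Algolia expects facetFilters URL-encoded inside the params string, e.g. via urllib.parse.quote):

```python
import json

# Hypothetical refactor sketch; keys mirror the referrer checks above.
SITE_QUERY = {
    '21sextreme': {'hits': 84, 'facet_filters': [['availableOnSite:']]},
    'devilsfilm': {'hits': 84, 'facet_filters': [['availableOnSite:devilsfilm']]},
    'eroticax': {'hits': 60, 'facet_filters': [['male:0'], ['availableOnSite:eroticax']]},
    # ... one entry per site
}

def build_jbody(referrer, page):
    for key, cfg in SITE_QUERY.items():
        if key in referrer:
            params = 'query=&hitsPerPage={}&page={}&facetFilters={}'.format(
                cfg['hits'], page, json.dumps(cfg['facet_filters']))
            return json.dumps({'requests': [
                {'indexName': 'all_actors_latest_desc', 'params': params}]})
    return None
```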
"/models/")]/@href').getall() + performers = response.xpath('//div[@class="modelPic"]/a/@href').getall() for performer in performers: yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers) @@ -91,5 +91,5 @@ def get_height(self, response): inches = int(inches) else: inches = 0 - return str(int((feet + inches) * 2.54)) + return str(int((feet + inches) * 2.54)) + "cm" return None diff --git a/performers/networkProjectOneServicePerformer.py b/performers/networkProjectOneServicePerformer.py index 480ede72..dffda8a5 100644 --- a/performers/networkProjectOneServicePerformer.py +++ b/performers/networkProjectOneServicePerformer.py @@ -51,6 +51,7 @@ class ProjectOneServicePerformerSpider(BasePerformerScraper): 'https://www.trueamateurs.com', 'https://www.tube8vip.com', 'https://www.twistys.com', + 'https://www.voyr.com', 'https://www.whynotbi.com', ] diff --git a/performers/networkVegasDreamworksPerformer.py b/performers/networkVegasDreamworksPerformer.py index 6f67b681..bbc5cc5e 100644 --- a/performers/networkVegasDreamworksPerformer.py +++ b/performers/networkVegasDreamworksPerformer.py @@ -21,6 +21,7 @@ class VegasDreamworksPerformerSpider(BasePerformerScraper): parent = 'Vegas Dreamworks' start_urls = [ + # ~ ['https://asiansexdiary.com/', '/models/page/%s/?sortby=date'], ['https://screwmetoo.com/', '/models/page/%s/?sortby=date'], ['https://milftrip.com/', '/models/page/%s/?sortby=date'], ['https://tuktukpatrol.com/', '/models/page/%s/?sortby=date'], @@ -120,7 +121,8 @@ def get_bio(self, response): bio = response.xpath('//div[@class="model-desc"]/text()').getall() if bio: bio = " ".join(bio) - return bio.strip() + if bio: + return bio.strip() return '' def get_gender(self, response): diff --git a/performers/siteAdultAllStarsPerformer.py b/performers/siteAdultAllStarsPerformer.py new file mode 100644 index 00000000..14312638 --- /dev/null +++ b/performers/siteAdultAllStarsPerformer.py @@ -0,0 +1,59 @@ +import re +import scrapy + +from tpdb.BasePerformerScraper import BasePerformerScraper + + +class SiteAdultAllStarsPerformerSpider(BasePerformerScraper): + selector_map = { + 'name': '//div[@class="updatesBlock"]/div/h2/text()', + 'image': '//div[contains(@class, "model_picture")]/img/@src0_3x|//div[contains(@class, "model_picture")]/img/@src0_2x|//div[contains(@class, "model_picture")]/img/@src0_1x', + 'image_blob': True, + 'bio': '//comment()[contains(.,"Bio Extra Field")]/following-sibling::p/text()', + 'pagination': '/models/models_%s_d.html', + 'external_id': r'model/(.*)/' + } + + name = 'AdultAllStarsPerformer' + network = 'Adult All Stars' + + start_urls = [ + 'https://www.adultallstars.com', + ] + + def get_gender(self, response): + return 'Female' + + def get_performers(self, response): + performers = response.xpath('//div[contains(@class, "modelPic")]/a/@href').getall() + for performer in performers: + yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers) + + def get_name(self, response): + name = super().get_name(response) + name = name.replace("/", "").strip() + return name + + def get_astrology(self, response): + astrology = response.xpath('//comment()[contains(.,"Bio Extra Fields")]/following-sibling::text()[contains(., "Astrological")]') + if astrology: + astrology = astrology.getall() + astrology = "".join(astrology) + astrology = astrology.replace("\n", "").replace("\r", "").replace("\t", "") + astrology = 
diff --git a/performers/siteAdultAllStarsPerformer.py b/performers/siteAdultAllStarsPerformer.py
new file mode 100644
index 00000000..14312638
--- /dev/null
+++ b/performers/siteAdultAllStarsPerformer.py
@@ -0,0 +1,59 @@
+import re
+import scrapy
+
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class SiteAdultAllStarsPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'name': '//div[@class="updatesBlock"]/div/h2/text()',
+        'image': '//div[contains(@class, "model_picture")]/img/@src0_3x|//div[contains(@class, "model_picture")]/img/@src0_2x|//div[contains(@class, "model_picture")]/img/@src0_1x',
+        'image_blob': True,
+        'bio': '//comment()[contains(.,"Bio Extra Field")]/following-sibling::p/text()',
+        'pagination': '/models/models_%s_d.html',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'AdultAllStarsPerformer'
+    network = 'Adult All Stars'
+
+    start_urls = [
+        'https://www.adultallstars.com',
+    ]
+
+    def get_gender(self, response):
+        return 'Female'
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[contains(@class, "modelPic")]/a/@href').getall()
+        for performer in performers:
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers)
+
+    def get_name(self, response):
+        name = super().get_name(response)
+        name = name.replace("/", "").strip()
+        return name
+
+    def get_astrology(self, response):
+        astrology = response.xpath('//comment()[contains(.,"Bio Extra Fields")]/following-sibling::text()[contains(., "Astrological")]')
+        if astrology:
+            astrology = astrology.getall()
+            astrology = "".join(astrology)
+            astrology = astrology.replace("\n", "").replace("\r", "").replace("\t", "")
+            astrology = re.search(r'Sign:(.*)', astrology)
+            if astrology:
+                astrology = astrology.group(1)
+                return astrology.strip()
+        return None
+
+    def get_nationality(self, response):
+        nationality = response.xpath('//comment()[contains(.,"Bio Extra Fields")]/following-sibling::text()[contains(., "Nationality")]')
+        if nationality:
+            nationality = nationality.getall()
+            nationality = "".join(nationality)
+            nationality = nationality.replace("\n", "").replace("\r", "").replace("\t", "")
+            nationality = re.search(r'Nationality:(.*)', nationality)
+            if nationality:
+                nationality = nationality.group(1)
+                return nationality.strip()
+        return None
diff --git a/performers/siteAsianSexDiaryPerformer.py b/performers/siteAsianSexDiaryPerformer.py
new file mode 100644
index 00000000..ab605fb2
--- /dev/null
+++ b/performers/siteAsianSexDiaryPerformer.py
@@ -0,0 +1,49 @@
+from tpdb.BasePerformerScraper import BasePerformerScraper
+from tpdb.items import PerformerItem
+
+
+class SiteAsianSexDiaryPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '/models/page/%s/?sortby=date',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'SiteAsianSexDiaryPerformer'
+
+    start_urls = [
+        'https://asiansexdiary.com/',
+    ]
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[contains(@class, "fsp-model")]')
+        for performer in performers:
+            item = PerformerItem()
+
+            item['name'] = self.cleanup_title(performer.xpath('.//h4/text()').get())
+            image = performer.xpath('.//amp-img/@src')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            else:
+                item['image'] = ""
+                item['image_blob'] = ""
+            item['bio'] = ''
+            item['gender'] = 'Female'
+            item['astrology'] = ''
+            item['birthday'] = ''
+            item['birthplace'] = ''
+            item['cupsize'] = ''
+            item['ethnicity'] = ''
+            item['eyecolor'] = ''
+            item['fakeboobs'] = ''
+            item['haircolor'] = ''
+            item['height'] = ''
+            item['measurements'] = ''
+            item['nationality'] = ''
+            item['piercings'] = ''
+            item['tattoos'] = ''
+            item['weight'] = ''
+            item['network'] = 'Vegas Dreamworks'
+            item['url'] = self.format_link(response, performer.xpath('./a/@href').get())
+
+            yield item
diff --git a/performers/siteBaitBuddiesPerformer.py b/performers/siteBaitBuddiesPerformer.py
new file mode 100644
index 00000000..b6b35b6e
--- /dev/null
+++ b/performers/siteBaitBuddiesPerformer.py
@@ -0,0 +1,64 @@
+import re
+import scrapy
+
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class SiteBaitBuddiesPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'name': '//div[@class="profile-disc"]/h1/text()',
+        'image': '//img[@class="profile-pic"]/@src',
+        'image_blob': True,
+        'eyecolor': '//div[@class="profile-body"]//text()[contains(.,"Eyes")]/following-sibling::b[1]/text()',
+        'haircolor': '//div[@class="profile-body"]//text()[contains(.,"Hair")]/following-sibling::b[1]/text()',
+
+        'pagination': '/?page=theguys&p=%s',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'BaitBuddiesPerformer'
+    network = 'Bait Buddies'
+
+    start_urls = [
+        'https://www.baitbuddies.com',
+    ]
+
+    def get_gender(self, response):
+        return 'Male'
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[@class="theguys-thumb"]/a/@href').getall()
+        for performer in performers:
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers)
+
+    def get_weight(self, response):
+        weight = response.xpath('//div[@class="profile-body"]//text()[contains(.,"Weight")]/following-sibling::b[1]')
+        if weight:
+            weight = weight.get()
+            weight = re.search(r'(\d{2,3})', weight)
+            if weight:
+                weight = weight.group(1)
+                weight = str(int(int(weight) * .4535)) + "kg"
+                return weight
+
+    def get_height(self, response):
+        height = response.xpath('//div[@class="profile-body"]//text()[contains(.,"Height")]/following-sibling::b[1]')
+        if height:
+            height = height.get()
+            height = height.replace("``", "\"").replace("`", "'")
+            if "'" in height:
+                height = re.sub(r'[^0-9\']', '', height)
+                feet = re.search(r'(\d+)\'', height)
+                if feet:
+                    feet = feet.group(1)
+                    feet = int(feet) * 12
+                else:
+                    feet = 0
+                inches = re.search(r'\'(\d+)', height)
+                if inches:
+                    inches = inches.group(1)
+                    inches = int(inches)
+                else:
+                    inches = 0
+                return str(int((feet + inches) * 2.54)) + "cm"
+        return None
diff --git a/performers/siteChristianWildePerformer.py b/performers/siteChristianWildePerformer.py
new file mode 100644
index 00000000..f113983d
--- /dev/null
+++ b/performers/siteChristianWildePerformer.py
@@ -0,0 +1,91 @@
+import scrapy
+from tpdb.BasePerformerScraper import BasePerformerScraper
+from tpdb.items import PerformerItem
+
+
+class SiteChristianWildePerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'ChristianWildePerformer'
+
+    start_url = 'https://christianwilde.com'
+
+    paginations = [
+        '/models/models_%s.html?g=m',
+        '/models/models_%s.html?g=f',
+        '/models/models_%s.html?g=tf',
+        '/models/models_%s.html?g=tm',
+        '/models/models_%s.html?g=nb',
+    ]
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        for pagination in self.paginations:
+            meta['pagination'] = pagination
+            yield scrapy.Request(url=self.get_next_page_url(self.start_url, self.page, meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def parse(self, response, **kwargs):
+        performers = self.get_performers(response)
+        count = 0
+        for performer in performers:
+            count += 1
+            yield performer
+
+        if count:
+            if 'page' in response.meta and response.meta['page'] < self.limit_pages:
+                meta = response.meta
+                meta['page'] = meta['page'] + 1
+                print('NEXT PAGE: ' + str(meta['pagination']))
+                yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def get_next_page_url(self, base, page, pagination):
+        return self.format_url(base, pagination % page)
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[@class="model"]')
+        for performer in performers:
+            item = PerformerItem()
+
+            item['name'] = self.cleanup_title(performer.xpath('.//p/a/text()').get())
+            image = performer.xpath('.//img/@src0_3x|.//img/@src0_2x|.//img/@src0_1x')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            else:
+                item['image'] = ""
+                item['image_blob'] = ""
+            item['bio'] = ''
+            item['gender'] = 'Male'
+            if "g=f" in response.url:
+                item['gender'] = 'Female'
+            if "g=m" in response.url:
+                item['gender'] = 'Male'
+            if "g=tf" in response.url:
+                item['gender'] = 'Trans Female'
+            if "g=tm" in response.url:
+                item['gender'] = 'Trans Male'
+            if "g=nb" in response.url:
+                item['gender'] = 'Non Binary'
+            item['astrology'] = ''
+            item['birthday'] = ''
+            item['birthplace'] = ''
+            item['cupsize'] = ''
+            item['ethnicity'] = ''
+            item['eyecolor'] = ''
+            item['fakeboobs'] = ''
+            item['haircolor'] = ''
+            item['height'] = ''
+            item['measurements'] = ''
+            item['nationality'] = ''
+            item['piercings'] = ''
+            item['tattoos'] = ''
+            item['weight'] = ''
+            item['network'] = 'Christian Wilde'
+            item['url'] = self.format_link(response, performer.xpath('.//p/a/@href').get())
+
+            yield item
diff --git a/performers/siteCombatZonePerformer.py b/performers/siteCombatZonePerformer.py
new file mode 100644
index 00000000..ca8338a3
--- /dev/null
+++ b/performers/siteCombatZonePerformer.py
@@ -0,0 +1,81 @@
+import scrapy
+from tpdb.BasePerformerScraper import BasePerformerScraper
+from tpdb.items import PerformerItem
+
+
+class SiteCombatZonePerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'CombatZonePerformer'
+
+    start_url = 'https://tour.combatzonexxx.com'
+
+    paginations = [
+        '/models/models_%s_d.html?g=f',
+        '/models/models_%s_d.html?g=t',
+    ]
+
+    def get_next_page_url(self, base, page, pagination):
+        return self.format_url(base, pagination % page)
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        for pagination in self.paginations:
+            meta['pagination'] = pagination
+            yield scrapy.Request(url=self.get_next_page_url(self.start_url, self.page, meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def parse(self, response, **kwargs):
+        performers = self.get_performers(response)
+        count = 0
+        for performer in performers:
+            count += 1
+            yield performer
+
+        if count:
+            if 'page' in response.meta and response.meta['page'] < self.limit_pages:
+                meta = response.meta
+                meta['page'] = meta['page'] + 1
+                print('NEXT PAGE: ' + str(meta['page']))
+                yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[contains(@class, "item-portrait")]')
+        for performer in performers:
+            item = PerformerItem()
+
+            item['name'] = self.cleanup_title(performer.xpath('./div[1]/a/span/text()').get())
+            image = performer.xpath('./div[1]//img/@src0_2x|./div[1]//img/@src0_1x')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            else:
+                item['image'] = ""
+                item['image_blob'] = ""
+            item['bio'] = ''
+            if "g=t" in response.url:
+                item['gender'] = "Trans Female"
+            else:
+                item['gender'] = "Female"
+            item['astrology'] = ''
+            item['birthday'] = ''
+            item['birthplace'] = ''
+            item['cupsize'] = ''
+            item['ethnicity'] = ''
+            item['eyecolor'] = ''
+            item['fakeboobs'] = ''
+            item['haircolor'] = ''
+            item['height'] = ''
+            item['measurements'] = ''
+            item['nationality'] = ''
+            item['piercings'] = ''
+            item['tattoos'] = ''
+            item['weight'] = ''
+            item['network'] = 'Combat Zone'
+            item['url'] = self.format_link(response, performer.xpath('./div[1]/a/@href').get())
+
+            yield item
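Review note: siteChristianWildePerformer.py and siteCombatZonePerformer.py above (and the MilfCandy pair below) all infer gender from the g= flag in the pagination query with chained ifs; the flag-to-label mapping could be a single lookup. Sketch (labels copied from the spiders; helper name hypothetical):

```python
from urllib.parse import urlparse, parse_qs

GENDER_FLAGS = {
    'f': 'Female', 'm': 'Male', 't': 'Trans Female',
    'tf': 'Trans Female', 'tm': 'Trans Male', 'nb': 'Non Binary',
}

def gender_from_url(url, default='Female'):
    flag = parse_qs(urlparse(url).query).get('g', [''])[0]
    return GENDER_FLAGS.get(flag, default)

# gender_from_url('https://example.com/models_1.html?g=tf') -> 'Trans Female'
```

One thing to confirm: these spiders emit 'Trans Female'/'Non Binary' while networkAdulttimeAPIPerformers.py above was just changed to emit 'Transgender Female'; if TPDB expects one canonical label set, these should agree.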
diff --git a/performers/siteDanniPerformer.py b/performers/siteDanniPerformer.py
new file mode 100644
index 00000000..8c2e557a
--- /dev/null
+++ b/performers/siteDanniPerformer.py
@@ -0,0 +1,52 @@
+import re
+from tpdb.BasePerformerScraper import BasePerformerScraper
+from tpdb.items import PerformerItem
+
+
+class SiteDanniPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '/models/models_%s_d',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'DanniPerformer'
+
+    start_urls = [
+        'https://www.danni.com',
+    ]
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[@class="danni-card"]')
+        for performer in performers:
+            item = PerformerItem()
+
+            item['name'] = self.cleanup_title(performer.xpath('.//div[contains(@class, "card-name")]/a/text()').get())
+            image = performer.xpath('.//img/@src0_2x|.//img/@src0_1x')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            else:
+                item['image'] = ""
+                item['image_blob'] = ""
+            if "?" in item['image']:
+                item['image'] = re.search(r'(.*)\?', item['image']).group(1)
+            item['bio'] = ''
+            item['gender'] = 'Female'
+            item['astrology'] = ''
+            item['birthday'] = ''
+            item['birthplace'] = ''
+            item['cupsize'] = ''
+            item['ethnicity'] = ''
+            item['eyecolor'] = ''
+            item['fakeboobs'] = ''
+            item['haircolor'] = ''
+            item['height'] = ''
+            item['measurements'] = ''
+            item['nationality'] = ''
+            item['piercings'] = ''
+            item['tattoos'] = ''
+            item['weight'] = ''
+            item['network'] = 'Sexual Prime'
+            item['url'] = self.format_link(response, performer.xpath('./a[1]/@href').get())
+
+            yield item
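Review note: siteDanniPerformer.py above trims tracking query strings from image URLs, but only after get_image_blob_from_link has already fetched the original URL, so the stored link and the fetched blob can diverge; moving the trim before the blob fetch would keep them consistent. The trim itself:

```python
import re

url = 'https://example.com/models/jane/cover.jpg?w=640&token=abc'  # illustrative URL
print(re.search(r'(.*)\?', url).group(1))
# https://example.com/models/jane/cover.jpg
```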
diff --git a/performers/siteDownblouseWowPerformer.py b/performers/siteDownblouseWowPerformer.py
new file mode 100644
index 00000000..885a065f
--- /dev/null
+++ b/performers/siteDownblouseWowPerformer.py
@@ -0,0 +1,30 @@
+import scrapy
+
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class SiteDownblouseWowPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'name': '//div[contains(@class,"modelinfo")]/p/strong[contains(text(), "Name")]/following-sibling::text()[1]',
+        'image': '//div[contains(@class,"modelpic")]/img/@src',
+        'image_blob': True,
+        'cupsize': '//div[contains(@class,"modelinfo")]/p/strong[contains(text(), "Bra")]/following-sibling::text()[1]',
+
+        'pagination': '/show.php?a=147_%s',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'DownblouseWowPerformer'
+    network = 'Downblouse Wow'
+
+    start_urls = [
+        'https://downblousewow.com',
+    ]
+
+    def get_gender(self, response):
+        return 'Female'
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[@class="itemminfo"]/p/a/@href').getall()
+        for performer in performers:
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers)
diff --git a/performers/siteEnjoyXPerformer.py b/performers/siteEnjoyXPerformer.py
new file mode 100644
index 00000000..c97356f4
--- /dev/null
+++ b/performers/siteEnjoyXPerformer.py
@@ -0,0 +1,89 @@
+import re
+import scrapy
+
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class PerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'name': '//h1/text()',
+        'image': '//img[contains(@class, "detail__picture")]/@src',
+        'image_blob': True,
+        'birthday': '//div[contains(@class, "pornstar-detail__description-block")]//strong[contains(text(), "Birthday")]/following-sibling::text()[1]',
+        're_birthday': r'(\w+ \d{1,2}, \d{4})',
+        'height': '//div[contains(@class, "pornstar-detail__description-block")]//strong[contains(text(), "Height")]/following-sibling::text()[1]',
+        're_height': r'(\d+ cm)',
+        'measurements': '//div[contains(@class, "pornstar-detail__description-block")]//strong[contains(text(), "Measurements")]/following-sibling::text()[1]',
+
+        'pagination': '',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'EnjoyxPerformer'
+    network = "Enjoyx"
+
+    start_url = 'https://enjoyx.com'
+
+    paginations = [
+        '/model/girls?page=%s',
+        '/model/boys?page=%s',
+    ]
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        for pagination in self.paginations:
+            meta['pagination'] = pagination
+            yield scrapy.Request(url=self.get_next_page_url(self.start_url, self.page, meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def parse(self, response, **kwargs):
+        performers = self.get_performers(response)
+        count = 0
+        for performer in performers:
+            count += 1
+            yield performer
+
+        if count:
+            if 'page' in response.meta and response.meta['page'] < self.limit_pages:
+                meta = response.meta
+                meta['page'] = meta['page'] + 1
+                print('NEXT PAGE: ' + str(meta['page']))
+                yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['pagination']), callback=self.parse, meta=meta)
+
+    def get_next_page_url(self, base, page, pagination):
+        return self.format_url(base, pagination % page)
+
+    def get_performers(self, response):
+        meta = response.meta
+        performers = response.xpath('//div[contains(@class, "pornstar-card")]/a/@href').getall()
+        for performer in performers:
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, meta=meta)
+
+    def get_gender(self, response):
+        if "girls" in response.meta['pagination']:
+            return "Female"
+        if "boys" in response.meta['pagination']:
+            return "Male"
+
+    def get_measurements(self, response):
+        if 'measurements' in self.selector_map:
+            measurements = self.process_xpath(response, self.get_selector_map('measurements')).get()
+            if measurements and re.search(r'(\d+\w+-\d+-\d+)', measurements):
+                measurements = re.search(r'(\d+\w+-\d+-\d+)', measurements).group(1)
+                return measurements.strip()
+        return ''
+
+    def get_cupsize(self, response):
+        if 'cupsize' in self.selector_map and self.get_selector_map('cupsize'):
+            cupsize = self.process_xpath(response, self.get_selector_map('cupsize')).get()
+            return cupsize.strip()
+        else:
+            if 'measurements' in self.selector_map:
+                measurements = self.process_xpath(response, self.get_selector_map('measurements')).get()
+                if measurements and re.search(r'(\d+\w+-\d+-\d+)', measurements):
+                    cupsize = re.search(r'(\d+\w+)-\d+-\d+', measurements)
+                    if cupsize:
+                        cupsize = cupsize.group(1)
+                        return cupsize.strip()
+        return ''
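Review note: EnjoyX's measurement parsing hinges on the `\d+\w+-\d+-\d+` shape; since `\w` cannot match the dash, the first group naturally stops at the cup letters. A quick illustration:

```python
import re

text = 'Measurements: 34C-24-36'  # illustrative value
print(re.search(r'(\d+\w+-\d+-\d+)', text).group(1))  # 34C-24-36
print(re.search(r'(\d+\w+)-\d+-\d+', text).group(1))  # 34C
```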
diff --git a/performers/siteExposedNursesPerformer.py b/performers/siteExposedNursesPerformer.py
new file mode 100644
index 00000000..1e491266
--- /dev/null
+++ b/performers/siteExposedNursesPerformer.py
@@ -0,0 +1,68 @@
+import re
+import scrapy
+from tpdb.items import PerformerItem
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class SiteExposedNursesPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'ExposedNursesPerformer'
+    network = 'Apollo Cash'
+
+    start_urls = [
+        'https://www.exposednurses.com',
+    ]
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        link = "https://www.exposednurses.com/pussy-hd.php?pagea="
+        yield scrapy.Request(link, callback=self.get_performers, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def get_performers(self, response):
+        meta = response.meta
+        performers = response.xpath('//a[@class="model_link_abc"]')
+        for performer in performers:
+            meta['name'] = performer.xpath('./text()[1]').get()
+            performer = performer.xpath('./@href').get()
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, meta=meta)
+
+    def parse_performer(self, response):
+        meta = response.meta
+        item = PerformerItem()
+
+        item['name'] = self.cleanup_title(meta['name'])
+        image = response.xpath('//img[@class="image1" and contains(@src, "1_1")]/@src')
+        if image:
+            item['image'] = self.format_link(response, image.get())
+            item['image_blob'] = self.get_image_blob_from_link(item['image'])
+        else:
+            item['image'] = ""
+            item['image_blob'] = ""
+        item['bio'] = ''
+        item['gender'] = 'Female'
+        item['astrology'] = ''
+        item['birthday'] = ''
+        item['birthplace'] = ''
+        item['cupsize'] = ''
+        item['ethnicity'] = ''
+        item['eyecolor'] = ''
+        item['fakeboobs'] = ''
+        item['haircolor'] = ''
+        item['height'] = ''
+        item['measurements'] = ''
+        item['nationality'] = ''
+        item['piercings'] = ''
+        item['tattoos'] = ''
+        item['weight'] = ''
+        item['network'] = 'Apollo Cash'
+        item['url'] = response.url
+        if "&nats" in item['url']:
+            item['url'] = re.search(r'(.*)\&nats', item['url']).group(1)
+
+        yield item
diff --git a/performers/siteGapeMyPussyPerformer.py b/performers/siteGapeMyPussyPerformer.py
new file mode 100644
index 00000000..9bb1b9f7
--- /dev/null
+++ b/performers/siteGapeMyPussyPerformer.py
@@ -0,0 +1,68 @@
+import re
+import scrapy
+from tpdb.items import PerformerItem
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class SiteGapeMyPussyPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'GapeMyPussyPerformer'
+    network = 'Apollo Cash'
+
+    start_urls = [
+        'https://www.gapemypussy.com',
+    ]
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        link = "https://www.gapemypussy.com/pussy-gaping-hd-download.php?pagem="
+        yield scrapy.Request(link, callback=self.get_performers, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def get_performers(self, response):
+        meta = response.meta
+        performers = response.xpath('//a[@class="model_link_abc"]')
+        for performer in performers:
+            meta['name'] = performer.xpath('./text()[1]').get()
+            performer = performer.xpath('./@href').get()
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, meta=meta)
+
+    def parse_performer(self, response):
+        meta = response.meta
+        item = PerformerItem()
+
+        item['name'] = self.cleanup_title(meta['name'])
+        image = response.xpath('//img[contains(@src, "1_2")]/@src')
+        if image:
+            item['image'] = self.format_link(response, image.get())
+            item['image_blob'] = self.get_image_blob_from_link(item['image'])
+        else:
+            item['image'] = ""
+            item['image_blob'] = ""
+        item['bio'] = ''
+        item['gender'] = 'Female'
+        item['astrology'] = ''
+        item['birthday'] = ''
+        item['birthplace'] = ''
+        item['cupsize'] = ''
+        item['ethnicity'] = ''
+        item['eyecolor'] = ''
+        item['fakeboobs'] = ''
+        item['haircolor'] = ''
+        item['height'] = ''
+        item['measurements'] = ''
+        item['nationality'] = ''
+        item['piercings'] = ''
+        item['tattoos'] = ''
+        item['weight'] = ''
+        item['network'] = 'Apollo Cash'
+        item['url'] = response.url
+        if "&nats" in item['url']:
+            item['url'] = re.search(r'(.*)\&nats', item['url']).group(1)
+
+        yield item
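Review note: siteExposedNursesPerformer.py and siteGapeMyPussyPerformer.py above are identical apart from the listing URL, the image selector, and the spider name; if more Apollo Cash sites follow, a parameterised base would avoid a third copy. Sketch (class and attribute names hypothetical):

```python
import scrapy
from tpdb.BasePerformerScraper import BasePerformerScraper

class ApolloCashPerformerBase(BasePerformerScraper):
    listing_url = None   # per-site listing page, e.g. the pussy-hd.php URL above
    image_xpath = None   # per-site portrait selector

    def start_requests(self):
        yield scrapy.Request(self.listing_url, callback=self.get_performers,
                             meta={'page': self.page},
                             headers=self.headers, cookies=self.cookies)
```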
diff --git a/performers/siteIFeelMyselfPerformer.py b/performers/siteIFeelMyselfPerformer.py
new file mode 100644
index 00000000..bf2cc2da
--- /dev/null
+++ b/performers/siteIFeelMyselfPerformer.py
@@ -0,0 +1,56 @@
+import scrapy
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class SiteIFeelMyselfPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '/public/main.php?page=view&mode=all&offset=%s',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'IFeelMyselfPerformer'
+    network = 'Feck Erotica'
+
+    start_urls = [
+        'https://ifeelmyself.com',
+    ]
+
+    def parse(self, response, **kwargs):
+        performers = self.get_performers(response)
+        count = 0
+        for performer in performers:
+            count += 1
+            yield performer
+
+        if 'page' in response.meta and response.meta['page'] < self.limit_pages:
+            meta = response.meta
+            meta['page'] = meta['page'] + 1
+            print('NEXT PAGE: ' + str(meta['page']))
+            yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']),
+                                 callback=self.parse,
+                                 meta=meta,
+                                 headers=self.headers,
+                                 cookies=self.cookies)
+
+    def get_performers(self, response):
+        performers = response.xpath('//table[@class="DispResults"]')
+        for performer in performers:
+            item = self.init_performer()
+
+            item['name'] = self.cleanup_title(performer.xpath('.//a[contains(@href, "artist_bio")]/text()').get())
+            image = performer.xpath('.//img/@src')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            else:
+                item['image'] = ""
+                item['image_blob'] = ""
+            item['gender'] = 'Female'
+            item['network'] = 'Feck Erotica'
+            item['url'] = self.format_link(response, performer.xpath('.//a[contains(@href, "artist_bio")]/@href').get())
+
+            yield item
+
+    def get_next_page_url(self, base, page):
+        page = str((int(page) - 1) * 12)
+        return self.format_url(base, self.get_selector_map('pagination') % page)
diff --git a/performers/siteLadyFyrePerformer.py b/performers/siteLadyFyrePerformer.py
new file mode 100644
index 00000000..6f3bfc5b
--- /dev/null
+++ b/performers/siteLadyFyrePerformer.py
@@ -0,0 +1,49 @@
+from tpdb.BasePerformerScraper import BasePerformerScraper
+from tpdb.items import PerformerItem
+
+
+class SiteLadyFyrePerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'pagination': '/tour/models/models_%s_d.html?g=f',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'LadyFyrePerformer'
+
+    start_urls = [
+        'https://ladyfyre.com',
+    ]
+
+    def get_performers(self, response):
+        performers = response.xpath('//div[@class="model"]')
+        for performer in performers:
+            item = PerformerItem()
+
+            item['name'] = self.cleanup_title(performer.xpath('.//p/a/text()').get())
+            image = performer.xpath('.//img/@src0_3x|.//img/@src0_2x|.//img/@src0_1x')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            else:
+                item['image'] = ""
+                item['image_blob'] = ""
+            item['bio'] = ''
+            item['gender'] = 'Female'
+            item['astrology'] = ''
+            item['birthday'] = ''
+            item['birthplace'] = ''
+            item['cupsize'] = ''
+            item['ethnicity'] = ''
+            item['eyecolor'] = ''
+            item['fakeboobs'] = ''
+            item['haircolor'] = ''
+            item['height'] = ''
+            item['measurements'] = ''
+            item['nationality'] = ''
+            item['piercings'] = ''
+            item['tattoos'] = ''
+            item['weight'] = ''
+            item['network'] = 'Lady Fyre'
+            item['url'] = self.format_link(response, performer.xpath('./div/a/@href').get())
+
+            yield item
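Review note: siteIFeelMyselfPerformer.py above paginates by a 12-item offset rather than a page number, hence the (page - 1) * 12 in get_next_page_url:

```python
# page 1 -> offset 0, page 2 -> offset 12, page 3 -> offset 24
for page in (1, 2, 3):
    print((page - 1) * 12)
```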
diff --git a/performers/siteLadyboyGoldPerformer.py b/performers/siteLadyboyGoldPerformer.py
new file mode 100644
index 00000000..e7ab0181
--- /dev/null
+++ b/performers/siteLadyboyGoldPerformer.py
@@ -0,0 +1,44 @@
+import re
+import scrapy
+
+from tpdb.BasePerformerScraper import BasePerformerScraper
+
+
+class SiteLadyboyGoldPerformerSpider(BasePerformerScraper):
+    selector_map = {
+        'name': '//div[@class="container"]/div[1]/div[2]/div[1]/text()',
+        'image': '//div[@class="container"]//div[contains(@class,"photoUpdate-image")]//img/@src',
+        'image_blob': True,
+        'bio': '//div[@class="container"]//div[@class="profileBio"]/text()',
+        'height': '//div[@class="container"]//li[contains(text(), "Height:")]/text()',
+        're_height': r'(\d+cm)',
+        'measurements': '//div[@class="container"]//li[contains(text(), "Measurements:")]/text()',
+        're_measurements': r'(\d+\w+?-\d+-\d+)',
+        'cupsize': '//div[@class="container"]//li[contains(text(), "Measurements:")]/text()',
+        're_cupsize': r'(\d+\w+?)-\d+-\d+',
+        'weight': '//div[@class="container"]//li[contains(text(), "Weight:")]/text()',
+        're_weight': r'(\d+kg)',
+
+        'pagination': '',
+        'external_id': r'model/(.*)/'
+    }
+
+    name = 'LadyboyGoldPerformer'
+    network = 'Ladyboy Gold'
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        link = 'https://www.ladyboygold.com/index.php?section=1813'
+        yield scrapy.Request(link, callback=self.get_performers, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def get_gender(self, response):
+        return 'Trans Female'
+
+    def get_performers(self, response):
+        performers = response.xpath('//p[@class="setModel"]/a/@href').getall()
+        for performer in performers:
+            # Guard against hrefs without a &nats tracking suffix
+            if "&nats" in performer:
+                performer = re.search(r'(.*?)\&nats', performer).group(1)
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers)
re.search(r'(\d+\w+-\d+-\d+)', measurements): + measurements = re.search(r'(\d+\w+-\d+-\d+)', measurements).group(1) + return measurements.strip() + return '' + + def get_cupsize(self, response): + if 'cupsize' in self.selector_map and self.get_selector_map('cupsize'): + cupsize = self.process_xpath(response, self.get_selector_map('cupsize')).get() + return cupsize.strip() + else: + if 'measurements' in self.selector_map: + measurements = self.process_xpath(response, self.get_selector_map('measurements')).get() + measurements = measurements.replace(" ", "") + if measurements and re.search(r'(\d+\w+-\d+-\d+)', measurements): + cupsize = re.search(r'(\d+\w+)-\d+-\d+', measurements) + if cupsize: + cupsize = cupsize.group(1) + return cupsize.strip() + return '' + + def get_height(self, response): + height = super().get_height(response) + height = height.lower().replace(" ", "").replace("feet", "'").replace("and", "").replace("inches", "\"") + if "'" in height: + height = re.sub(r'[^0-9\']', '', height) + feet = re.search(r'(\d+)\'', height) + if feet: + feet = feet.group(1) + feet = int(feet) * 12 + else: + feet = 0 + inches = re.search(r'\'(\d+)', height) + if inches: + inches = inches.group(1) + inches = int(inches) + else: + inches = 0 + return str(int((feet + inches) * 2.54)) + "cm" + return None + + def get_image(self, response, path=None): + force_update = self.settings.get('force_update') + if force_update: + force_update = True + force_fields = self.settings.get('force_fields') + if force_fields: + force_fields = force_fields.split(",") + + if not force_update or (force_update and "image" in force_fields): + if 'image' in self.get_selector_map(): + image = self.get_element(response, 'image', 're_image') + if isinstance(image, list): + image = image[0] + image = image.replace(" ", "%20") + if path: + return self.format_url(path, image) + else: + return f"https://www.lukehardyxxx.com/army/{image}" + return '' diff --git a/performers/siteMilfCandyPerformer.py b/performers/siteMilfCandyPerformer.py new file mode 100644 index 00000000..07d1d5b5 --- /dev/null +++ b/performers/siteMilfCandyPerformer.py @@ -0,0 +1,49 @@ +from tpdb.BasePerformerScraper import BasePerformerScraper +from tpdb.items import PerformerItem + + +class SiteMilfCandyPerformerSpider(BasePerformerScraper): + selector_map = { + 'pagination': '/models/models_%s_d.html?g=f', + 'external_id': r'model/(.*)/' + } + + name = 'MilfCandyPerformer' + + start_urls = [ + 'https://tour.milfcandy.com', + ] + + def get_performers(self, response): + performers = response.xpath('//div[contains(@class,"item-portrait")]') + for performer in performers: + item = PerformerItem() + + item['name'] = self.cleanup_title(performer.xpath('./div[1]/a/span/text()').get()) + image = performer.xpath('.//img/@src0_3x|.//img/@src0_2x|.//img/@src0_1x') + if image: + item['image'] = self.format_link(response, image.get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = "" + item['image_blob'] = "" + item['bio'] = '' + item['gender'] = 'Female' + item['astrology'] = '' + item['birthday'] = '' + item['birthplace'] = '' + item['cupsize'] = '' + item['ethnicity'] = '' + item['eyecolor'] = '' + item['fakeboobs'] = '' + item['haircolor'] = '' + item['height'] = '' + item['measurements'] = '' + item['nationality'] = '' + item['piercings'] = '' + item['tattoos'] = '' + item['weight'] = '' + item['network'] = 'Milf Candy' + item['url'] = self.format_link(response, 
performer.xpath('./div[@class="timeDate"]/a/@href').get()) + + yield item diff --git a/performers/siteMilfCandyPerformerMale.py b/performers/siteMilfCandyPerformerMale.py new file mode 100644 index 00000000..6d2315db --- /dev/null +++ b/performers/siteMilfCandyPerformerMale.py @@ -0,0 +1,49 @@ +from tpdb.BasePerformerScraper import BasePerformerScraper +from tpdb.items import PerformerItem + + +class SiteMilfCandyPerformerSpider(BasePerformerScraper): + selector_map = { + 'pagination': '/models/models_%s_d.html?g=m', + 'external_id': r'model/(.*)/' + } + + name = 'MilfCandyPerformerMale' + + start_urls = [ + 'https://tour.milfcandy.com', + ] + + def get_performers(self, response): + performers = response.xpath('//div[contains(@class,"item-portrait")]') + for performer in performers: + item = PerformerItem() + + item['name'] = self.cleanup_title(performer.xpath('./div[1]/a/span/text()').get()) + image = performer.xpath('.//img/@src0_3x|.//img/@src0_2x|.//img/@src0_1x') + if image: + item['image'] = self.format_link(response, image.get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = "" + item['image_blob'] = "" + item['bio'] = '' + item['gender'] = 'Male' + item['astrology'] = '' + item['birthday'] = '' + item['birthplace'] = '' + item['cupsize'] = '' + item['ethnicity'] = '' + item['eyecolor'] = '' + item['fakeboobs'] = '' + item['haircolor'] = '' + item['height'] = '' + item['measurements'] = '' + item['nationality'] = '' + item['piercings'] = '' + item['tattoos'] = '' + item['weight'] = '' + item['network'] = 'Milf Candy' + item['url'] = self.format_link(response, performer.xpath('./div[@class="timeDate"]/a/@href').get()) + + yield item diff --git a/performers/siteMinnanoAVPerformer.py b/performers/siteMinnanoAVPerformer.py new file mode 100644 index 00000000..4ddcb6ca --- /dev/null +++ b/performers/siteMinnanoAVPerformer.py @@ -0,0 +1,114 @@ +import re +import string +import scrapy + +from tpdb.BasePerformerScraper import BasePerformerScraper + + +class SiteMinnanoAVPerformerSpider(BasePerformerScraper): + selector_map = { + 'name': '//section[contains(@class,"main-column")]/h1/span/text()', + 'image': '//section[contains(@class,"main-column")]//div[@class="act-area"]/div/img/@src', + 'image_blob': True, + 'birthday': '//section[contains(@class,"main-column")]//div[@class="act-profile"]//a[contains(@href, "birthday")]/@href', + 're_birthday': r'(\d{4}-\d{2}-\d{2})', + 'pagination': '/actress_list.php?page=%s', + 'external_id': r'model/(.*)/' + } + + name = 'MinnanoAV' + network = 'R18' + + start_urls = [ + 'https://www.minnano-av.com', + ] + + def get_gender(self, response): + return 'Female' + + def get_performers(self, response): + performers = response.xpath('//h2[@class="ttl"]/a/@href').getall() + for performer in performers: + yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers) + + def get_measurements(self, response): + measurements = response.xpath('//section[contains(@class,"main-column")]//div[@class="act-profile"]//span[contains(text(), "サイズ")]/following-sibling::p//text()') + if measurements: + measurements = "".join(measurements.getall()).strip() + measurements = re.sub(r'[^a-zA-Z0-9/]', '', measurements) + + bust = re.search(r'B(\d+)', measurements) + waist = re.search(r'W(\d+)', measurements) + hips = re.search(r'H(\d+)', measurements) + if re.search(r'B\d+([A-Za-z]+)', measurements): + cup = re.search(r'B\d+([A-Za-z]+)', 
measurements).group(1) + else: + cup = "" + + if bust and waist and hips: + # JAV profiles list sizes in centimeters; convert to inches for the B-W-H string + bust = bust.group(1) + bust = round(int(bust) / 2.54) + waist = waist.group(1) + waist = round(int(waist) / 2.54) + hips = hips.group(1) + hips = round(int(hips) / 2.54) + measurements = f"{str(bust)}{str(cup).upper()}-{str(waist)}-{str(hips)}" + return measurements.strip() + # without all three sizes the sanitized string is not a valid measurement, so return nothing + return '' + + def get_cupsize(self, response): + measurements = response.xpath('//section[contains(@class,"main-column")]//div[@class="act-profile"]//span[contains(text(), "サイズ")]/following-sibling::p//text()') + if measurements: + measurements = "".join(measurements.getall()).strip() + measurements = re.sub(r'[^a-zA-Z0-9/]', '', measurements) + + bust = re.search(r'B(\d+)', measurements) + if re.search(r'B\d+([A-Za-z]+)', measurements): + cup = re.search(r'B\d+([A-Za-z]+)', measurements).group(1) + else: + cup = "" + + if bust and cup: + bust = bust.group(1) + bust = round(int(bust) / 2.54) + cup = str(bust) + cup.upper() + return cup + + return "" + + def get_name(self, response): + name = super().get_name(response) + if "/" in name: + name = re.search(r'/(.*)', name).group(1) + name = re.sub(r'[^a-zA-Z0-9-\.\'_ ]', '', name) + return string.capwords(name.strip()) + + def get_ethnicity(self, response): + ethnicity_test = response.xpath('//section[contains(@class,"main-column")]//div[@class="act-profile"]//span[contains(text(), "出身地")]/following-sibling::p/a/text()') + if ethnicity_test: + ethnicity_test = ethnicity_test.get() + if ethnicity_test != "海外": + return "Asian" + return "" + + def get_height(self, response): + height = response.xpath('//section[contains(@class,"main-column")]//div[@class="act-profile"]//span[contains(text(), "サイズ")]/following-sibling::p//text()') + if height: + height = "".join(height.getall()).strip() + if re.search(r'T(\d+)', height): + height = re.search(r'T(\d+)', height).group(1) + return f"{height}cm" + return "" + + def get_image(self, response): + image = super().get_image(response) + if image and "?" in image: + image = re.search(r'(.*?)\?', image).group(1) + return image + + def get_url(self, response): + perfurl = super().get_url(response) + if perfurl and "?" in perfurl: + perfurl = re.search(r'(.*?)\?', perfurl).group(1) + return perfurl diff --git a/performers/siteMongerInAsiaPerformer.py b/performers/siteMongerInAsiaPerformer.py index 96396c91..84f104d4 100644 --- a/performers/siteMongerInAsiaPerformer.py +++ b/performers/siteMongerInAsiaPerformer.py @@ -1,59 +1,132 @@ -import html - -from tpdb.items import PerformerItem +import re +import string +import scrapy from tpdb.BasePerformerScraper import BasePerformerScraper +from tpdb.items import PerformerItem class SiteMongerInAsiaPerformerSpider(BasePerformerScraper): + name = 'MongerInAsiaPerformer' + start_url = 'https://mongerinasia.com/' + selector_map = { - 'pagination': '/categories/models_%s_d', - 'external_id': r'models/(.*).html' + 'external_id': r'', + 'pagination': '/_next/data/<buildId>/models.json?page=%s&order_by=name&sort_by=asc', } - name = 'MongerInAsiaPerformer' - network = "Monger In Asia" + def start_requests(self): + meta = {} + meta['page'] = self.page + yield scrapy.Request('https://mongerinasia.com/', callback=self.start_requests_2, meta=meta, headers=self.headers, cookies=self.cookies) - start_urls = [ - 'https://www.mongerinasia.com' - ] + def start_requests_2(self, response): + meta = response.meta + buildId = re.search(r'\"buildId\":\"(.*?)\"', response.text) + if buildId: + meta['buildID'] = buildId.group(1) + link = self.get_next_page_url(self.start_url, self.page, meta['buildID']) + yield scrapy.Request(link, callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - def get_performers(self, response): - performers = response.xpath('//div[@class="model"]') + def get_next_page_url(self, base, page, buildID): + pagination = self.get_selector_map('pagination') + # swap the <buildId> placeholder for the site's current Next.js build id scraped above + pagination = pagination.replace("<buildId>", buildID) + return self.format_url(base, pagination % page) + + def parse(self, response, **kwargs): + performers = self.get_performers(response) + count = 0 + for performer in performers: + count += 1 + yield performer + + if count: + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta = response.meta + meta['page'] = meta['page'] + 1 + print('NEXT PAGE: ' + str(meta['page'])) + yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['buildID']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_performers(self, response): + jsondata = response.json() + jsondata = jsondata['pageProps']['models']['data'] + for performer in jsondata: item = PerformerItem() - name = performer.xpath('./div/p/text()').get() - if name: - item['name'] = html.unescape(name.strip().title()) + item['name'] = performer['name'] + item['image'] = performer['thumb'] + item['image_blob'] = self.get_image_blob_from_link(item['image']) + item['bio'] = '' + if "gender" in performer and performer['gender']: + item['gender'] = string.capwords(performer['gender']) else: - item['name'] = '' - - image = performer.xpath('.//img/@src').get() - if image: - item['image'] = image.strip() + item['gender'] = "Female" + item['astrology'] = '' + if "birthdate" in performer and performer['birthdate'] and "1969" not in performer['birthdate']: + item['birthday'] = performer['birthdate'] else: - item['image'] = None - item['image_blob'] = self.get_image_blob_from_link(item['image']) - - item['url'] = response.url + item['birthday'] = '' - item['network'] = 'Monger in Asia' + if "location" in performer and performer['location']: + item['birthplace'] = performer['location'] + else: + item['birthplace'] = '' - item['astrology'] = 
'' - item['bio'] = '' - item['birthday'] = '' - item['birthplace'] = '' + item['measurements'] = '' item['cupsize'] = '' - item['ethnicity'] = '' + + item['ethnicity'] = 'Asian' item['eyecolor'] = '' item['fakeboobs'] = '' - item['gender'] = 'Female' item['haircolor'] = '' - item['height'] = '' - item['measurements'] = '' - item['nationality'] = '' + + if "height" in performer and performer['height']: + item['height'] = self.get_height(performer['height']) + else: + item['height'] = '' + + if "weight" in performer and performer['weight']: + item['weight'] = self.get_weight(performer['weight']) + else: + item['weight'] = '' + + if item['birthplace'] and "," in item['birthplace']: + item['nationality'] = re.search(r', (.*)', item['birthplace']).group(1) + item['nationality'] = item['nationality'].strip() + else: + item['nationality'] = '' + item['piercings'] = '' item['tattoos'] = '' - item['weight'] = '' + item['network'] = 'Monger In Asia' + item['url'] = f"https://www.mongerinasia.com/models/{performer['slug']}" yield item + + def get_weight(self, weight): + if weight: + if "kilos" in weight.lower(): + weight = re.sub(r'[^0-9a-z]', "", weight.lower()) + weight = re.search(r'(\d+)kilo', weight).group(1) + return weight + "kg" + return None + + def get_height(self, height): + height = height.replace("’", "'").replace("”", "\"") + height = re.sub(r'[^0-9\'\"]', "", height) + if "'" in height and '"' in height: + height = re.search(r'(\d+\'\d+\")', height).group(1) + height = re.sub(r'[^0-9\']', '', height) + feet = re.search(r'(\d+)\'', height) + if feet: + feet = feet.group(1) + feet = int(feet) * 12 + else: + feet = 0 + inches = re.search(r'\'(\d+)', height) + if inches: + inches = inches.group(1) + inches = int(inches) + else: + inches = 0 + return str(int((feet + inches) * 2.54)) + "cm" + return None diff --git a/performers/siteOnlyBBCPerformer.py b/performers/siteOnlyBBCPerformer.py new file mode 100644 index 00000000..a75363ba --- /dev/null +++ b/performers/siteOnlyBBCPerformer.py @@ -0,0 +1,127 @@ +import re +import scrapy +from tpdb.BasePerformerScraper import BasePerformerScraper + + +class SiteOnlyBBCPerformerSpider(BasePerformerScraper): + selector_map = { + 'name': '//h2/a/following-sibling::text()', + 'image': '//img[contains(@class,"model_bio_thumb")]/@src0_1x', + + 'height': '//span[@class="model_bio_heading"]/following-sibling::text()[contains(.,"Height")]', + 'eyecolor': '//span[@class="model_bio_heading"]/following-sibling::text()[contains(.,"Eye Color")]', + 'haircolor': '//span[@class="model_bio_heading"]/following-sibling::text()[contains(.,"Hair Color")]', + 'measurements': '//span[@class="model_bio_heading"]/following-sibling::text()[contains(.,"Measurements")]', + 'piercings': '//span[@class="model_bio_heading"]/following-sibling::text()[contains(.,"Piercings")]', + 'astrology': '//span[@class="model_bio_heading"]/following-sibling::text()[contains(.,"Astrological")]', + 'bio': '//span[@class="model_bio_heading"]/following-sibling::comment()[contains(.,"Bio Extra Field") and not(contains(.,"Accompanying"))]/following-sibling::text()', + + 'pagination': '/tour/models/models_%s_d.html?g=f', + 'external_id': 'models/(.+).html$' + } + + name = 'OnlyBBCPerformer' + network = 'Only BBC' + + start_urls = [ + 'https://www.onlybbc.com' + ] + + def get_performers(self, response): + performers = response.xpath('//div[@class="model"]/div/a/@href').getall() + for performer in performers: + yield scrapy.Request( + url=self.format_link(response, performer), + 
callback=self.parse_performer + ) + + def get_name(self, response): + name = self.process_xpath(response, self.get_selector_map('name')).get().strip() + name = re.sub(r'[^a-zA-Z0-9 ]', '', name) + return name.strip() + + def get_gender(self, response): + return "Female" + + def get_height(self, response): + height = super().get_height(response) + if height: + tot_inches = 0 + if re.search(r'(\d+)[\'\"]', height): + feet = re.search(r'(\d+)\'', height) + if feet: + feet = feet.group(1) + tot_inches = tot_inches + (int(feet) * 12) + inches = re.search(r'\d+?\'(\d+)', height) + if inches: + inches = inches.group(1) + inches = int(inches) + tot_inches = tot_inches + inches + height = str(int(tot_inches * 2.54)) + "cm" + return height + return None + + def get_eyecolor(self, response): + if 'eyecolor' in self.selector_map: + eyecolor = self.process_xpath(response, self.get_selector_map('eyecolor')).get() + if eyecolor: + # keep the spaces (the "Eye Color:" label itself contains one); only strip line breaks, as get_measurements does + eyecolor = eyecolor.replace("\n", "").replace("\r", "") + eyecolor = re.search(r'Eye Color:(.*)', eyecolor) + if eyecolor: + eyecolor = eyecolor.group(1) + return eyecolor.strip() + return '' + + def get_haircolor(self, response): + if 'haircolor' in self.selector_map: + haircolor = self.process_xpath(response, self.get_selector_map('haircolor')).get() + if haircolor: + haircolor = haircolor.replace("\n", "").replace("\r", "") + haircolor = re.search(r'Hair Color:(.*)', haircolor) + if haircolor: + haircolor = haircolor.group(1) + return haircolor.strip() + return '' + + def get_piercings(self, response): + if 'piercings' in self.selector_map: + piercings = self.process_xpath(response, self.get_selector_map('piercings')).get() + if piercings: + piercings = piercings.replace("\n", "").replace("\r", "") + piercings = re.search(r'Piercings:(.*)', piercings) + if piercings: + piercings = piercings.group(1) + return piercings.strip() + return '' + + def get_measurements(self, response): + if 'measurements' in self.selector_map: + measurements = self.process_xpath(response, self.get_selector_map('measurements')).get() + if measurements: + measurements = measurements.replace("\n", "").replace("\r", "") + measurements = re.search('Measurements:(.*)', measurements).group(1) + return measurements.strip() + return '' + + def get_cupsize(self, response): + if 'measurements' in self.selector_map: + measurements = self.process_xpath(response, self.get_selector_map('measurements')).get() + if measurements: + measurements = measurements.replace("\n", "").replace("\r", "") + measurements = re.search('Measurements:(.*)', measurements).group(1) + if measurements: + measurements = measurements.strip() + if re.search(r'(\d+\w+)-', measurements): + cupsize = re.search(r'(\d+\w+)-', measurements).group(1) + return cupsize.strip().upper() + return '' + + def get_astrology(self, response): + if 'astrology' in self.selector_map: + astrology = self.process_xpath(response, self.get_selector_map('astrology')).get() + if astrology: + astrology = astrology.replace("\n", "").replace("\r", "") + astrology = re.search('Sign:(.*)', astrology).group(1) + return astrology.strip() + return '' diff --git a/performers/sitePPPTVPerformer.py b/performers/sitePPPTVPerformer.py new file mode 100644 index 00000000..80003a20 --- /dev/null +++ b/performers/sitePPPTVPerformer.py @@ -0,0 +1,56 @@ +from tpdb.BasePerformerScraper import BasePerformerScraper +from tpdb.items import PerformerItem + + +class SitePPPTVPerformerSpider(BasePerformerScraper): + selector_map = { + 'pagination': 
'/en/actress?sort=a.position&direction=asc&page=%s', + 'external_id': r'model/(.*)/' + } + + name = 'PPPTVPerformer' + + start_urls = [ + 'https://p-p-p.tv', + ] + + def get_performers(self, response): + performers = response.xpath('//div[contains(@class, "model-card") and contains(@class, "mt-4")]') + for performer in performers: + item = PerformerItem() + + perf_name = performer.xpath('.//div[contains(@class, "model-card-title")]/text()').get() + item['name'] = self.cleanup_title(perf_name.strip()) if perf_name else '' + if "/" in item['name']: + names = item['name'].split("/") + item['name'] = self.cleanup_title(names[-1]) + image = performer.xpath('.//a/img/@src') + if image: + item['image'] = self.format_link(response, image.get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = "" + item['image_blob'] = "" + item['bio'] = '' + if "Ts " in item['name']: + item['gender'] = 'Trans Female' + else: + item['gender'] = 'Female' + item['astrology'] = '' + item['birthday'] = '' + item['birthplace'] = '' + item['cupsize'] = '' + item['ethnicity'] = '' + item['eyecolor'] = '' + item['fakeboobs'] = '' + item['haircolor'] = '' + item['height'] = '' + item['measurements'] = '' + item['nationality'] = '' + item['piercings'] = '' + item['tattoos'] = '' + item['weight'] = '' + item['network'] = 'P-P-P TV' + item['url'] = self.format_link(response, performer.xpath('.//a[contains(@class, "profile-link")]/@href').get()) + + yield item diff --git a/performers/sitePervectPerformer.py b/performers/sitePervectPerformer.py new file mode 100644 index 00000000..3eae31c0 --- /dev/null +++ b/performers/sitePervectPerformer.py @@ -0,0 +1,56 @@ +import scrapy + +from tpdb.BasePerformerScraper import BasePerformerScraper +from tpdb.items import PerformerItem + + +class SitePervectPerformerSpider(BasePerformerScraper): + selector_map = { + 'pagination': "", + 'external_id': r'model/(.*)/' + } + + name = 'PervectPerformer' + network = 'Pervect' + + start_urls = [ + 'https://pervect.com/models/', + ] + + def start_requests(self): + meta = {} + meta['page'] = self.page + + url = "https://pervect.com/models/" + yield scrapy.Request(url, callback=self.get_performers, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_performers(self, response): + performers = response.xpath('//a[contains(@class, "model-item")]') + for performer in performers: + item = PerformerItem() + + item['name'] = self.cleanup_title(performer.xpath('.//div[contains(@class, "model-name")]/text()').get()).replace("-", "") + item['image'] = performer.xpath('.//img/@src').get() + item['image_blob'] = self.get_image_blob_from_link(item['image']) + item['bio'] = "" + item['gender'] = "Female" + item['astrology'] = "" + item['birthday'] = "" + item['birthplace'] = "" + item['cupsize'] = "" + item['ethnicity'] = "" + item['eyecolor'] = "" + item['fakeboobs'] = "" + item['haircolor'] = "" + item['height'] = "" + item['measurements'] = "" + item['nationality'] = "" + item['piercings'] = "" + item['tattoos'] = "" + item['weight'] = "" + item['network'] = "Pervect" + item['url'] = performer.xpath('./@href').get() + + yield item diff --git a/performers/sitePutaLocuraPerformer.py b/performers/sitePutaLocuraPerformer.py new file mode 100644 index 00000000..2d8027c2 --- /dev/null +++ b/performers/sitePutaLocuraPerformer.py @@ -0,0 +1,59 @@ +import scrapy +from tpdb.BasePerformerScraper import BasePerformerScraper +from tpdb.items import PerformerItem +from string 
import ascii_lowercase + + +class SitePutaLocuraPerformerSpider(BasePerformerScraper): + selector_map = { + 'pagination': '/en/porn-actresses/%s', + 'external_id': r'model/(.*)/' + } + + name = 'SitePutaLocuraPerformer' + + start_urls = [ + 'https://www.putalocura.com', + ] + + def start_requests(self): + meta = {} + meta['page'] = self.page + + for char in ascii_lowercase: + url = f"https://www.putalocura.com/en/porn-actresses/{char}" + yield scrapy.Request(url, callback=self.get_performers, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_performers(self, response): + performers = response.xpath('//a[contains(@class,"c-boxlist__box")]') + for performer in performers: + item = PerformerItem() + + item['name'] = self.cleanup_title(performer.xpath('.//h2/text()').get()) + image = performer.xpath('.//img/@src') + if image: + item['image'] = self.format_link(response, image.get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = "" + item['image_blob'] = "" + item['bio'] = '' + item['gender'] = 'Female' + item['astrology'] = '' + item['birthday'] = '' + item['birthplace'] = '' + item['cupsize'] = '' + item['ethnicity'] = '' + item['eyecolor'] = '' + item['fakeboobs'] = '' + item['haircolor'] = '' + item['height'] = '' + item['measurements'] = '' + item['nationality'] = '' + item['piercings'] = '' + item['tattoos'] = '' + item['weight'] = '' + item['network'] = 'Puta Locura' + item['url'] = self.format_link(response, performer.xpath('./@href').get()) + + yield item diff --git a/performers/siteThatFetishGirlPerformer.py b/performers/siteThatFetishGirlPerformer.py new file mode 100644 index 00000000..f14d2419 --- /dev/null +++ b/performers/siteThatFetishGirlPerformer.py @@ -0,0 +1,49 @@ +from tpdb.BasePerformerScraper import BasePerformerScraper +from tpdb.items import PerformerItem + + +class SiteThatFetishGirlPerformerSpider(BasePerformerScraper): + selector_map = { + 'pagination': '/models/models_%s_d.html?g=f', + 'external_id': r'model/(.*)/' + } + + name = 'ThatFetishGirlPerformer' + + start_urls = [ + 'https://thatfetishgirl.com', + ] + + def get_performers(self, response): + performers = response.xpath('//div[contains(@class,"updateItem ")]') + for performer in performers: + item = PerformerItem() + + item['name'] = self.cleanup_title(performer.xpath('./p/a/text()').get()) + image = performer.xpath('.//img/@src0_3x|.//img/@src0_2x|.//img/@src0_1x') + if image: + item['image'] = self.format_link(response, image.get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = "" + item['image_blob'] = "" + item['bio'] = '' + item['gender'] = 'Female' + item['astrology'] = '' + item['birthday'] = '' + item['birthplace'] = '' + item['cupsize'] = '' + item['ethnicity'] = '' + item['eyecolor'] = '' + item['fakeboobs'] = '' + item['haircolor'] = '' + item['height'] = '' + item['measurements'] = '' + item['nationality'] = '' + item['piercings'] = '' + item['tattoos'] = '' + item['weight'] = '' + item['network'] = 'That Fetish Girl' + item['url'] = self.format_link(response, performer.xpath('./div/a/@href').get()) + + yield item diff --git a/performers/siteTranzVRPerformer.py b/performers/siteTranzVRPerformer.py new file mode 100644 index 00000000..7d85280e --- /dev/null +++ b/performers/siteTranzVRPerformer.py @@ -0,0 +1,76 @@ +import json +import scrapy +from tpdb.items import PerformerItem +from tpdb.BasePerformerScraper import BasePerformerScraper + + +class 
SiteTranzVRPerformerSpider(BasePerformerScraper): + selector_map = { + 'pagination': '/models?o=d&p=%s', + 'external_id': r'model/(.*)/' + } + + name = 'TranzVRPerformer' + network = 'TranzVR' + + start_urls = [ + 'https://www.tranzvr.com', + ] + + def get_performers(self, response): + performers = response.xpath('//li[contains(@class,"cards-list__item")]/div[1]/a/@href').getall() + for performer in performers: + yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers) + + def parse_performer(self, response): + # performer details live in the page's ld+json structured-data block + performer = response.xpath('//script[contains(@type, "ld+json")]/text()').get() + performer = json.loads(performer) + item = PerformerItem() + + item['name'] = performer['name'] + + image = performer['image'] + if image: + item['image'] = self.format_link(response, image) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = "" + item['image_blob'] = "" + + item['bio'] = '' + if "gender" in performer and "trans" in performer['gender'].lower(): + item['gender'] = 'Trans Female' + else: + item['gender'] = 'Female' + + if "birthDate" in performer and performer['birthDate']: + item['birthday'] = performer['birthDate'] + else: + item['birthday'] = '' + + if "birthPlace" in performer and performer['birthPlace']: + item['birthplace'] = performer['birthPlace'] + else: + item['birthplace'] = '' + + item['astrology'] = '' + item['cupsize'] = '' + item['ethnicity'] = '' + item['eyecolor'] = '' + item['fakeboobs'] = '' + item['haircolor'] = '' + + if "height" in performer and performer['height']: + item['height'] = performer['height'].replace(" ", "") + else: + item['height'] = '' + + item['measurements'] = '' + item['nationality'] = '' + item['piercings'] = '' + item['tattoos'] = '' + item['weight'] = '' + item['network'] = 'TranzVR' + item['url'] = response.url + + yield item diff --git a/performers/siteVurigVlaanderenPerformer.py b/performers/siteVurigVlaanderenPerformer.py new file mode 100644 index 00000000..b2e71cf7 --- /dev/null +++ b/performers/siteVurigVlaanderenPerformer.py @@ -0,0 +1,101 @@ +import string +import scrapy +from deep_translator import GoogleTranslator +from tpdb.BasePerformerScraper import BasePerformerScraper +from tpdb.items import PerformerItem + + +class SiteVurigVlaanderenPerformerSpider(BasePerformerScraper): + name = 'VurigVlaanderenPerformer' + start_url = 'https://vurigvlaanderen.be' + + selector_map = { + 'external_id': r'', + 'pagination': '/_next/data//models.json?page=%s', + } + + base_url = 'https://vurigvlaanderen.be' + + cookies = {"name": "agecookies", "value": "true"} + + headers_json = { + 'origin': 'https://vurigvlaanderen.be', + 'referer': 'https://vurigvlaanderen.be/', + 'Credentials': 'Syserauth 3-585d92b35321e910bc1c25b734531c9adf52e2679c0d42aefad09e2556cde47f-65be7945', + } + + def get_next_page_url(self, base, page): + url = 'https://api.sysero.nl/models?page={}&count=16&include=images:types(square):limit(1|0),products,categories&filter[status]=published&sort[published_at]=DESC&video_images=thumb&frontend=3' + return self.format_url(base, url.format(page)) + + def start_requests(self): + meta = {} + meta['page'] = self.page + + link = "https://vurigvlaanderen.be/modellen" + yield scrapy.Request(link, callback=self.start_requests_2, meta=meta, cookies=self.cookies) + + def start_requests_2(self, response): + meta = response.meta + link = self.get_next_page_url(self.base_url, meta['page']) + yield scrapy.Request(link, callback=self.parse, meta=meta, headers=self.headers_json) + + def parse(self, response, **kwargs): + performers = self.get_performers(response) + count = 0 + for performer in performers: + count += 1 + yield performer + + if count: + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta = response.meta + meta['page'] = meta['page'] + 1 + print('NEXT PAGE: ' + str(meta['page'])) + yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta, headers=self.headers_json) + + def get_performers(self, response): + jsondata = response.json() + jsondata = jsondata['data'] + for performer in jsondata: + performer = performer['attributes'] + item = PerformerItem() + + item['name'] = performer['title'] + item['image'] = f"https://cdndo.sysero.nl{performer['images']['square'][0]['path']}" + item['image_blob'] = self.get_image_blob_from_link(item['image']) + item['bio'] = '' + item['gender'] = "Female" + item['astrology'] = '' + if 'birth_date' in performer and (performer['birth_date'] and performer['birth_date'] < "2006-01-01"): + item['birthday'] = performer['birth_date'] + else: + item['birthday'] = '' + + # guard each optional Dutch field before handing it to the translator + if "county" in performer and performer['county']: + item['birthplace'] = string.capwords(GoogleTranslator(source='nl', target='en').translate(performer['county'])) + else: + item['birthplace'] = '' + item['measurements'] = '' + item['cupsize'] = '' + + item['ethnicity'] = '' + + if "hair_color" in performer and performer['hair_color']: + item['haircolor'] = string.capwords(GoogleTranslator(source='nl', target='en').translate(performer['hair_color'])) + else: + item['haircolor'] = '' + if "eye_color" in performer and performer['eye_color']: + item['eyecolor'] = string.capwords(GoogleTranslator(source='nl', target='en').translate(performer['eye_color'])) + else: + item['eyecolor'] = '' + + item['fakeboobs'] = '' + if "length" in performer and performer['length']: + item['height'] = performer['length'] + "cm" + else: + item['height'] = '' + + if "weight" in performer and performer['weight']: + item['weight'] = performer['weight'] + "kg" + else: + item['weight'] = '' + + if "country" in performer and performer['country']: + item['nationality'] = string.capwords(performer['country']) + else: + item['nationality'] = '' + item['piercings'] = '' + item['tattoos'] = '' + item['network'] = 'Vurig Vlaanderen' + item['url'] = f"https://vurigvlaanderen.be/modellen/{performer['slug']}" + + yield item diff --git a/performers/siteXX-CelPerformer.py b/performers/siteXX-CelPerformer.py new file mode 100644 index 00000000..02815046 --- /dev/null +++ b/performers/siteXX-CelPerformer.py @@ -0,0 +1,46 @@ +import re +import scrapy + +from tpdb.BasePerformerScraper import BasePerformerScraper + + +class SiteXXCelPerformerSpider(BasePerformerScraper): + selector_map = { + 'name': '//div[contains(@class, "model-details")]/h2/text()', + 'image': '//div[contains(@class, "model-details")]/preceding-sibling::div[1]/img/@src', + 'image_blob': True, + 'bio': '', + 'gender': '', + 'astrology': '', + 'birthday': '', + 'birthplace': '//div[contains(@class, "model-details")]//strong[contains(text(), "rom")]/following-sibling::text()[1]', + 'cupsize': '//div[contains(@class, "model-details")]//strong[contains(text(), "reasts")]/following-sibling::text()[1]', + 'ethnicity': '', + 'eyecolor': '', + 'fakeboobs': '', + 'haircolor': '', + 'height': '', + 'measurements': '', + 'nationality': '', + 'piercings': '', + 'tattoos': '', + 'weight': '', + + 'pagination': '/models/page-%s/?type=&sort=recent&', + 'external_id': r'model/(.*)/' + } + + name = 'XX-CelPerformer' + network = 'XX-Cel' + + start_urls = [ + 'https://xx-cel.com', + ] + + def get_gender(self, response): + return 'Female' + + def get_performers(self, response): + performers = response.xpath('//div[contains(@class, "model-cover")]/a[1]/@href').getall() + for 
performer in performers: + yield scrapy.Request(url=self.format_link(response, performer), callback=self.parse_performer, cookies=self.cookies, headers=self.headers) diff --git a/scenes/JMPlaywrightJSON.py b/scenes/JMPlaywrightJSON.py index c718540d..60564ee9 100644 --- a/scenes/JMPlaywrightJSON.py +++ b/scenes/JMPlaywrightJSON.py @@ -2,7 +2,7 @@ import json import scrapy from scrapy.utils.project import get_project_settings - +import base64 from tpdb.BaseSceneScraper import BaseSceneScraper from tpdb.items import SceneItem @@ -10,8 +10,11 @@ class JMPlaywrightJSONSpider(BaseSceneScraper): name = 'JMPlaywrightJSON' - start_urls = [ - 'https://www.jacquieetmicheltv.net', + start_url = 'https://www.jacquieetmicheltv.net' + + paginations = [ + '/en/content/list?studio=6352b65e8b4552ba57ee0e7d&page=%s', + '/en/content/list?studio=63626ce889913bab98631473&page=%s', ] # ~ cookies = { @@ -61,39 +64,33 @@ class JMPlaywrightJSONSpider(BaseSceneScraper): } def start_requests(self): - settings = get_project_settings() - - if not hasattr(self, 'start_urls'): - raise AttributeError('start_urls missing') - - if not self.start_urls: - raise AttributeError('start_urls selector missing') - meta = {} # ~ meta['playwright'] = True meta['page'] = self.page - if 'USE_PROXY' in self.settings.attributes.keys(): - use_proxy = self.settings.get('USE_PROXY') - elif 'USE_PROXY' in settings.attributes.keys(): - use_proxy = settings.get('USE_PROXY') - else: - use_proxy = None - if use_proxy: - print(f"Using Settings Defined Proxy: True ({settings.get('PROXY_ADDRESS')})") - else: - try: - if self.proxy_address: - meta['proxy'] = self.proxy_address - print(f"Using Scraper Defined Proxy: True ({meta['proxy']})") - except Exception: - print("Using Proxy: False") + for pagination in self.paginations: + meta['pagination'] = pagination + yield scrapy.Request(url=self.get_next_page_url(self.start_url, self.page, meta['pagination']), callback=self.parse, meta=meta, headers={"Accept": "application/json"}, cookies=self.cookies) + + def parse(self, response, **kwargs): + scenes = self.get_scenes(response) + count = 0 + for scene in scenes: + count += 1 + yield scene + + if count: + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta = response.meta + meta['page'] = meta['page'] + 1 + print('NEXT PAGE: ' + str(meta['page'])) + yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - for link in self.start_urls: - yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers={"Accept": "application/json"}, cookies=self.cookies) + def get_next_page_url(self, base, page, pagination): + return self.format_url(base, pagination % page) def get_scenes(self, response): - print(response.text) + # ~ print(response.text) jsondata = json.loads(response.text) taglist = jsondata['facets']['tags'] scenelist = jsondata['contents'] @@ -103,23 +100,31 @@ def get_scenes(self, response): # ~ item['description'] = scene['description'] # ~ item['description'] = re.sub('<[^<]+?>', '', item['description']).replace("\n", " ").replace("\r", " ").replace("\t", " ").replace(" ", " ").strip() # ~ item['duration'] = str(int(scene['duration']) * 60) + if "mixpanel" in scene and scene['mixpanel']: + mixpanel = json.loads(base64.b64decode(scene['mixpanel'])) + if mixpanel['contentDuration']: + item['duration'] = str(mixpanel['contentDuration']) item['description'] = "" 
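+ # the API nests the scene date under publication_date['iso']; 'srcSet' lists several image URLs with width descriptors, so the regex below keeps only the first URL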
item['date'] = scene['publication_date']['iso'] item['image'] = scene['poster']['thumbnail']['srcSet'] item['image'] = re.search(r'^(http.*?)\s', item['image']).group(1) item['image_blob'] = self.get_image_blob_from_link(item['image']) item['type'] = 'Scene' - item['id'] = scene['id'] item['url'] = self.format_link(response, scene['routes']['content']).replace("www.", "") - item['site'] = "Jacquie et Michel" + item['id'] = re.search(r'content/(.*?)/', item['url']).group(1) + if "6352b65e8b4552ba57ee0e7d" in response.url: + item['site'] = "Jacquie et Michel" + if "63626ce889913bab98631473" in response.url: + item['site'] = "Dompteuse" item['parent'] = "Jacquie et Michel" item['network'] = "Jacquie et Michel" item['tags'] = [] - for tag in scene['tags']: - for tagref in taglist: - if tag == tagref['id']: - item['tags'].append(tagref['name']) - break + if "tags" in scene: + for tag in scene['tags']: + for tagref in taglist: + if tag == tagref['id']: + item['tags'].append(tagref['name']) + break item['performers'] = [] item['trailer'] = '' diff --git a/scenes/MovieDVDErotikByStudio.py b/scenes/MovieDVDErotikByStudio.py deleted file mode 100644 index 1adffb77..00000000 --- a/scenes/MovieDVDErotikByStudio.py +++ /dev/null @@ -1,99 +0,0 @@ -import re -import json -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper - - -class MoviesErotikSpider(BaseSceneScraper): - name = 'MoviesDVDErotikByStudio' - network = 'Erotik' - - start_urls = [ - 'https://api.dvderotik.com', - ] - - selector_map = { - 'description': '//div[@class="details-teaser"]/div/p/text()', - 'date': '', - 'performers': '//div[@class="star-grid-element"]//div[@class="star-image"]/a/div/div/div/span/text()', - 'tags': '//section[contains(@class, "details-movie")]//div[@class="inner-wrapper"]//div[@class="details-element"]/h4[contains(text(), "Categories")]/following-sibling::a/text()', - 'external_id': r'', - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=99&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: GGG - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=157&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Deutscheland Porno - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=441&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Mannermagnet - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=111&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: 666 - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=784&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Aische Perverse - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=9&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: AlexD - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=440&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Amateur Check In - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=1498&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Anny Aurora - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=1679&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Anstoss - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=1487&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Blue Movie - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=724&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Color Climax - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=1704&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Create-X - # ~ 'pagination': 
'/dvd/search/movies?filter[studio][]=1416&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Danger Women - # ~ 'pagination': '/dvd/search/movies?filter[studio][]=605&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: Gang Bang Amateure - 'pagination': '/dvd/search/movies?filter[studio][]=87&itemsPerPage=48&page=%s&source=moviesoverview', # Studio: MMV - 'type': 'Movie', - } - - custom_scraper_settings = { - 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57', - 'AUTOTHROTTLE_ENABLED': True, - 'AUTOTHROTTLE_START_DELAY': 1, - 'AUTOTHROTTLE_MAX_DELAY': 10, - 'CONCURRENT_REQUESTS': 4, - 'RANDOMIZE_DOWNLOAD_DELAY': True, - 'CONCURRENT_REQUESTS_PER_DOMAIN': 4, - 'CONCURRENT_REQUESTS_PER_IP': 4, - } - - def get_scenes(self, response): - meta = response.meta - jsondata = json.loads(response.text) - for movie in jsondata['movies']: - meta['id'] = movie['itemNumber'] - meta['image'] = movie['src']['default'] - # ~ meta['duration'] = movie['durationSeconds'] - if "studio" in movie and movie['studio']: - if "name" in movie['studio'] and movie['studio']['name']: - meta['site'] = movie['studio']['name'] - meta['studio'] = meta['site'] - if movie['name']['en']: - meta['title'] = movie['name']['en'] - else: - meta['title'] = movie['name']['de'] - if movie['directors']: - if "name" in movie['directors'][0] and movie['directors'][0]['name']: - meta['directors'] = movie['directors'][0]['name'] - if len(movie['href']['en'].strip()) > 1: - meta['url'] = "https://en.erotik.com/" + movie['href']['en'] - yield scrapy.Request(meta['url'], callback=self.parse_scene, meta=meta) - # ~ print(meta) - - def get_date(self, response): - prod_year = response.xpath('//h4[contains(text(),"Release")]/following-sibling::p/text()') - if prod_year: - prod_year = prod_year.get().strip() - prod_year = re.sub(r'[^0-9]', '', prod_year) - if not prod_year: - prod_year = response.xpath('//h4[contains(text(),"Production")]/following-sibling::p/text()') - if prod_year: - prod_year = prod_year.get().strip() - prod_year = re.sub(r'[^0-9]', '', prod_year) - - if prod_year: - prod_date = f"{prod_year}-01-01" - return prod_date - return None - - def get_tags(self, response): - tags = super().get_tags(response) - if tags: - tags2 = tags.copy() - for tag in tags2: - matches = ['hour', 'uhd', 'avn', 'award', 'best', 'recommended', 'feature'] - if any(x in tag.lower() for x in matches): - tags.remove(tag) - tags = list(map(lambda x: x.strip().title(), tags)) - - return tags diff --git a/scenes/MovieErotik.py b/scenes/MovieErotik.py deleted file mode 100644 index 6321f77f..00000000 --- a/scenes/MovieErotik.py +++ /dev/null @@ -1,85 +0,0 @@ -import re -import json -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper - - -class MoviesErotikSpider(BaseSceneScraper): - name = 'MoviesErotik' - network = 'Erotik' - - start_urls = [ - 'https://api.erotik.com', - ] - - selector_map = { - 'description': '//div[@class="details-teaser"]/div/p/text()', - 'date': '', - 'performers': '//div[@class="star-grid-element"]//div[@class="star-image"]/a/div/div/div/span/text()|//figcaption[contains(@class,"swinger-avatar__caption")]/a/text()', - 'tags': '//section[contains(@class, "details-movie")]//div[@class="inner-wrapper"]//div[@class="details-element"]/h4[contains(text(), "Categories")]/following-sibling::a/text()|//div[@class="swinger-hero__content"]/div[@class="swinger-tags"]//a[@class="swinger-tags__item"]/text()', - 'external_id': r'', - 
'pagination': '/vod/search/movies?filter[languages][]=de?filter[languages][]=pt&filter[languages][]=fr&filter[languages][]=de&filter[languages][]=it&filter[languages][]=cz&filter[languages][]=es&itemsPerPage=96&page=%s&source=moviesoverview', - 'type': 'Movie', - } - - custom_scraper_settings = { - 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57', - 'AUTOTHROTTLE_ENABLED': True, - 'AUTOTHROTTLE_START_DELAY': 1, - 'AUTOTHROTTLE_MAX_DELAY': 10, - 'CONCURRENT_REQUESTS': 4, - 'RANDOMIZE_DOWNLOAD_DELAY': True, - 'CONCURRENT_REQUESTS_PER_DOMAIN': 4, - 'CONCURRENT_REQUESTS_PER_IP': 4, - } - - def get_scenes(self, response): - meta = response.meta - jsondata = json.loads(response.text) - for movie in jsondata['movies']: - meta['id'] = movie['itemNumber'] - meta['image'] = movie['src']['default'] - meta['duration'] = movie['durationSeconds'] - if "studio" in movie and movie['studio']: - if "name" in movie['studio'] and movie['studio']['name']: - meta['site'] = movie['studio']['name'] - meta['studio'] = meta['site'] - if movie['name']['en']: - meta['title'] = movie['name']['en'] - else: - meta['title'] = movie['name']['de'] - if movie['directors']: - if "name" in movie['directors'][0] and movie['directors'][0]['name']: - meta['directors'] = movie['directors'][0]['name'] - if len(movie['href']['en'].strip()) > 1: - meta['url'] = "https://en.erotik.com/" + movie['href']['en'] - yield scrapy.Request(meta['url'], callback=self.parse_scene, meta=meta) - # ~ print(meta) - - def get_date(self, response): - prod_year = response.xpath('//h4[contains(text(),"Release")]/following-sibling::p/text()') - if prod_year: - prod_year = prod_year.get().strip() - prod_year = re.sub(r'[^0-9]', '', prod_year) - if not prod_year: - prod_year = response.xpath('//h4[contains(text(),"Production")]/following-sibling::p/text()') - if prod_year: - prod_year = prod_year.get().strip() - prod_year = re.sub(r'[^0-9]', '', prod_year) - - if prod_year: - prod_date = f"{prod_year}-01-01" - return prod_date - return None - - def get_tags(self, response): - tags = super().get_tags(response) - if tags: - tags2 = tags.copy() - for tag in tags2: - matches = ['hour', 'uhd', 'avn', 'award', 'best', 'recommended', 'feature'] - if any(x in tag.lower() for x in matches): - tags.remove(tag) - tags = list(map(lambda x: x.strip().title(), tags)) - - return tags diff --git a/scenes/MovieSexVideoAllByStudio.py b/scenes/MovieSexVideoAllByStudio.py deleted file mode 100644 index b7de6e19..00000000 --- a/scenes/MovieSexVideoAllByStudio.py +++ /dev/null @@ -1,184 +0,0 @@ -import re -import json -import scrapy -from slugify import slugify -import unidecode -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class MovieSexVideoAllByStudioSpider(BaseSceneScraper): - name = 'MovieSexVideoAllByStudio' - - start_urls = [ - 'https://www.sexvideoall.com', - ] - - selector_map = { - 'external_id': r'', - 'pagination': '', # Eromaxx - 'type': 'Movie', - } - - headers = {'Content-Type': 'application/json'} - - studios = [ - {"id": 609, "name": "SG Video"}, # Scat Videos - {"id": 502, "name": "Eromaxx"}, - {"id": 865, "name": "Czech"}, - {"id": 424, "name": "Purzel"}, - {"id": 148, "name": "MMV"}, - {"id": 145, "name": "Goldlight"}, - {"id": 323, "name": "Z-Faktor"}, - {"id": 185, "name": "Tabu"}, - {"id": 174, "name": "DBM"}, - {"id": 181, "name": "Inflagranti"}, - {"id": 221, "name": "Muschi Movie"}, - {"id": 330, "name": "Ribu 
Film"}, - {"id": 452, "name": "Telsev"}, - {"id": 187, "name": "Videorama"}, - {"id": 735, "name": "Belrose"}, - {"id": 307, "name": "EVS"}, - {"id": 541, "name": "Foxy Media"}, - {"id": 303, "name": "Herzog Video"}, - {"id": 182, "name": "MJP"}, - {"id": 667, "name": "Pervision"}, - {"id": 147, "name": "Puaka"}, - {"id": 295, "name": "Erotic Planet"}, - {"id": 2050, "name": "Movie Star"}, - {"id": 3317, "name": "Mia Bella"}, - {"id": 780, "name": "MVW.xxx"}, - {"id": 146, "name": "Magma Film"}, - {"id": 291, "name": "GMV"}, - {"id": 278, "name": "Fun Movies"}, - {"id": 736, "name": "Uschi Haller"}, - {"id": 920, "name": "Fantas P"}, - {"id": 3284, "name": "Love Arts"}, - {"id": 2060, "name": "Jodete Porn"}, - {"id": 2076, "name": "GB Media"}, - {"id": 401, "name": "Oftly Goldwin"}, - {"id": 479, "name": "Metabolic"}, - {"id": 454, "name": "Corrupt Media"}, - {"id": 532, "name": "Nebenan"}, - {"id": 294, "name": "Create-X"}, - {"id": 511, "name": "Cruel Media"}, - {"id": 601, "name": "Maximum Grind"}, - {"id": 522, "name": "Euro Extrem"}, - {"id": 243, "name": "Shots Video"}, - {"id": 563, "name": "Fick Tiv"}, - {"id": 179, "name": "Horny Heaven"}, - {"id": 128, "name": "Erotic Entertainment"}, - {"id": 319, "name": "Pleasure Verlag"}, - {"id": 503, "name": "21 Sextury Video"}, - {"id": 944, "name": "Teen X"}, - {"id": 483, "name": "Ultra Deca"}, - ] - - def start_requests(self): - meta = {} - meta['page'] = self.page - link = "https://www.sexvideoall.com/main" - yield scrapy.Request(link, callback=self.start_requests_2, meta=meta, headers=self.headers, cookies=self.cookies) - - def start_requests_2(self, response): - meta = response.meta - for studio in self.studios: - meta['studio'] = studio['name'] - meta['studio_id'] = studio['id'] - link = f"https://www.sexvideoall.com/api/item/getStudioCat/{meta['studio_id']}" - yield scrapy.Request(link, callback=self.start_requests_3, meta=meta, headers=self.headers, cookies=self.cookies) - - def start_requests_3(self, response): - meta = response.meta - meta['categories'] = json.loads(response.text) - meta['categories'] = meta['categories']['studiocate'] - meta['pagination'] = self.get_selector_map('pagination') - meta['payload'] = {"cat": 0, "studio": meta['studio_id'], "subcat": 0, "email": "", "lan": "en", "ip": "66.85.229.220", "page": int(meta['page']), "size": 20, "sort": 4} - meta['link'] = 'https://www.sexvideoall.com/api/item/StudioCategory' - yield scrapy.Request(meta['link'], callback=self.parse, method="POST", body=json.dumps(meta['payload']), meta=meta, headers=self.headers) - - def parse(self, response): - meta = response.meta - movies = self.get_movies(response) - count = 0 - for movie in movies: - count += 1 - yield movie - - page_json = json.loads(response.text) - total_pages = page_json['totalPages'] - if 'page' in response.meta and response.meta['page'] < self.limit_pages and response.meta['page'] < total_pages: - meta['page'] = meta['page'] + 1 - meta['payload']['page'] = meta['page'] - print(f"NEXT PAGE: {str(meta['page'])} of {total_pages} for Studio: {meta['studio']}: The last page had: {count} items") - yield scrapy.Request(meta['link'], callback=self.parse, method="POST", body=json.dumps(meta['payload']), meta=meta, headers=self.headers, cookies=self.cookies) - - def get_movies(self, response): - meta = response.meta - moviejson = json.loads(response.text) - moviejson = moviejson['results']['cresults'] - for movie in moviejson: - link = f"https://www.sexvideoall.com/api/item/Getitem/{movie['id']}/x" - yield 
scrapy.Request(link, callback=self.parse_movie, method="POST", meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_movie(self, response): - meta = response.meta - moviedata = json.loads(response.text) - movie = moviedata['cresults'] - item = SceneItem() - - item['title'] = self.cleanup_title(unidecode.unidecode(movie['nameE'])).replace("&", "and") - if movie['releaseDate']: - item['date'] = self.parse_date(movie['releaseDate'], date_formats=['%m/%d/%Y']).strftime('%Y-%m-%d') - else: - item['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', movie['datum']).group(1) - if "length" in movie and movie['length']: - item['duration'] = str(int(movie['length']) * 60) - else: - item['duration'] = None - item['description'] = movie['beschreibungE'] - if "No Regional Coding" in item['description']: - item['description'] = '' - item['id'] = movie['id'] - item['url'] = f"https://www.sexvideoall.com/item/{item['id']}/en/{slugify(item['title'])}" - item['performers'] = [] - for performer in moviedata['pornstars']: - item['performers'].append(performer['name']) - item['site'] = meta['studio'] - item['parent'] = meta['studio'] - item['network'] = meta['studio'] - item['image'] = f"https://static.sexvideoall.com/SamplePhoto/{item['id']}.jpg" - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['type'] = "Movie" - - item['trailer'] = None - item['tags'] = [] - for kat in movie['kat'].split(','): - for genre in meta['categories']: - if int(kat.strip()) == genre['id']: - if "European " in genre['kat']: - genre['kat'] = genre['kat'].replace("European ", "") - item['tags'].append("European") - if "3p" in genre['kat']: - genre['kat'] = "Threesome" - if "Cumshots" in genre['kat']: - genre['kat'] = "Cumshots" - if "Out Door" in genre['kat']: - genre['kat'] = "Outdoors" - if "Teens/College Girls" in genre['kat']: - genre['kat'] = "18+ Teens" - if "Nylon/Stocking" in genre['kat']: - genre['kat'] = "Stockings" - if "Extreme/Rough Sex" in genre['kat']: - genre['kat'] = "Rough Sex" - if "Women with Glasses" in genre['kat']: - genre['kat'] = "Glasses" - if "P.O.V." 
in genre['kat']: - genre['kat'] = "POV" - if "Cream Pie" in genre['kat']: - genre['kat'] = "Creampie" - if "WC" in genre['kat']: - genre['kat'] = "Bathroom" - item['tags'].append(genre['kat']) - - yield self.check_item(item, self.days) diff --git a/scenes/aggregatorPornbox.py b/scenes/aggregatorPornbox.py index 82e56247..8269bda3 100644 --- a/scenes/aggregatorPornbox.py +++ b/scenes/aggregatorPornbox.py @@ -19,7 +19,7 @@ class PornboxSingleSiteSpider(BaseSceneScraper): ] studios = [ - {'studio': 144, 'site': 'Culioneros'} + {'studio': 5899, 'site': 'Oksana Katysheva'} ] content_json_url = 'https://pornbox.com/contents/%s' @@ -83,7 +83,10 @@ def parse_scene(self, response): item = SceneItem() item['title'] = jsondata['scene_name'] - item['description'] = self.cleanup_description(jsondata['small_description']) + if "small_description" in jsondata and jsondata['small_description']: + item['description'] = self.cleanup_description(jsondata['small_description']) + else: + item['description'] = "" item['site'] = response.meta['site'] item['date'] = self.parse_date(jsondata['publish_date']).isoformat() item['image'] = jsondata['player_poster'] diff --git a/scenes/javJAVCT.py b/scenes/javJAVCT.py new file mode 100644 index 00000000..ad3c0a09 --- /dev/null +++ b/scenes/javJAVCT.py @@ -0,0 +1,325 @@ +import re +import string +import requests +import scrapy +import json +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class JavJAVCTSpider(BaseSceneScraper): + name = 'JAVCT' + + start_urls = [ + 'https://javct.net', + ] + + selector_map = { + 'external_id': r'', + 'pagination': '/new-releases/pg-%s', + 'type': 'Scene', + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + + if self.limit_pages == 1: + self.limit_pages = 10 + + if self.days == 20: + self.days = 99999 + + for link in self.start_urls: + yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//h3[contains(@class, "card__title")]/a/@href').getall() + for scene in scenes: + meta['id'] = re.search(r'.*/(.*?)$', scene).group(1) + if re.search(r'(\w+-\w+)-\w+', meta['id']): + meta['id'] = re.search(r'(\w+-\w+)-\w+', meta['id']).group(1) + if meta['id'] and "ppv" not in meta['id'].lower(): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def parse_scene(self, response): + meta = response.meta + item = SceneItem() + item = self.prep_item(item) + + item['id'] = meta['id'].upper() + + r18 = self.get_r18(item['id']) + # ~ if item['id'] == 'HOWS-002': + # ~ print(r18) + if r18: + if r18['title']: + item['title'] = string.capwords(self.uncensor(r18['title'])) + else: + title = response.xpath('//h1[contains(@class,"section__title")]/text()').get() + if "[" in title and "]" in title: + title = re.sub(r'\[.*?\]', '', title) + item['title'] = string.capwords(title) + + if r18['label'] and "name" in r18['label']: + item['site'] = string.capwords(r18['label']['name'].replace(".", "")) + else: + site = response.xpath('//li/span[contains(text(), "Studio:")]/following-sibling::a//text()|//li/span[contains(text(), "Studio:")]/following-sibling::text()').getall() + site = list(map(lambda x: string.capwords(x.strip()), site)) + if site: + site = list(filter(None, site)) + item['site'] = string.capwords(site[0].replace(".", "")) + + if r18['maker'] and "name" in r18['maker']: + 
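+ # prefer the r18.dev 'maker' name as the parent studio; fall back to the "Label:" row scraped from the JAVCT page when it is missing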
item['parent'] = string.capwords(r18['maker']['name'].replace(".", "")) + else: + parent = response.xpath('//li/span[contains(text(), "Label:")]/following-sibling::a//text()|//li/span[contains(text(), "Label:")]/following-sibling::text()').getall() + parent = list(map(lambda x: string.capwords(x.strip()), parent)) + if parent: + parent = list(filter(None, parent)) + item['parent'] = string.capwords(parent[0].replace(".", "")) + if item['parent'] and not item['site']: + item['site'] = item['parent'] + if item['site'] and not item['parent']: + item['parent'] = item['site'] + item['date'] = r18['release_date'] + + r18image = False + if r18['images']['jacket_image']['large'] and "http" in r18['images']['jacket_image']['large']: + r18image = r18['images']['jacket_image']['large'] + elif r18['images']['jacket_image']['large2'] and "http" in r18['images']['jacket_image']['large2']: + r18image = r18['images']['jacket_image']['large2'] + + if r18['actresses']: + for performer in r18['actresses']: + item['performers'].append(string.capwords(performer['name'])) + if not item['performers']: + item['performers'] = response.xpath('//li/span[contains(text(), "Model(s):")]/following-sibling::a//text()').getall() + + if r18['sample']: + if r18['sample']['high']: + item['trailer'] = r18['sample']['high'] + + if r18['categories']: + for tag in r18['categories']: + item['tags'].append(string.capwords(tag['name'])) + + if r18['director']: + item['director'] = r18['director'] + + if r18['runtime_minutes']: + item['duration'] = str(int(r18['runtime_minutes']) * 60) + + item['url'] = f"https://r18.dev/videos/vod/movies/detail/-/id={r18['content_id']}/" + item['network'] = 'R18' + else: + title = response.xpath('//h1[contains(@class,"section__title")]/text()').get() + if "[" in title and "]" in title: + title = re.sub(r'\[.*?\]', '', title) + item['title'] = string.capwords(title) + + director = response.xpath('//li/span[contains(text(), "Director:")]/following-sibling::text()') + if director: + item['director'] = string.capwords(director.get()) + + scenedate = response.xpath('//li/span[contains(text(), "Release Date:")]/following-sibling::text()') + if scenedate: + item['date'] = self.parse_date(scenedate.get(), date_formats=['%b %d, %Y']).strftime('%Y-%m-%d') + + item['performers'] = response.xpath('//li/span[contains(text(), "Model(s):")]/following-sibling::a//text()').getall() + + duration = response.xpath('//li/span[contains(text(), "Running time:")]/following-sibling::text()') + if duration: + duration = re.search(r'(\d+)', duration.get()) + if duration: + item['duration'] = str(int(duration.group(1)) * 60) + + site = response.xpath('//li/span[contains(text(), "Studio:")]/following-sibling::a//text()|//li/span[contains(text(), "Studio:")]/following-sibling::text()').getall() + site = list(map(lambda x: string.capwords(x.strip()), site)) + if site: + site = list(filter(None, site)) + item['site'] = string.capwords(site[0].replace(".", "")) + + parent = response.xpath('//li/span[contains(text(), "Label:")]/following-sibling::a//text()|//li/span[contains(text(), "Label:")]/following-sibling::text()').getall() + parent = list(map(lambda x: string.capwords(x.strip()), parent)) + if parent: + parent = list(filter(None, parent)) + item['parent'] = string.capwords(parent[0].replace(".", "")) + + item['network'] = 'R18' + + item['url'] = f"https://r18.dev/videos/vod/movies/detail/-/id={re.sub('[^a-z0-9]', '', item['id'].lower())}" + + # ### Tasks for both sources + + if not item['parent'] or "---" in item['parent']: 
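+            # "---" appears to be a placeholder javct.net uses for a missing
+            # studio/label field; treat it as empty and borrow the other value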
+            item['parent'] = item['site']
+
+        if not item['site'] or "---" in item['site']:
+            item['site'] = item['parent']
+
+        # Strip the ID from the title, then re-add it in uppercase and without embellishments
+        title = re.search(fr"{item['id']}(?:-\w+)? (.*)", item['title'].upper())
+        if title:
+            title = title.group(1)
+            item['title'] = f"{item['id']}: {string.capwords(title)}"
+        else:
+            item['title'] = f"{item['id']}: {string.capwords(item['title'])}"
+
+        # r18image is only assigned on the r18.dev branch above; default it so
+        # the fallback below cannot raise a NameError when that lookup failed
+        if not r18:
+            r18image = False
+
+        # Get the Front and Back images from site, using R18 image if not available
+        item['image'] = response.xpath('//img[@class="cover"]/@src').get()
+        if not item['image'] and r18image:
+            item['image'] = r18image
+
+        # Get the blobs
+        if item['image']:
+            item['image_blob'] = self.get_image_blob_from_link(item['image'])
+
+        # Add site tags to existing tags pulled from R18 if available
+        # (the second XPath branch previously read the "Studio:" text, which
+        # pulled studio names in as tags; it now reads the plain-text categories)
+        tags = response.xpath('//li/span[contains(text(), "Categories:")]/following-sibling::a//text()|//li/span[contains(text(), "Categories:")]/following-sibling::text()').getall()
+        tags = list(map(lambda x: string.capwords(x.strip()), tags))
+        tags = list(filter(None, tags))
+        if tags:
+            for tag in tags:
+                item['tags'].append(string.capwords(tag))
+
+        # General purpose removal of any additional tokens for image url
+        if item['image']:
+            if "?" in item['image'] and ("token" in item['image'].lower() or "expire" in item['image'].lower()):
+                item['image'] = re.search(r'(.*?)\?', item['image']).group(1)
+
+        item['type'] = 'JAV'
+
+        if "duration" not in item or not item['duration'] or int(item['duration']) > 4200:
+            yield self.check_item(item, self.days)
+
+    def get_r18(self, javid):
+        javid = javid.replace("-", "").lower().strip()
+        link = f"https://r18.dev/videos/vod/movies/detail/-/dvd_id={javid}/json"
+        req = requests.get(link)
+        if req:
+            content = json.loads(req.content)
+        else:
+            content = False
+        return content
+
+    def uncensor(self, title):
+        title = title.replace("A*****t", "Assault")
+        title = title.replace("A****p", "Asleep")
+        title = title.replace("A***e", "Abuse")
+        title = title.replace("B***d", "Blood")
+        title = title.replace("B**d", "Bled")
+        title = title.replace("C***d", "Child")
+        title = title.replace("C*ck", "Cock")
+        title = title.replace("D******e", "Disgrace")
+        title = title.replace("D***king", "Drinking")
+        title = title.replace("D***k", "Drunk")
+        title = title.replace("D**g", "Drug")
+        title = title.replace("F*****g", "Forcing")
+        title = title.replace("F***e", "Force")
+        title = title.replace("G*******g", "Gangbang")
+        title = title.replace("G******g", "Gangbang")
+        title = title.replace("H*********n", "Humiliation")
+        title = title.replace("H*******e", "Hypnotize")
+        title = title.replace("H********d", "Hypnotized")
+        title = title.replace("H*******m", "Hypnotism")
+        title = title.replace("H**t", "Hurt")
+        title = title.replace("I****t", "Incest")
+        title = title.replace("K****p", "Kidnap")
+        title = title.replace("K****r", "Killer")
+        title = title.replace("K**l", "Kill")
+        title = title.replace("K*d", "Kid")
+        title = title.replace("L****a", "Lolita")
+        title = title.replace("M************n", "Mother And Son")
+        title = title.replace("M****t", "Molest")
+        title = title.replace("P********t", "Passed Out")
+        title = title.replace("P****h", "Punish")
+        title = title.replace("R****g", "Raping")
+        title = title.replace("R**e", "Rape")
+        title = title.replace("RStepB****************r", "Stepbrother and Sister")
+        title = title.replace("S*********l", "School Girl")
+        title = title.replace("S**********s", "School Girls")
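+        # longer masks run before shorter ones (e.g. "S**********s" before
+        # "S*********s") so a long mask is never partially consumed by a
+        # shorter pattern
+        title = 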
title.replace("S********l", "Schoolgirl") + title = title.replace("S*********s", "Schoolgirls") + title = title.replace("S******g", "Sleeping") + title = title.replace("S*****t", "Student") + title = title.replace("S***e", "Slave") + title = title.replace("S**t", "Scat") + title = title.replace("Sch**l", "School") + title = title.replace("StepM************n", "Stepmother and Son") + title = title.replace("T******e", "Tentacle") + title = title.replace("T*****e", "Torture") + title = title.replace("U*********s", "Unconscious") + title = title.replace("V*****e", "Violate") + title = title.replace("V*****t", "Violent") + title = title.replace("Y********l", "Young Girl") + title = title.replace("A*****t", "Assault") + title = title.replace("a*****t", "assault") + title = title.replace("a****p", "asleep") + title = title.replace("a***e", "abuse") + title = title.replace("b***d", "blood") + title = title.replace("b**d", "bled") + title = title.replace("c***d", "child") + title = title.replace("c*ck", "cock") + title = title.replace("d******e", "disgrace") + title = title.replace("d***king", "drinking") + title = title.replace("d***k", "drunk") + title = title.replace("d**g", "drug") + title = title.replace("f*****g", "forcing") + title = title.replace("f***e", "force") + title = title.replace("g*******g", "gangbang") + title = title.replace("g******g", "gangbang") + title = title.replace("h*********n", "humiliation") + title = title.replace("h*******e", "hypnotize") + title = title.replace("h********d", "hypnotized") + title = title.replace("h*******m", "hypnotism") + title = title.replace("h**t", "hurt") + title = title.replace("i****t", "incest") + title = title.replace("k****p", "kidnap") + title = title.replace("k****r", "killer") + title = title.replace("k**l", "kill") + title = title.replace("k*d", "kid") + title = title.replace("l****a", "lolita") + title = title.replace("m************n", "mother and son") + title = title.replace("m****t", "molest") + title = title.replace("p********t", "passed out") + title = title.replace("p****h", "punish") + title = title.replace("r****g", "raping") + title = title.replace("r**e", "rape") + title = title.replace("rstepb****************r", "stepbrother and sister") + title = title.replace("s*********l", "school girl") + title = title.replace("s********l", "schoolgirl") + title = title.replace("s**********s", "school girls") + title = title.replace("s*********s", "schoolgirls") + title = title.replace("s******g", "sleeping") + title = title.replace("s*****t", "student") + title = title.replace("s***e", "slave") + title = title.replace("s**t", "scat") + title = title.replace("sch**l", "school") + title = title.replace("stepm************n", "stepmother and son") + title = title.replace("t******e", "tentacle") + title = title.replace("t*****e", "torture") + title = title.replace("u*********s", "unconscious") + title = title.replace("v*****e", "violate") + title = title.replace("v*****t", "violent") + title = title.replace("y********l", "young girl") + + return title + + def prep_item(self, item): + item['title'] = '' + item['date'] = '' + item['description'] = '' + item['image'] = '' + item['image_blob'] = '' + item['tags'] = [] + item['performers'] = [] + item['trailer'] = '' + item['type'] = 'JAV' + item['director'] = '' + item['site'] = '' + item['parent'] = '' + item['network'] = '' + + return item diff --git a/scenes/javJAVTrailers.py b/scenes/javJAVTrailers.py new file mode 100644 index 00000000..f048d4c4 --- /dev/null +++ b/scenes/javJAVTrailers.py 
@@ -0,0 +1,426 @@
+import re
+import string
+import requests
+import scrapy
+import json
+import unidecode
+import html
+import codecs
+from tpdb.BaseSceneScraper import BaseSceneScraper
+from tpdb.items import SceneItem
+
+
+class JavJAVTrailersSpider(BaseSceneScraper):
+    name = 'JAVTrailers'
+
+    start_urls = [
+        'https://javtrailers.com',
+    ]
+
+    selector_map = {
+        'external_id': r'',
+        'pagination': '/videos?page=%s',
+        'type': 'Scene',
+    }
+
+    custom_scraper_settings = {
+        'AUTOTHROTTLE_ENABLED': True,
+        'AUTOTHROTTLE_START_DELAY': 1,
+        'AUTOTHROTTLE_MAX_DELAY': 2,
+        # ~ 'CONCURRENT_REQUESTS': 4,
+        'RANDOMIZE_DOWNLOAD_DELAY': True,
+        # ~ 'CONCURRENT_REQUESTS_PER_DOMAIN': 4,
+        # ~ 'CONCURRENT_REQUESTS_PER_IP': 4,
+    }
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        if self.limit_pages == 1:
+            self.limit_pages = 10
+
+        if self.days == 20:
+            self.days = 99999
+
+        for link in self.start_urls:
+            yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def get_scenes(self, response):
+        meta = response.meta
+        scenes = response.xpath('//div[@class="card-container"]/a/@href').getall()
+        for scene in scenes:
+            yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta)
+
+    def parse_scene(self, response):
+        item = SceneItem()
+        item = self.prep_item(item)
+
+        # Normalize jsondata to a string so the `in` checks further down can
+        # never hit a TypeError when the __NUXT__ payload or the regex misses
+        jsondata = response.xpath('//script[contains(text(), "__NUXT__")]/text()').get() or ''
+        nuxt = re.search(r'(video:.*?),popunder', jsondata)
+        jsondata = nuxt.group(1) if nuxt else ''
+
+        contentid = False
+        sceneid = response.xpath('//span[contains(text(), "DVD ID:")]/following-sibling::text()[1]')
+        if not sceneid:
+            sceneid = response.xpath('//span[contains(text(), "Content ID:")]/following-sibling::text()[1]')
+            contentid = True
+            # only log once we know the Content ID node actually exists
+            if sceneid:
+                print(f" * Using Content_ID for: {sceneid.get().strip().upper()}")
+        if sceneid:
+            item['id'] = sceneid.get().strip().upper()
+
+        r18 = self.get_r18(item['id'])
+        # ~ if contentid:
+        # ~     print(r18)
+        # ~ if item['id'] == 'HOWS-002':
+        # ~     print(r18)
+        r18image = False
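+        # Hedged sketch of the r18.dev payload this spider relies on, inferred
+        # from the lookups below rather than from any published schema:
+        #   {"title": ..., "release_date": ..., "content_id": ...,
+        #    "runtime_minutes": ..., "director": ...,
+        #    "label": {"name": ...}, "maker": {"name": ...},
+        #    "images": {"jacket_image": {"large": ..., "large2": ...}},
+        #    "actresses": [{"name": ...}], "sample": {"high": ...},
+        #    "categories": [{"name": ...}]}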
+        if r18:
+            if r18['title']:
+                item['title'] = string.capwords(self.uncensor(r18['title']))
+            else:
+                title = response.xpath('//h1[@class="lead"]/text()').get() or ''
+                if "[" in title and "]" in title:
+                    title = re.sub(r'\[.*?\]', '', title)
+                item['title'] = string.capwords(title)
+
+            if r18['label'] and "name" in r18['label']:
+                item['site'] = string.capwords(r18['label']['name'].replace(".", ""))
+            else:
+                site = response.xpath('//span[contains(text(), "Studio:")]/following-sibling::*/text()').getall()
+                site = list(map(lambda x: string.capwords(x.strip()), site))
+                if site:
+                    site = list(filter(None, site))
+                    item['site'] = string.capwords(site[0].replace(".", ""))
+
+            if r18['maker'] and "name" in r18['maker']:
+                item['parent'] = string.capwords(r18['maker']['name'].replace(".", ""))
+            else:
+                parent = response.xpath('//span[contains(text(), "Studio:")]/following-sibling::*/text()').getall()
+                parent = list(map(lambda x: string.capwords(x.strip()), parent))
+                if parent:
+                    parent = list(filter(None, parent))
+                    item['parent'] = string.capwords(parent[0].replace(".", ""))
+
+            if item['parent'] and not item['site']:
+                item['site'] = item['parent']
+            if item['site'] and not item['parent']:
+                item['parent'] = item['site']
+            item['date'] = r18['release_date']
+
+            if r18['images']['jacket_image']['large'] and "http" in r18['images']['jacket_image']['large']:
+                r18image = r18['images']['jacket_image']['large']
+            elif r18['images']['jacket_image']['large2'] and "http" in r18['images']['jacket_image']['large2']:
+                r18image = r18['images']['jacket_image']['large2']
+
+            if r18['actresses']:
+                for performer in r18['actresses']:
+                    item['performers'].append(string.capwords(performer['name']))
+            if not item['performers']:
+                item['performers'] = response.xpath('//span[contains(text(), "Cast(s):")]/following-sibling::a/text()').getall()
+
+            if r18['sample']:
+                if r18['sample']['high']:
+                    item['trailer'] = r18['sample']['high']
+
+            if r18['categories']:
+                for tag in r18['categories']:
+                    item['tags'].append(string.capwords(tag['name']))
+
+            if r18['director']:
+                item['director'] = r18['director']
+
+            if r18['runtime_minutes']:
+                item['duration'] = str(int(r18['runtime_minutes']) * 60)
+
+            item['url'] = f"https://r18.dev/videos/vod/movies/detail/-/id={r18['content_id']}/"
+            item['network'] = 'R18'
+        else:
+            title = response.xpath('//h1[@class="lead"]/text()').get() or ''
+            if "[" in title and "]" in title:
+                title = re.sub(r'\[.*?\]', '', title)
+            item['title'] = string.capwords(title)
+
+            if "director" in jsondata:
+                director = string.capwords(re.search(r'director:.*?[\'\"](.*?)[\'\"]', jsondata).group(1))
+            else:
+                director = response.xpath('//span[contains(text(), "Director:")]/following-sibling::text()')
+                if director:
+                    director = string.capwords(director.get())
+            if director:
+                # the parsed director was previously dropped; store it on the item
+                item['director'] = director
+
+            scenedate = response.xpath('//span[contains(text(), "Release Date:")]/following-sibling::text()')
+            if scenedate:
+                item['date'] = self.parse_date(scenedate.get(), date_formats=['%d %b %Y']).strftime('%Y-%m-%d')
+
+            if "casts" in jsondata:
+                performers = re.search(r'casts:(.*?)\]', jsondata).group(1)
+                item['performers'] = re.findall(r'name.*?[\'\"](.*?)[\'\"]', performers)
+            else:
+                item['performers'] = response.xpath('//span[contains(text(), "Cast(s):")]/following-sibling::a/text()').getall()
+
+            duration = response.xpath('//span[contains(text(), "Duration:")]/following-sibling::text()')
+            if duration:
+                duration = re.search(r'(\d+)', duration.get())
+                if duration:
+                    item['duration'] = str(int(duration.group(1)) * 60)
+
+            site = response.xpath('//span[contains(text(), "Studio:")]/following-sibling::*/text()').getall()
+            site = list(map(lambda x: string.capwords(x.strip()), site))
+            if site:
+                site = list(filter(None, site))
+                item['site'] = string.capwords(site[0].replace(".", ""))
+
+            parent = response.xpath('//span[contains(text(), "Studio:")]/following-sibling::*/text()').getall()
+            parent = list(map(lambda x: string.capwords(x.strip()), parent))
+            if parent:
+                parent = list(filter(None, parent))
+                item['parent'] = string.capwords(parent[0].replace(".", ""))
+
+            item['network'] = 'R18'
+
+            item['url'] = f"https://r18.dev/videos/vod/movies/detail/-/id={re.sub('[^a-z0-9]', '', item['id'].lower())}"
+
+        # ### Tasks for both sources
+
+        if not item['parent'] or "---" in item['parent']:
+            item['parent'] = item['site']
+
+        if not item['site'] or "---" in item['site']:
+            item['site'] = item['parent']
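+        # The on-page title usually embeds the DVD ID, sometimes with an extra
+        # suffix (hypothetical example: "ABC-123-4K Title Text"), which the
+        # optional (?:-\w+)? group below absorbs before the ID is re-added in
+        # a canonical "ID: Title" form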
(.*)", item['title'].upper()) + if title: + title = title.group(1) + item['title'] = f"{item['id']}: {string.capwords(title)}" + else: + item['title'] = f"{item['id']}: {string.capwords(item['title'])}" + + # Get the Front and Back images from site, using R18 image if not available + if "image" in jsondata: + image = re.search(r'image.*?[\'\"](.*?)[\'\"]', jsondata) + if image: + image = unidecode.unidecode(html.unescape(image.group(1))) + if not item['image'] and r18image: + item['image'] = r18image + + item['image'] = item['image'].strip() + if not item['image']: + image = response.xpath('//meta[@property="og:image"]/@content') + if image: + item['image'] = image.get() + + # Get the blobs + if item['image']: + item['image_blob'] = self.get_image_blob_from_link(item['image']) + + # Add site tags to existing tags pulled from R18 if available + + if "categories" in jsondata: + tags = re.search(r'categories:(.*?)\]', jsondata).group(1) + tags = re.findall(r'name.*?[\'\"](.*?)[\'\"]', tags) + else: + tags = response.xpath('//span[contains(text(), "Categories:")]/following-sibling::*/text()').getall() + + if tags: + tags = list(map(lambda x: codecs.decode(unidecode.unidecode(html.unescape(string.capwords(x.strip()))), 'unicode-escape'), tags)) + tags = list(filter(None, tags)) + for tag in tags: + item['tags'].append(string.capwords(tag)) + + matches = ['Actress Best', 'Exclusive', 'Featured', 'Foreign Imports', 'Hi-def', 'High-quality', 'Hours', 'Sample'] + for tag in item['tags']: + for match in matches: + if match.lower() in tag.lower(): + item['tags'].remove(tag) + + item['performers_data'] = [] + if len(item['performers']): + for place, performer in enumerate(item['performers']): + hiragana = re.compile('[\u3040-\u309F]') + performer = hiragana.sub('', performer) + katakana = re.compile('[\u30A0-\u30FF]') + performer = katakana.sub('', performer) + CJK = re.compile('[\u4300-\u9faf]') + performer = CJK.sub('', performer) + + performer = performer.replace("\n", "").replace("\n", "").replace("\n", "").strip() + + if performer != item['performers'][place]: + item['performers'][place] = performer + + if re.search(r'(\d{3})', performer): + item['performers'].remove(performer) + elif re.search(r'[a-z0-9]{20,26}', performer.lower()): + item['performers'].remove(performer) + else: + perf = {} + perf['name'] = performer + perf['extra'] = {} + perf['extra']['gender'] = "Female" + perf['network'] = item['network'] + perf['site'] = item['network'] + item['performers_data'].append(perf) + + # General purpose removal of any additonal tokens for image url + if item['image']: + if "?" 
in item['image'] and ("token" in item['image'].lower() or "expire" in item['image'].lower()): + item['image'] = re.search(r'(.*?)\?', item['image']).group(1) + + item['type'] = 'JAV' + + # ~ if contentid: + # ~ print(item) + if ("duration" not in item or not item['duration'] or int(item['duration']) > 3900) and item['site'] and item['title']: + yield self.check_item(item, self.days) + + def get_r18(self, javid): + javid = javid.replace("-", "").lower().strip() + link = f"https://r18.dev/videos/vod/movies/detail/-/dvd_id={javid}/json" + req = requests.get(link) + if req: + content = json.loads(req.content) + else: + content = False + return content + + def uncensor(self, title): + title = title.replace("A*****t", "Assault") + title = title.replace("A****p", "Asleep") + title = title.replace("A***e", "Abuse") + title = title.replace("B***d", "Blood") + title = title.replace("B**d", "Bled") + title = title.replace("C***d", "Child") + title = title.replace("C*ck", "Cock") + title = title.replace("D******e", "Disgrace") + title = title.replace("D***king", "Drinking") + title = title.replace("D***k", "Drunk") + title = title.replace("D**g", "Drug") + title = title.replace("D*****d", "Drugged") + title = title.replace("F*****g", "Forcing") + title = title.replace("F***e", "Force") + title = title.replace("G*******g", "Gangbang") + title = title.replace("G******g", "Gangbang") + title = title.replace("H*********n", "Humiliation") + title = title.replace("H*******e", "Hypnotize") + title = title.replace("H********d", "Hypnotized") + title = title.replace("H*******m", "Hypnotism") + title = title.replace("H**t", "Hurt") + title = title.replace("I****t", "Incest") + title = title.replace("K****p", "Kidnap") + title = title.replace("K****r", "Killer") + title = title.replace("K**l", "Kill") + title = title.replace("K*d", "Kid") + title = title.replace("L****a", "Lolita") + title = title.replace("M************n", "Mother And Son") + title = title.replace("M****t", "Molest") + title = title.replace("P********t", "Passed Out") + title = title.replace("P****h", "Punish") + title = title.replace("R****g", "Raping") + title = title.replace("R**e", "Rape") + title = title.replace("RStepB****************r", "Stepbrother and Sister") + title = title.replace("S*********l", "School Girl") + title = title.replace("S**********s", "School Girls") + title = title.replace("S********l", "Schoolgirl") + title = title.replace("S*********s", "Schoolgirls") + title = title.replace("S******g", "Sleeping") + title = title.replace("S*****t", "Student") + title = title.replace("S***e", "Slave") + title = title.replace("S**t", "Scat") + title = title.replace("S*******y", "Scatology") + title = title.replace("Sch**l", "School") + title = title.replace("StepM************n", "Stepmother and Son") + title = title.replace("T******e", "Tentacle") + title = title.replace("T*****e", "Torture") + title = title.replace("U*********s", "Unconscious") + title = title.replace("V*****e", "Violate") + title = title.replace("V*****t", "Violent") + title = title.replace("Y********l", "Young Girl") + title = title.replace("A*****t", "Assault") + title = title.replace("a*****t", "assault") + title = title.replace("a****p", "asleep") + title = title.replace("a***e", "abuse") + title = title.replace("b***d", "blood") + title = title.replace("b**d", "bled") + title = title.replace("c***d", "child") + title = title.replace("c*ck", "cock") + title = title.replace("d******e", "disgrace") + title = title.replace("d***king", "drinking") + title = 
title.replace("d***k", "drunk") + title = title.replace("d**g", "drug") + title = title.replace("d*****d", "drugged") + title = title.replace("f*****g", "forcing") + title = title.replace("f***e", "force") + title = title.replace("g*******g", "gangbang") + title = title.replace("g******g", "gangbang") + title = title.replace("h*********n", "humiliation") + title = title.replace("h*******e", "hypnotize") + title = title.replace("h********d", "hypnotized") + title = title.replace("h*******m", "hypnotism") + title = title.replace("h**t", "hurt") + title = title.replace("i****t", "incest") + title = title.replace("k****p", "kidnap") + title = title.replace("k****r", "killer") + title = title.replace("k**l", "kill") + title = title.replace("k*d", "kid") + title = title.replace("l****a", "lolita") + title = title.replace("m************n", "mother and son") + title = title.replace("m****t", "molest") + title = title.replace("p********t", "passed out") + title = title.replace("p****h", "punish") + title = title.replace("r****g", "raping") + title = title.replace("r**e", "rape") + title = title.replace("rstepb****************r", "stepbrother and sister") + title = title.replace("s*********l", "school girl") + title = title.replace("s********l", "schoolgirl") + title = title.replace("s**********s", "school girls") + title = title.replace("s*********s", "schoolgirls") + title = title.replace("s******g", "sleeping") + title = title.replace("s*****t", "student") + title = title.replace("s***e", "slave") + title = title.replace("s**t", "scat") + title = title.replace("s*******y", "scatology") + title = title.replace("sch**l", "school") + title = title.replace("stepm************n", "stepmother and son") + title = title.replace("t******e", "tentacle") + title = title.replace("t*****e", "torture") + title = title.replace("u*********s", "unconscious") + title = title.replace("v*****e", "violate") + title = title.replace("v*****t", "violent") + title = title.replace("y********l", "young girl") + + return title + + def prep_item(self, item): + item['title'] = '' + item['date'] = '' + item['description'] = '' + item['image'] = '' + item['image_blob'] = '' + item['tags'] = [] + item['performers'] = [] + item['trailer'] = '' + item['type'] = 'JAV' + item['director'] = '' + item['site'] = '' + item['parent'] = '' + item['network'] = '' + + return item + + def json_validate(self, input_str): + founds = re.findall(r"{(?:[^{}]*{[^{]*})*[^{}]*}", input_str) + valid_jsons = [] + for x in founds: + try: + valid_jsons.append(json.loads(x)) + except json.JSONDecodeError: + continue + + return valid_jsons diff --git a/scenes/javJadeNet.py b/scenes/javJadeNet.py new file mode 100644 index 00000000..f3e7646d --- /dev/null +++ b/scenes/javJadeNet.py @@ -0,0 +1,89 @@ +import re +import string +import scrapy +from dateutil.relativedelta import relativedelta +import datetime +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class JAVJadeNetSpider(BaseSceneScraper): + name = 'JAVJadeNet' + network = 'R18' + + start_urls = [ + '', + ] + + selector_map = { + 'title': '//section[@id="detailMain"]//h1//text()', + 'description': '', + 'date': '//div[contains(./h2/text(), "Information")]/following-sibling::dl/dt[contains(text(), "Downloadable")]/following-sibling::dd[1]/text()', + 'date_formats': ['%Y/%m/%d'], + 'image': '//div[@class="detailPackage"]/img/@src', + 'performers': '', + 'tags': '//div[contains(./h2/text(), "Information")]/following-sibling::dl/dt[contains(text(), "Related")]/following-sibling::dd[1]/a/text()', + 
'trailer': '', + 'external_id': r'.*/(\d+)', + 'pagination': 'https://www.jade-net-home.com/categories/whats_new?utf8=%E2%9C%93&ps%5Brelease_date_m%5D=01&ps%5Brelease_date_y%5D=2024', + 'type': 'JAV', + } + + def start_requests(self): + if self.days == 20 and self.page == 1 and self.limit_pages == 1: + current_month = datetime.datetime.now().strftime('%m') + current_year = datetime.datetime.now().strftime('%Y') + link = f"https://www.jade-net-home.com/categories/whats_new?utf8=%E2%9C%93&ps%5Brelease_date_m%5D={current_month}&ps%5Brelease_date_y%5D={current_year}" + yield scrapy.Request(link, callback=self.get_scenes, headers=self.headers, cookies=self.cookies) + else: + begin = datetime.datetime(year=2004, month=4, day=1) + now = datetime.datetime.now() + delta = relativedelta(now, begin) + delta = delta.months + (delta.years * 12) + for x in range(0, delta + 1): + target = datetime.datetime.now() - relativedelta(months=x) + targetMonth = target.strftime('%m') + targetYear = target.strftime('%Y') + link = f"https://www.jade-net-home.com/categories/whats_new?utf8=%E2%9C%93&ps%5Brelease_date_m%5D={targetMonth}&ps%5Brelease_date_y%5D={targetYear}" + yield scrapy.Request(link, callback=self.get_scenes, headers=self.headers, cookies=self.cookies) + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="thumb"]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_id(self, response): + sceneid = response.xpath('//div[contains(./h2/text(), "Information")]/following-sibling::dl/dt[contains(text(), "Productcode")]/following-sibling::dd[1]/text()') + if sceneid: + return sceneid.get().strip().upper() + return None + + def get_duration(self, response): + duration = response.xpath('//div[contains(./h2/text(), "Information")]/following-sibling::dl/dt[contains(text(), "Time")]/following-sibling::dd[1]/text()') + if duration: + duration = duration.get() + duration = re.search(r'(\d+)', duration) + if duration: + duration = duration.group(1) + return str(int(duration) * 60) + return None + + def get_site(self, response): + site = response.xpath('//div[contains(./h2/text(), "Information")]/following-sibling::dl/dt[contains(text(), "Studios")]/following-sibling::dd[1]//text()') + if site: + return string.capwords(site.get().strip()) + return None + + def get_parent(self, response): + return self.get_site(response) + + def get_tags(self, response): + tags = super().get_tags(response) + tags2 = [] + for tag in tags: + if "mbps" not in tag.lower() and "fetishism" not in tag.lower() and "tasty" not in tag.lower(): + tags2.append(tag) + if "fetishism" in tag.lower(): + tags2.append("Fetish") + return tags2 diff --git a/scenes/javSEXTBPlaywright.py b/scenes/javSEXTBPlaywright.py new file mode 100644 index 00000000..379d6623 --- /dev/null +++ b/scenes/javSEXTBPlaywright.py @@ -0,0 +1,347 @@ +import re +import string +import requests +import scrapy +import json +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class JavSEXTBSpider(BaseSceneScraper): + name = 'JAVSEXTBPlaywright' + + start_url = 'https://sextb.net' + + paginations = [ + '/censored/pg-%s', + '/uncensored/pg-%s', + ] + + selector_map = { + 'external_id': r'', + 'pagination': '', + 'type': 'JAV', + } + + custom_scraper_settings = { + 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62', + 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor', + 'AUTOTHROTTLE_ENABLED': True, + 'USE_PROXY': False, + 'AUTOTHROTTLE_START_DELAY': 1, + 'AUTOTHROTTLE_MAX_DELAY': 60, + 'CONCURRENT_REQUESTS': 1, + 'DOWNLOAD_DELAY': 2, + 'DOWNLOADER_MIDDLEWARES': { + # 'tpdb.helpers.scrapy_flare.FlareMiddleware': 542, + 'tpdb.middlewares.TpdbSceneDownloaderMiddleware': 543, + 'tpdb.custommiddlewares.CustomProxyMiddleware': 350, + 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, + 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, + 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400, + 'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401, + }, + 'DOWNLOAD_HANDLERS': { + "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", + "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", + } + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + meta['playwright'] = True + for pagination in self.paginations: + meta['pagination'] = pagination + yield scrapy.Request(url=self.get_next_page_url(self.start_url, self.page, meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@class, "tray-item")]/a[1]/@href').getall() + for scene in scenes: + meta['id'] = re.search(r'.*/(.*?)$', scene).group(1) + if re.search(r'(\w+-\w+)-\w+', meta['id']): + meta['id'] = re.search(r'(\w+-\w+)-\w+', meta['id']).group(1) + if meta['id'] and "ppv" not in meta['id'].lower(): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_next_page_url(self, base, page, pagination): + return self.format_url(base, pagination % page) + + def parse(self, response, **kwargs): + scenes = self.get_scenes(response) + count = 0 + for scene in scenes: + count += 1 + yield scene + + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta = response.meta + meta['page'] = meta['page'] + 1 + print('NEXT PAGE: ' + str(meta['page'])) + yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def parse_scene(self, response): + meta = response.meta + item = SceneItem() + item = self.prep_item(item) + + item['id'] = meta['id'].upper() + + r18 = self.get_r18(item['id']) + if r18: + if r18['title']: + item['title'] = string.capwords(self.uncensor(r18['title'])) + else: + title = response.xpath('//h1[@class="film-info-title"]/strong/text()[1]').get() + if "[" in title and "]" in title: + title = re.sub(r'\[.*?\]', '', title) + item['title'] = string.capwords(title) + + if r18['label'] and "name" in r18['label']: + item['site'] = string.capwords(r18['label']['name'].replace(".", "")) + if r18['maker'] and "name" in r18['maker']: + item['parent'] = string.capwords(r18['maker']['name'].replace(".", "")) + item['date'] = r18['release_date'] + + r18image = False + if r18['images']['jacket_image']['large'] and "http" in r18['images']['jacket_image']['large']: + r18image = r18['images']['jacket_image']['large'] + elif r18['images']['jacket_image']['large2'] and "http" in r18['images']['jacket_image']['large2']: + r18image = r18['images']['jacket_image']['large2'] + + if r18['actresses']: + for performer in r18['actresses']: + 
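# note: unlike the JAVCT/JAVTrailers spiders there is no on-page cast
+                    # fallback on this path; the r18.dev actress list is used as-is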
item['performers'].append(string.capwords(performer['name'])) + + if r18['sample']: + if r18['sample']['high']: + item['trailer'] = r18['sample']['high'] + + if r18['categories']: + for tag in r18['categories']: + item['tags'].append(string.capwords(tag['name'])) + + if r18['director']: + item['director'] = r18['director'] + + if r18['runtime_minutes']: + item['duration'] = str(int(r18['runtime_minutes']) * 60) + + item['url'] = f"https://r18.dev/videos/vod/movies/detail/-/id={r18['content_id']}/" + item['network'] = 'R18' + else: + title = response.xpath('//h1[@class="film-info-title"]/strong/text()[1]').get() + if "[" in title and "]" in title: + title = re.sub(r'\[.*?\]', '', title) + item['title'] = string.capwords(title) + + director = response.xpath('//i[@class="fa fa-user" and contains(./following-sibling::text(), "Director")]/following-sibling::a//text()|//i[@class="fa fa-user"]/following-sibling::strong/text()') + if director: + item['director'] = string.capwords(director.get()) + + scenedate = response.xpath('//div[@class="description"]/i[@class="fa fa-calendar"]/following-sibling::strong/text()') + if scenedate: + item['date'] = self.parse_date(scenedate.get(), date_formats=['%b %d, %Y']).strftime('%Y-%m-%d') + + item['performers'] = response.xpath('//i[@class="fa fa-users" and contains(./following-sibling::text(), "Cast")]/following-sibling::a/strong/text()').getall() + + duration = response.xpath('//i[@class="fa fa-clock" and contains(./following-sibling::text(), "Runtime")]/following-sibling::strong/text()') + if duration: + duration = re.search(r'(\d+)', duration.get()) + if duration: + item['duration'] = str(int(duration.group(1)) * 60) + + site = response.xpath('//i[@class="fa fa-tag" and contains(./following-sibling::text(), "Label")]/following-sibling::a/strong/text()').getall() + site = list(map(lambda x: string.capwords(x.strip()), site)) + if site: + site = list(filter(None, site)) + item['site'] = string.capwords(site[0].replace(".", "")) + + parent = response.xpath('//i[@class="fa fa-camera" and contains(./following-sibling::text(), "Studio")]/following-sibling::a/strong/text()').getall() + parent = list(map(lambda x: string.capwords(x.strip()), parent)) + if parent: + parent = list(filter(None, parent)) + item['parent'] = string.capwords(parent[0].replace(".", "")) + + if not item['parent'] or "---" in item['parent']: + item['parent'] = item['site'] + + item['network'] = 'R18' + + item['url'] = f"https://r18.dev/videos/vod/movies/detail/-/id={re.sub('[^a-z0-9]', '', item['id'].lower())}" + + # ### Tasks for both sources + + # Strip the ID from the title, then re-add it in uppercase and without embellishments + title = re.search(fr"{item['id']}(?:-\w+)? 
(.*)", item['title'].upper()) + if title: + title = title.group(1) + item['title'] = f"{item['id']}: {string.capwords(title)}" + else: + item['title'] = f"{item['id']}: {string.capwords(item['title'])}" + + # Verify we have both Site and Parent + if item['parent'] and not item['site']: + item['site'] = item['parent'] + if item['site'] and not item['parent']: + item['parent'] = item['site'] + + # Get the Front and Back images from site, using R18 image if not available + item['image'] = response.xpath('//img[@class="cover"]/@src|//div[@id="trailer"]/video/@poster').get() + if not item['image'] and r18image: + item['image'] = r18image + + # Get the blobs + if item['image']: + item['image_blob'] = self.get_image_blob_from_link(item['image']) + + # Add site tags to existing tags pulled from R18 if available + tags = response.xpath('//i[@class="fa fa-list" and contains(./following-sibling::text(), "Genre")]/following-sibling::a/strong/text()').getall() + tags = list(map(lambda x: string.capwords(x.strip()), tags)) + tags = list(filter(None, tags)) + if tags: + for tag in tags: + item['tags'].append(string.capwords(tag)) + + # General purpose removal of any additonal tokens for image url + if item['image']: + if "?" in item['image'] and ("token" in item['image'].lower() or "expire" in item['image'].lower()): + item['image'] = re.search(r'(.*?)\?', item['image']).group(1) + + item['type'] = 'JAV' + + if not item['duration'] or int(item['duration']) > 4200: + yield self.check_item(item, self.days) + + def get_r18(self, javid): + javid = javid.replace("-", "").lower().strip() + link = f"https://r18.dev/videos/vod/movies/detail/-/dvd_id={javid}/json" + req = requests.get(link) + if req: + content = json.loads(req.content) + else: + content = False + return content + + def uncensor(self, title): + title = title.replace("A*****t", "Assault") + title = title.replace("A****p", "Asleep") + title = title.replace("A***e", "Abuse") + title = title.replace("B***d", "Blood") + title = title.replace("B**d", "Bled") + title = title.replace("C***d", "Child") + title = title.replace("C*ck", "Cock") + title = title.replace("D******e", "Disgrace") + title = title.replace("D***king", "Drinking") + title = title.replace("D***k", "Drunk") + title = title.replace("D**g", "Drug") + title = title.replace("F*****g", "Forcing") + title = title.replace("F***e", "Force") + title = title.replace("G*******g", "Gangbang") + title = title.replace("G******g", "Gangbang") + title = title.replace("H*********n", "Humiliation") + title = title.replace("H*******e", "Hypnotize") + title = title.replace("H********d", "Hypnotized") + title = title.replace("H*******m", "Hypnotism") + title = title.replace("H**t", "Hurt") + title = title.replace("I****t", "Incest") + title = title.replace("K****p", "Kidnap") + title = title.replace("K****r", "Killer") + title = title.replace("K**l", "Kill") + title = title.replace("K*d", "Kid") + title = title.replace("L****a", "Lolita") + title = title.replace("M************n", "Mother And Son") + title = title.replace("M****t", "Molest") + title = title.replace("P********t", "Passed Out") + title = title.replace("P****h", "Punish") + title = title.replace("R****g", "Raping") + title = title.replace("R**e", "Rape") + title = title.replace("RStepB****************r", "Stepbrother and Sister") + title = title.replace("S*********l", "School Girl") + title = title.replace("S**********s", "School Girls") + title = title.replace("S********l", "Schoolgirl") + title = title.replace("S*********s", "Schoolgirls") + 
title = title.replace("S******g", "Sleeping") + title = title.replace("S*****t", "Student") + title = title.replace("S***e", "Slave") + title = title.replace("S**t", "Scat") + title = title.replace("S*******y", "Scatology") + title = title.replace("Sch**l", "School") + title = title.replace("StepM************n", "Stepmother and Son") + title = title.replace("T******e", "Tentacle") + title = title.replace("T*****e", "Torture") + title = title.replace("U*********s", "Unconscious") + title = title.replace("V*****e", "Violate") + title = title.replace("V*****t", "Violent") + title = title.replace("Y********l", "Young Girl") + title = title.replace("A*****t", "Assault") + title = title.replace("a*****t", "assault") + title = title.replace("a****p", "asleep") + title = title.replace("a***e", "abuse") + title = title.replace("b***d", "blood") + title = title.replace("b**d", "bled") + title = title.replace("c***d", "child") + title = title.replace("c*ck", "cock") + title = title.replace("d******e", "disgrace") + title = title.replace("d***king", "drinking") + title = title.replace("d***k", "drunk") + title = title.replace("d**g", "drug") + title = title.replace("f*****g", "forcing") + title = title.replace("f***e", "force") + title = title.replace("g*******g", "gangbang") + title = title.replace("g******g", "gangbang") + title = title.replace("h*********n", "humiliation") + title = title.replace("h*******e", "hypnotize") + title = title.replace("h********d", "hypnotized") + title = title.replace("h*******m", "hypnotism") + title = title.replace("h**t", "hurt") + title = title.replace("i****t", "incest") + title = title.replace("k****p", "kidnap") + title = title.replace("k****r", "killer") + title = title.replace("k**l", "kill") + title = title.replace("k*d", "kid") + title = title.replace("l****a", "lolita") + title = title.replace("m************n", "mother and son") + title = title.replace("m****t", "molest") + title = title.replace("p********t", "passed out") + title = title.replace("p****h", "punish") + title = title.replace("r****g", "raping") + title = title.replace("r**e", "rape") + title = title.replace("rstepb****************r", "stepbrother and sister") + title = title.replace("s*********l", "school girl") + title = title.replace("s********l", "schoolgirl") + title = title.replace("s**********s", "school girls") + title = title.replace("s*********s", "schoolgirls") + title = title.replace("s******g", "sleeping") + title = title.replace("s*****t", "student") + title = title.replace("s***e", "slave") + title = title.replace("s**t", "scat") + title = title.replace("s*******y", "scatology") + title = title.replace("sch**l", "school") + title = title.replace("stepm************n", "stepmother and son") + title = title.replace("t******e", "tentacle") + title = title.replace("t*****e", "torture") + title = title.replace("u*********s", "unconscious") + title = title.replace("v*****e", "violate") + title = title.replace("v*****t", "violent") + title = title.replace("y********l", "young girl") + + return title + + def prep_item(self, item): + item['title'] = '' + item['date'] = '' + item['description'] = '' + item['image'] = '' + item['image_blob'] = '' + item['tags'] = [] + item['performers'] = [] + item['trailer'] = '' + item['type'] = 'JAV' + item['director'] = '' + item['site'] = '' + item['parent'] = '' + item['network'] = '' + + return item diff --git a/scenes/movieEBoobStore.py b/scenes/movieEBoobStore.py deleted file mode 100644 index 8bc5b9b2..00000000 --- a/scenes/movieEBoobStore.py +++ 
/dev/null @@ -1,47 +0,0 @@ -import re -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper - - -class MovieEBoobStoreSpider(BaseSceneScraper): - name = 'EBoobStore' - network = 'Score' - parent = 'Score' - site = 'Score' - - start_urls = [ - 'https://www.eboobstore.com', - ] - - selector_map = { - 'title': '//meta[@name="twitter:title"]/@content|//meta[@property="og:title"]/@content', - 'description': '//div[contains(@class, "description") and contains(@class, "full")]/div//text()', - 'date': '', - 'image': '//meta[@name="twitter:image"]/@content|//meta[@itemprop="image"]/@content', - 'performers': '//h2[contains(text(), "Models")]/../following-sibling::article/div/a/div/text()', - 'tags': '', - 'duration': '', - 'trailer': '//video/source/@src', - 'external_id': r'.*/(\d+)/', - 'pagination': '/adult-movies/c/255/DVD/?page=%s', - 'type': 'Movie', - } - - def get_scenes(self, response): - meta = response.meta - scenes = response.xpath('//article[contains(@class, "list-item")]/div[1]/a/@href').getall() - for scene in scenes: - if "?nats" in scene: - scene = re.search(r'(.*)\?nats', scene).group(1) - if re.search(self.get_selector_map('external_id'), scene): - yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) - - def get_duration(self, response): - duration = response.xpath('//div[@class="options"]/div[@class="format-details"]//strong[contains(text(), "Duration")]/following-sibling::text()') - if duration: - duration = duration.get() - if "minutes" in duration.lower(): - duration = re.search(r'(\d+)min', duration.lower().replace(" ", "")) - if duration: - return str(int(duration.group(1)) * 60) - return None diff --git a/scenes/moviesAdultDVDEmpire.py b/scenes/moviesAdultDVDEmpire.py deleted file mode 100644 index 2cbecb91..00000000 --- a/scenes/moviesAdultDVDEmpire.py +++ /dev/null @@ -1,195 +0,0 @@ -import re -import html -import string -import os.path -from datetime import date, datetime, timedelta -from pathlib import Path -import unidecode -import dateparser -import scrapy - -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class AdultDVDEmpireMovieSpider(BaseSceneScraper): - name = 'AdultDVDEmpireMovie' - store = "Adult DVD Empire" - - start_urls = [ - 'https://www.adultdvdempire.com' - ] - - custom_settings = {'AUTOTHROTTLE_ENABLED': 'True', 'AUTOTHROTTLE_DEBUG': 'False'} - - selector_map = { - 'title': '//div[contains(@class,"title-rating-section")]/div/h1/text()', - 'description': '//h4[contains(@class,"synopsis")]/p/text()', - 'date': '//li/small[contains(text(),"Released")]/following-sibling::text()', - 'image': '//a[@id="front-cover"]/img/@src', - 'back': '//a[@id="back-cover"]/@href', - 'performers': '//strong[contains(text(),"Starring")]/following-sibling::a/div//text()|//strong[contains(text(),"Starring")]/following-sibling::a//text()', - 'tags': '//strong[contains(text(),"Categories")]/following-sibling::a/text()', - 'external_id': r'/(\d+)/', - 'studio': '//li/small[contains(text(), "Studio:")]/following-sibling::a/text()', - 'director': '//a[@label="Director"]/text()', - 'format': '//div[contains(@class, "pricing")]/h2/text()[1]', - 'duration': '//li/small[contains(text(), "Length:")]/following-sibling::text()', - 'sku': '//li/small[contains(text(), "SKU:")]/following-sibling::text()', - 'pagination': '/new-release-porn-movies.html?page=%s', - # ~ 'pagination': '/29773/studio/lethal-hardcore-porn-movies.html?page=%s&media=2', - } - - def get_scenes(self, response): - movies 
= response.xpath('//div[@class="product-card"]/div/a/@href').getall() - for movie in movies: - movie = movie.strip() - if re.search(self.get_selector_map('external_id'), movie): - yield scrapy.Request(url=self.format_link(response, movie), callback=self.parse_movie) - - def get_description(self, response): - description = response.xpath('//h4[contains(@class,"synopsis")]/p/text()|//h4[contains(@class,"synopsis")]/following-sibling::p/text()') - if description: - description = description.getall() - description = " ".join(description).replace(" ", " ").strip() - return description - return "" - - def get_tags(self, response): - if self.get_selector_map('tags'): - tags = self.process_xpath( - response, self.get_selector_map('tags')).getall() - if tags: - return self.clean_tags(list(map(lambda x: x.strip().title(), tags))) - return [] - - def get_date(self, response): - dvddate = self.process_xpath(response, self.get_selector_map('date')).get() - if dvddate: - dvddate.replace('Released:', '').replace('Added:', '').strip() - else: - dvddate = response.xpath('//li/small[contains(text(),"Production")]/following-sibling::text()').get() - if dvddate: - dvddate = dvddate + "-01-01" - if not dvddate: - return datetime.now().isoformat() - - return dateparser.parse(dvddate.strip()).isoformat() - - def get_duration(self, response): - length = super().get_duration(response) - if length: - length = length.lower() - if "hr" in length and "min" in length: - if re.search(r'(\d{1,2}).+?hr.+?(\d{1,2}).+?min', length): - length = re.search(r'(\d{1,2}).+?hr.+?(\d{1,2}).+?min', length) - hour = int(length.group(1)) - minute = int(length.group(2)) - length = str((hour * 3660) + (minute * 60)) - elif "min" in length: - if re.search(r'(\d{1,2}).+?min', length): - length = re.search(r'(\d{1,2}).+?min', length) - minute = int(length.group(1)) - length = str((minute * 60)) - else: - length = None - return length - - def get_format(self, response): - if 'format' in self.get_selector_map(): - if self.get_selector_map('format'): - movieformat = self.process_xpath(response, self.get_selector_map('format')) - if movieformat: - movieformat = list(map(lambda x: string.capwords(x.strip()), movieformat.getall())) - movieformat.sort() - movieformat = " / ".join(movieformat) - return movieformat - - return "Dvd" - - def clean_tags(self, tags): - cleanlist = [ - 'movie', - 'vod exclusive', - '4k', - 'hd', - 'feature', - '4k ultra hd', - 'boxed sets', - ] - newlist = [] - for word in tags: - if word.lower() not in cleanlist: - newlist.append(word) - return newlist - - def parse_movie(self, response): - item = SceneItem() - - item['title'] = self.clean_text(self.get_title(response)) - item['title'] = re.sub(r'\(.*?dvd.*?\)|\(.*?blu-ray.*?\)|\(.*?combo.*?\)', '', item['title'], flags=re.IGNORECASE) - item['description'] = self.clean_text(self.get_description(response)) - item['store'] = "Adult DVD Empire" - item['date'] = self.get_date(response) - item['image'] = self.get_image(response) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['back'] = self.get_back_image(response) - item['back_blob'] = self.get_image_blob_from_link(item['back']) - item['performers'] = self.get_performers(response) - item['tags'] = self.get_tags(response) - item['id'] = self.get_id(response) - item['trailer'] = self.get_trailer(response) - item['network'] = "Adult DVD Empire" - item['site'] = self.get_studio(response) - item['parent'] = self.get_studio(response) - item['director'] = self.get_director(response) - item['format'] = 
self.get_format(response) - item['duration'] = self.get_duration(response) - item['sku'] = self.get_sku(response) - item['type'] = 'Movie' - - item['url'] = self.get_url(response) - - if self.days > 27375: - filter_date = '0000-00-00' - else: - days = self.days - filter_date = date.today() - timedelta(days) - filter_date = filter_date.strftime('%Y-%m-%d') - - foundpointer = 0 - matches = ['bangbros', 'jeffsmodels', 'private', 'dorcel', 'bluebirdfilms', 'privateblack', 'dorcelclub', 'evilangel', 'wicked'] - if item['title'] and item['site'] and not any(x in re.sub(r'[^a-zA-Z0-9]', '', item['site']).lower().replace(" ", "") for x in matches): - year = re.search(r'(\d{4})-\d{2}-\d{2}', item['date']).group(1) - teststring = item['title'] + year + item['site'] - teststring = re.sub(r'[^A-Za-z0-9#]+', '', teststring).lower() - if not os.path.exists('adedupelist.txt'): - Path('adedupelist.txt').touch() - with open('adedupelist.txt', 'r', encoding="utf-8") as file1: - for i in file1.readlines(): - if teststring in i: - foundpointer = 1 - break - - if not foundpointer and "dvd" not in item['format'].lower(): - with open('adedupelist.txt', 'a', encoding="utf-8") as file1: - file1.write(teststring + "\n") - - if self.debug: - if not item['date'] > filter_date: - item['filtered'] = 'movie filtered due to date restraint' - print(item) - else: - if filter_date: - if item['date'] > filter_date: - yield item - else: - yield item - - def clean_text(self, textstring): - if textstring is not None: - textstring = textstring.strip() - textstring = unidecode.unidecode(textstring) - textstring = html.unescape(textstring) - textstring = re.sub('<[^<]+?>', '', textstring) - return textstring diff --git a/scenes/moviesAdultDVDEmpire_VOD.py b/scenes/moviesAdultDVDEmpire_VOD.py deleted file mode 100644 index 93fceb93..00000000 --- a/scenes/moviesAdultDVDEmpire_VOD.py +++ /dev/null @@ -1,293 +0,0 @@ -import re -import json -import html -import string -import os.path -from datetime import date, datetime, timedelta -from pathlib import Path -import unidecode -import dateparser -import scrapy - -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class AdultDVDEmpireMovieSpider(BaseSceneScraper): - name = 'AdultDVDEmpireMovie_VOD' - store = "Adult DVD Empire" - - start_urls = [ - 'https://www.adultdvdempire.com' - ] - - custom_settings = {'AUTOTHROTTLE_ENABLED': 'True', 'AUTOTHROTTLE_DEBUG': 'False'} - - selector_map = { - 'title': '//div[contains(@class,"title-rating-section")]/div/h1/text()', - 'description': '//h4[contains(@class,"synopsis")]/p/text()', - 'date': '//li/small[contains(text(),"Released")]/following-sibling::text()', - 'image': '//a[@id="front-cover"]/img/@src', - 'back': '//a[@id="back-cover"]/@href', - 'performers': '//strong[contains(text(),"Starring")]/following-sibling::a/div//text()|//strong[contains(text(),"Starring")]/following-sibling::a//text()', - 'tags': '//strong[contains(text(),"Categories")]/following-sibling::a/text()', - 'external_id': r'/(\d+)/', - 'studio': '//li/small[contains(text(), "Studio:")]/following-sibling::a/text()', - 'director': '//a[@label="Director"]/text()', - 'format': '//div[contains(@class, "pricing")]/h2/text()[1]', - 'duration': '//li/small[contains(text(), "Length:")]/following-sibling::text()', - 'sku': '//li/small[contains(text(), "SKU:")]/following-sibling::text()', - 'pagination': '/new-addition-porn-videos.html?page=%s&media=14' - } - - def get_scenes(self, response): - movies = 
response.xpath('//div[@class="product-card"]/div/a/@href').getall() - for movie in movies: - movie = movie.strip() - if re.search(self.get_selector_map('external_id'), movie): - yield scrapy.Request(url=self.format_link(response, movie), callback=self.parse_movie) - - def get_title(self, response): - jsondata = response.xpath('//script[contains(text(), "uploadDate")][1]/text()') - title = None - if jsondata: - jsondata = jsondata.get() - title = self.check_json(jsondata, 'name') - if not title: - title = response.xpath('//meta[@name="og:title"]/@content') - if title: - title = title.get() - title = re.search(r'(.*?)\|', title) - if title: - title = title.group(1) - title = re.sub(r'\(\d{4}\)', '', title).strip() - return self.clean_text(title) - - def get_studio(self, response): - jsondata = response.xpath('//script[contains(text(), "uploadDate")][1]/text()') - studio = None - if jsondata: - jsondata = jsondata.get() - studio = self.check_json(jsondata, 'productionCompany', 'name') - if not studio: - studio = super().get_studio(response) - return studio.strip() - - def get_description(self, response): - jsondata = response.xpath('//script[contains(text(), "uploadDate")][1]/text()') - description = None - if jsondata: - jsondata = jsondata.get() - description = self.check_json(jsondata, 'description') - if not description: - description = response.xpath('//h4[contains(@class,"synopsis")]/p/text()|//h4[contains(@class,"synopsis")]/following-sibling::p/text()') - if description: - description = description.getall() - description = " ".join(description).replace(" ", " ").strip() - - return self.clean_text(description) - - def get_tags(self, response): - tags = [] - tags = response.xpath('//meta[@property="og:video:tag"]/@content') - if tags: - tags = tags.getall() - if not tags: - tags = self.process_xpath(response, self.get_selector_map('tags')).getall() - if tags: - return list(map(lambda x: string.capwords(x.strip()), tags)) - return [] - - def get_performers(self, response): - performers = [] - performers = response.xpath('//meta[@property="og:video:actor"]/@content') - if performers: - performers = performers.getall() - if not performers: - performers = self.process_xpath(response, self.get_selector_map('performers')).getall() - if performers: - return list(map(lambda x: string.capwords(x.strip()), performers)) - return [] - - def get_date(self, response): - scenedate = None - scenedate = response.xpath('//meta[@property="og:video:release_date"]/@content') - if scenedate: - scenedate = scenedate.get().strip() - if not scenedate: - scenedate = self.process_xpath(response, self.get_selector_map('date')) - if scenedate: - scenedate = scenedate.get() - scenedate = scenedate.replace('Released:', '').replace('Added:', '').strip() - else: - scenedate = response.xpath('//li/small[contains(text(),"Production")]/following-sibling::text()').get() - if scenedate: - scenedate = scenedate + "-01-01" - if not scenedate: - return datetime.now().isoformat() - - return dateparser.parse(scenedate).isoformat() - - def get_duration(self, response): - length = None - length = response.xpath('//meta[@property="og:video:duration"]/@content') - if length: - length = length.get() - if length: - if not int(length): - length = None - if not length: - length = super().get_duration(response) - if length: - length = length.lower() - if "hr" in length and "min" in length: - if re.search(r'(\d{1,2}).+?hr.+?(\d{1,2}).+?min', length): - length = re.search(r'(\d{1,2}).+?hr.+?(\d{1,2}).+?min', length) - hour = int(length.group(1)) -
minute = int(length.group(2)) - length = str((hour * 3600) + (minute * 60))  # hours/minutes to total seconds - return length - - def get_format(self, response): - movieformat = self.process_xpath(response, self.get_selector_map('format')) - if movieformat: - movieformat = list(map(lambda x: string.capwords(x.strip()), movieformat.getall())) - movieformat.sort() - movieformat = " / ".join(movieformat) - return movieformat - - return "Video on Demand" - - def clean_tags(self, tags): - cleanlist = [ - 'movie', - 'vod exclusive', - '4k', - 'hd', - 'feature', - '4k ultra hd', - ] - newlist = [] - for word in tags: - if word.lower() not in cleanlist: - newlist.append(word) - return newlist - - def get_image(self, response): - front = super().get_image(response) - if not front or ".com/" not in front: - front = response.xpath('//meta[@property="og:image"]/@content') - if front: - front = front.get() - if not front or ".com/" not in front: - front = response.xpath('//a[contains(@href, "h.jpg") and not(contains(@href, "bh.jpg"))]/@href') - if front: - front = front.get() - if front: - if ".com/" in front: - return self.format_link(response, front) - return None - - def get_back_image(self, response): - back = super().get_back_image(response) - if not back: - back = response.xpath('//a[contains(@href, "bh.jpg")]/@href') - if back: - back = back.get() - if back: - if ".com/" in back: - return self.format_link(response, back) - return None - - def parse_movie(self, response): - item = SceneItem() - num_scenes = response.xpath('//h3/a[contains(@label, "Scene Title")]') - if len(num_scenes) > 1 or not len(num_scenes): - item['title'] = self.clean_text(self.get_title(response)) - item['description'] = self.clean_text(self.get_description(response)) - item['store'] = "Adult DVD Empire" - item['date'] = self.get_date(response) - item['image'] = self.get_image(response) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['back'] = self.get_back_image(response) - item['back_blob'] = self.get_image_blob_from_link(item['back']) - item['performers'] = self.get_performers(response) - item['tags'] = self.get_tags(response) - item['id'] = self.get_id(response) - item['trailer'] = self.get_trailer(response) - item['network'] = "Adult DVD Empire" - item['site'] = self.get_studio(response) - item['parent'] = self.get_studio(response) - item['director'] = self.get_director(response) - item['format'] = self.get_format(response) - item['duration'] = self.get_duration(response) - item['sku'] = self.get_sku(response) - item['type'] = 'Movie' - - item['url'] = self.get_url(response) - - if self.days > 27375: - filter_date = '0000-00-00' - else: - days = self.days - filter_date = date.today() - timedelta(days) - filter_date = filter_date.strftime('%Y-%m-%d') - - foundpointer = 0 - matches = ['bangbros', 'jeffsmodels', 'private', 'dorcel', 'bluebirdfilms', 'privateblack', 'dorcelclub', 'evilangel', 'wicked'] - if item['title'] and item['site'] and not any(x in re.sub(r'[^a-zA-Z0-9]', '', item['site']).lower().replace(" ", "") for x in matches): - year = re.search(r'(\d{4})-\d{2}-\d{2}', item['date']).group(1) - teststring = item['title'] + year + item['site'] - teststring = re.sub(r'[^A-Za-z0-9#]+', '', teststring).lower() - if not os.path.exists('adedupelist.txt'): - Path('adedupelist.txt').touch() - with open('adedupelist.txt', 'r', encoding="utf-8") as file1: - for i in file1.readlines(): - if teststring in i: - foundpointer = 1 - break - - if not foundpointer and "dvd" not in item['format'].lower(): - with open('adedupelist.txt',
'a', encoding="utf-8") as file1: - file1.write(teststring + "\n") - - if self.debug: - if not item['date'] > filter_date: - item['filtered'] = 'movie filtered due to date restraint' - print(item) - else: - if filter_date: - if item['date'] > filter_date: - yield item - else: - yield item - else: - if "dvd" in item['format'].lower(): - print(f"Skipping {item['title']} due to dvd format") - elif foundpointer: - print(f"Skipping {item['title']} due to duplicate") - else: - print(f"Skipping {item['title']} but not sure why") - else: - print(f"Skipping {item['title']} Due to blocked Studio: {item['site']}") - else: - urltitle = re.search(r'.*/(.*?)$', response.url).group(1) - print(f"Skipping Due to Low Scene Count: {len(num_scenes)} {urltitle} :: ({response.url})") - - def check_json(self, jsondata, arg1, arg2=None): - jsondata = json.loads(jsondata) - if jsondata: - if arg2: - result = jsondata[arg1][arg2] - else: - result = jsondata[arg1] - return result - return None - - def clean_text(self, textstring): - if textstring is not None: - textstring = textstring.strip() - textstring = unidecode.unidecode(textstring) - textstring = html.unescape(textstring) - textstring = re.sub('<[^<]+?>', '', textstring) - return textstring diff --git a/scenes/moviesBang.py b/scenes/moviesBang.py deleted file mode 100644 index 74ba2247..00000000 --- a/scenes/moviesBang.py +++ /dev/null @@ -1,154 +0,0 @@ -import re -import html -import json -import unidecode -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class SiteBangMoviesSpider(BaseSceneScraper): - name = 'BangMovies' - network = 'Bang' - parent = 'Bang' - - start_urls = [ - 'https://www.bang.com', - ] - - selector_map = { - 'title': '', - 'description': '', - 'date': '', - 'image': '', - 'performers': '', - 'tags': '//div[contains(@class, "actions")]/a[contains(@href, "with")]/text()', - 'duration': '', - 'trailer': '', - 'external_id': r'video/(.*?)/', - 'pagination': '', - 'type': 'Scene', - } - - def get_next_page_url(self, base, page): - # ~ pagination = f"https://www.bang.com/movies?by=date.desc&page={page}" - pagination = f"https://www.bang.com/studio/157/video-art-holland/movies?by=date.desc&page={page}" - # ~ pagination = f"https://www.bang.com/studio/239/melting-images/movies?by=trending&page={page}" - # ~ pagination = f"https://www.bang.com/videos?by=date.desc&in=BANG%21%20Real%20Teens&page={page}" - # ~ pagination = f"https://www.bang.com/videos?in=BANG!%20Surprise&page={page}" - return pagination - - def parse(self, response, **kwargs): - meta = response.meta - movies = self.get_movies(response) - count = 0 - for movie in movies: - count += 1 - meta['movie'] = movie - yield movie - # ~ for sceneurl in movie['sceneurls']: - # ~ yield scrapy.Request(self.format_link(response, sceneurl), meta=meta, callback=self.parse_scene, headers=self.headers, cookies=self.cookies) - - if count: - if 'page' in response.meta and response.meta['page'] < self.limit_pages: - meta['page'] = meta['page'] + 1 - print('NEXT PAGE: ' + str(meta['page'])) - yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - - def get_movies(self, response): - meta = response.meta - movies = response.xpath('//div[contains(@class,"movie-preview")]/a/@href').getall() - for movie in movies: - movieurl = self.format_link(response, movie) - yield scrapy.Request(movieurl, callback=self.parse_movie, meta=meta, headers=self.headers, 
cookies=self.cookies) - - def parse_movie(self, response): - meta = response.meta - scene_count = response.xpath('//div[@class="scene-section" and not(contains(./div/h2/text(), "Bonus"))]') - if len(scene_count) > 1: - item = SceneItem() - jsondata = response.xpath('//script[contains(@type, "json") and contains(text(), "duration")]/text()') - if jsondata: - jsondata = json.loads(jsondata.get(), strict=False) - item['title'] = self.cleanup_title(unidecode.unidecode(jsondata['name'])).replace("&", "and") - item['date'] = jsondata['datePublished'] - if 'description' in jsondata: - item['description'] = html.unescape(jsondata['description']) - else: - item['description'] = '' - item['image'] = jsondata['thumbnailUrl'] - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['id'] = response.xpath('//a[contains(@href, "related-movie")]/@href').get() - item['id'] = re.search(r'movie=(.*)$', item['id']).group(1) - item['type'] = 'Movie' - item['url'] = response.url - item['duration'] = self.duration_to_seconds(jsondata['duration']) - item['performers'] = [] - for person in jsondata['actor']: - item['performers'].append(person['name']) - - item['tags'] = response.xpath('//div[@class="relative"]/div/a[@class="genres"]/text()').getall() - - item['site'] = re.sub('[^a-zA-Z0-9-]', '', response.xpath('//p[contains(text(), "Studio:")]/a/text()').get()) - item['trailer'] = "" - item['store'] = 'Bang' - item['network'] = 'Bang' - item['parent'] = item['site'] - - # ~ sceneurls = response.xpath('//div[@class="scene-section"]/div/div/a[contains(@href, "/video/")][1]/@href').getall() - item['scenes'] = [] - scenes = response.xpath('//div[@class="scene-section"]') - sceneurls = [] - for scene in scenes: - sceneurls.append(scene.xpath('.//h2/following-sibling::div[1]/a[1]/@href').get()) - sceneid = scene.xpath('.//div[contains(@class, "hidden") and contains(@class, "px-4")]/a[1]/@href').get() - if "related-video" in sceneid: - sceneid = re.search(r'related-video=(\w+)', sceneid).group(1) - item['scenes'].append({'site': item['site'], 'external_id': sceneid}) - meta['movie'] = item.copy() - # ~ matches = ['private', 'bluebird', 'lethalhardcore', 'thagson', 'samurai', 'premiumx', 'littledragon', 'karups', 'joybear', 'heatwave', 'fillyfilms', 'baeb'] - matches = ['private', 'lethalhardcore', 'thagson', 'samurai', 'premiumx', 'littledragon', 'karups', 'joybear', 'heatwave', 'fillyfilms', 'baeb'] - if not any(x in re.sub('[^a-zA-Z0-9]', '', item['site']).lower() for x in matches): - if item['id']: - yield self.check_item(item, self.days) - # ~ if self.check_item(item, self.days): - # ~ for sceneurl in sceneurls: - # ~ yield scrapy.Request(self.format_link(response, sceneurl), callback=self.parse_scene, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_scene(self, response): - meta = response.meta - item = SceneItem() - jsondata = response.xpath('//script[contains(@type, "json") and contains(text(), "duration")]/text()') - if jsondata: - jsondata = json.loads(jsondata.get(), strict=False) - item['title'] = self.cleanup_title(jsondata['name']) - item['date'] = jsondata['datePublished'] - if 'description' in jsondata: - item['description'] = html.unescape(jsondata['description']) - else: - item['description'] = '' - item['image'] = jsondata['thumbnailUrl'] - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['id'] = jsondata['@id'] - item['type'] = 'Scene' - item['url'] = response.url - item['duration'] = self.duration_to_seconds(jsondata['duration']) - 
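# duration_to_seconds() (inherited from BaseSceneScraper) turns the JSON-LD ISO 8601
# duration string (e.g. "PT1H30M45S") into a seconds value. A minimal sketch, assuming
# the base class does the equivalent of:
#     match = re.match(r'PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', iso_duration)
#     hours, minutes, seconds = (int(g) if g else 0 for g in match.groups())
#     return str(hours * 3600 + minutes * 60 + seconds)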
item['performers'] = [] - for person in jsondata['actor']: - item['performers'].append(person['name']) - - item['tags'] = self.get_tags(response) - site = jsondata['productionCompany']['name'] - item['site'] = re.sub('[^a-zA-Z0-9-]', '', site) - trailer = response.xpath('//video[@data-modal-target="videoImage"]/source[contains(@type, "mp4")]/@src') - if not trailer: - trailer = response.xpath('//video[@data-modal-target="videoImage"]/source[contains(@type, "webm")]/@src') - if trailer: - item['trailer'] = trailer.get() - else: - item['trailer'] = '' - item['movies'] = [{'site': meta['movie']['site'], 'external_id': meta['movie']['id']}] - item['network'] = 'Bang' - item['parent'] = 'Bang' - - yield self.check_item(item, self.days) diff --git a/scenes/moviesGayDVDEmpire.py b/scenes/moviesGayDVDEmpire.py deleted file mode 100644 index f4a3c032..00000000 --- a/scenes/moviesGayDVDEmpire.py +++ /dev/null @@ -1,195 +0,0 @@ -import re -import html -import string -import os.path -from datetime import date, datetime, timedelta -from pathlib import Path -import unidecode -import dateparser -import scrapy - -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class GayDVDEmpireMovieSpider(BaseSceneScraper): - name = 'GayDVDEmpireMovie' - store = "Gay DVD Empire" - - start_urls = [ - 'https://www.gaydvdempire.com' - ] - - custom_settings = {'AUTOTHROTTLE_ENABLED': 'True', 'AUTOTHROTTLE_DEBUG': 'False'} - - selector_map = { - 'title': '//div[contains(@class,"title-rating-section")]/div/h1/text()', - 'description': '//h4[contains(@class,"synopsis")]/p/text()', - 'date': '//li/small[contains(text(),"Released")]/following-sibling::text()', - 'image': '//a[@id="front-cover"]/img/@src', - 'back': '//a[@id="back-cover"]/@href', - 'performers': '//strong[contains(text(),"Starring")]/following-sibling::a/div//text()|//strong[contains(text(),"Starring")]/following-sibling::a//text()', - 'tags': '//strong[contains(text(),"Categories")]/following-sibling::a/text()', - 'external_id': r'/(\d+)/', - 'studio': '//li/small[contains(text(), "Studio:")]/following-sibling::a/text()', - 'director': '//a[@label="Director"]/text()', - 'format': '//div[contains(@class, "pricing")]/h2/text()[1]', - 'duration': '//li/small[contains(text(), "Length:")]/following-sibling::text()', - 'sku': '//li/small[contains(text(), "SKU:")]/following-sibling::text()', - 'pagination': '/new-release-gay-porn-movies.html?page=%s', - # ~ 'pagination': '/29773/studio/lethal-hardcore-porn-movies.html?page=%s&media=2', - } - - def get_scenes(self, response): - movies = response.xpath('//div[@class="product-card"]/div/a/@href').getall() - for movie in movies: - movie = movie.strip() - if re.search(self.get_selector_map('external_id'), movie): - yield scrapy.Request(url=self.format_link(response, movie), callback=self.parse_movie) - - def get_description(self, response): - description = response.xpath('//h4[contains(@class,"synopsis")]/p/text()|//h4[contains(@class,"synopsis")]/following-sibling::p/text()') - if description: - description = description.getall() - description = " ".join(description).replace(" ", " ").strip() - return description - return "" - - def get_tags(self, response): - if self.get_selector_map('tags'): - tags = self.process_xpath( - response, self.get_selector_map('tags')).getall() - if tags: - return self.clean_tags(list(map(lambda x: x.strip().title(), tags))) - return [] - - def get_date(self, response): - dvddate = self.process_xpath(response, self.get_selector_map('date')).get() - if 
dvddate: - dvddate = dvddate.replace('Released:', '').replace('Added:', '').strip() - else: - dvddate = response.xpath('//li/small[contains(text(),"Production")]/following-sibling::text()').get() - if dvddate: - dvddate = dvddate + "-01-01" - if not dvddate: - return datetime.now().isoformat() - - return dateparser.parse(dvddate.strip()).isoformat() - - def get_duration(self, response): - length = super().get_duration(response) - if length: - length = length.lower() - if "hr" in length and "min" in length: - if re.search(r'(\d{1,2}).+?hr.+?(\d{1,2}).+?min', length): - length = re.search(r'(\d{1,2}).+?hr.+?(\d{1,2}).+?min', length) - hour = int(length.group(1)) - minute = int(length.group(2)) - length = str((hour * 3600) + (minute * 60))  # hours/minutes to total seconds - elif "min" in length: - if re.search(r'(\d{1,2}).+?min', length): - length = re.search(r'(\d{1,2}).+?min', length) - minute = int(length.group(1)) - length = str((minute * 60)) - else: - length = None - return length - - def get_format(self, response): - if 'format' in self.get_selector_map(): - if self.get_selector_map('format'): - movieformat = self.process_xpath(response, self.get_selector_map('format')) - if movieformat: - movieformat = list(map(lambda x: string.capwords(x.strip()), movieformat.getall())) - movieformat.sort() - movieformat = " / ".join(movieformat) - return movieformat - - return "Dvd" - - def clean_tags(self, tags): - cleanlist = [ - 'movie', - 'vod exclusive', - '4k', - 'hd', - 'feature', - '4k ultra hd', - 'boxed sets', - ] - newlist = [] - for word in tags: - if word.lower() not in cleanlist: - newlist.append(word) - return newlist - - def parse_movie(self, response): - item = SceneItem() - - item['title'] = self.clean_text(self.get_title(response)) - item['title'] = re.sub(r'\(.*?dvd.*?\)|\(.*?blu-ray.*?\)|\(.*?combo.*?\)', '', item['title'], flags=re.IGNORECASE) - item['description'] = self.clean_text(self.get_description(response)) - item['store'] = "Gay DVD Empire" - item['date'] = self.get_date(response) - item['image'] = self.get_image(response) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['back'] = self.get_back_image(response) - item['back_blob'] = self.get_image_blob_from_link(item['back']) - item['performers'] = self.get_performers(response) - item['tags'] = self.get_tags(response) - item['id'] = self.get_id(response) - item['trailer'] = self.get_trailer(response) - item['network'] = "Adult DVD Empire" - item['site'] = self.get_studio(response) - item['parent'] = self.get_studio(response) - item['director'] = self.get_director(response) - item['format'] = self.get_format(response) - item['duration'] = self.get_duration(response) - item['sku'] = self.get_sku(response) - item['type'] = 'Movie' - - item['url'] = self.get_url(response) - - if self.days > 27375: - filter_date = '0000-00-00' - else: - days = self.days - filter_date = date.today() - timedelta(days) - filter_date = filter_date.strftime('%Y-%m-%d') - - foundpointer = 0 - matches = ['bangbros', 'jeffsmodels', 'private', 'dorcel', 'bluebirdfilms', 'privateblack'] - if item['title'] and item['site'] and not any(x in re.sub(r'[^a-zA-Z0-9]', '', item['site']).lower().replace(" ", "") for x in matches): - year = re.search(r'(\d{4})-\d{2}-\d{2}', item['date']).group(1) - teststring = item['title'] + year + item['site'] - teststring = re.sub(r'[^A-Za-z0-9#]+', '', teststring).lower() - if not os.path.exists('adedupelist.txt'): - Path('adedupelist.txt').touch() - with open('adedupelist.txt', 'r', encoding="utf-8") as file1: - for i in
file1.readlines(): - if teststring in i: - foundpointer = 1 - break - - if not foundpointer and "dvd" not in item['format'].lower(): - with open('adedupelist.txt', 'a', encoding="utf-8") as file1: - file1.write(teststring + "\n") - - if self.debug: - if not item['date'] > filter_date: - item['filtered'] = 'movie filtered due to date restraint' - print(item) - else: - if filter_date: - if item['date'] > filter_date: - yield item - else: - yield item - - def clean_text(self, textstring): - if textstring is not None: - textstring = textstring.strip() - textstring = unidecode.unidecode(textstring) - textstring = html.unescape(textstring) - textstring = re.sub('<[^<]+?>', '', textstring) - return textstring diff --git a/scenes/moviesGayDVDEmpire_VOD.py b/scenes/moviesGayDVDEmpire_VOD.py deleted file mode 100644 index 7dce73c7..00000000 --- a/scenes/moviesGayDVDEmpire_VOD.py +++ /dev/null @@ -1,297 +0,0 @@ -import re -import json -import html -import string -import os.path -from datetime import date, datetime, timedelta -from pathlib import Path -import unidecode -import dateparser -import scrapy - -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class GayDVDEmpireMovieSpider(BaseSceneScraper): - name = 'GayDVDEmpireMovie_VOD' - store = "Adult DVD Empire" - - start_urls = [ - 'https://www.gaydvdempire.com' - ] - - custom_settings = {'AUTOTHROTTLE_ENABLED': 'True', 'AUTOTHROTTLE_DEBUG': 'False'} - - selector_map = { - 'title': '//div[contains(@class,"title-rating-section")]/div/h1/text()', - 'description': '//h4[contains(@class,"synopsis")]/p/text()', - 'date': '//li/small[contains(text(),"Released")]/following-sibling::text()', - 'image': '//a[@id="front-cover"]/img/@src', - 'back': '//a[@id="back-cover"]/@href', - 'performers': '//strong[contains(text(),"Starring")]/following-sibling::a/div//text()|//strong[contains(text(),"Starring")]/following-sibling::a//text()', - 'tags': '//strong[contains(text(),"Categories")]/following-sibling::a/text()', - 'external_id': r'/(\d+)/', - 'studio': '//li/small[contains(text(), "Studio:")]/following-sibling::a/text()', - 'director': '//a[@label="Director"]/text()', - 'format': '//div[contains(@class, "pricing")]/h2/text()[1]', - 'duration': '//li/small[contains(text(), "Length:")]/following-sibling::text()', - 'sku': '//li/small[contains(text(), "SKU:")]/following-sibling::text()', - 'pagination': '/new-release-gay-porn-videos.html?page=%s&media=14' - } - - def get_scenes(self, response): - movies = response.xpath('//div[@class="product-card"]/div/a/@href').getall() - for movie in movies: - movie = movie.strip() - if re.search(self.get_selector_map('external_id'), movie): - yield scrapy.Request(url=self.format_link(response, movie), callback=self.parse_movie) - - def get_title(self, response): - jsondata = response.xpath('//script[contains(text(), "uploadDate")][1]/text()') - title = None - if jsondata: - jsondata = jsondata.get() - title = self.check_json(jsondata, 'name') - if not title: - title = response.xpath('//meta[@name="og:title"]/@content') - if title: - title = title.get() - title = re.search(r'(.*?)\|', title) - if title: - title = title.group(1) - title = re.sub(r'\(\d{4}\)', '', title).strip() - if not title: - title = super().get_title(response) - return self.clean_text(title) - - def get_studio(self, response): - jsondata = response.xpath('//script[contains(text(), "uploadDate")][1]/text()') - studio = None - if jsondata: - jsondata = jsondata.get() - studio = self.check_json(jsondata, 
'productionCompany', 'name') - if not studio: - studio = super().get_studio(response) - return studio.strip() - - def get_description(self, response): - jsondata = response.xpath('//script[contains(text(), "uploadDate")][1]/text()') - description = None - if jsondata: - jsondata = jsondata.get() - description = self.check_json(jsondata, 'description') - if not description: - description = response.xpath('//h4[contains(@class,"synopsis")]/p/text()|//h4[contains(@class,"synopsis")]/following-sibling::p/text()') - if description: - description = description.getall() - description = " ".join(description).replace(" ", " ").strip() - if not description: - description = super().get_description(response) - return self.clean_text(description) - - def get_tags(self, response): - tags = [] - tags = response.xpath('//meta[@property="og:video:tag"]/@content') - if tags: - tags = tags.getall() - if not tags: - tags = super().get_tags(response) - if tags: - return list(map(lambda x: string.capwords(x.strip()), tags)) - return [] - - def get_performers(self, response): - performers = [] - performers = response.xpath('//meta[@property="og:video:actor"]/@content') - if performers: - performers = performers.getall() - if not performers: - performers = super().get_performers(response) - if performers: - return list(map(lambda x: string.capwords(x.strip()), performers)) - return [] - - def get_date(self, response): - scenedate = None - scenedate = response.xpath('//meta[@property="og:video:release_date"]/@content') - if scenedate: - scenedate = scenedate.get().strip() - if not scenedate: - scenedate = self.process_xpath(response, self.get_selector_map('date')) - if scenedate: - scenedate = scenedate.get() - scenedate = scenedate.replace('Released:', '').replace('Added:', '').strip() - else: - scenedate = response.xpath('//li/small[contains(text(),"Production")]/following-sibling::text()').get() - if scenedate: - scenedate = scenedate + "-01-01" - if not scenedate: - return super().get_date(response) - - return dateparser.parse(scenedate).isoformat() - - def get_duration(self, response): - length = None - length = response.xpath('//meta[@property="og:video:duration"]/@content') - if length: - length = length.get() - if length: - if not int(length): - length = None - if not length: - length = super().get_duration(response) - if length: - length = length.lower() - if "hr" in length and "min" in length: - if re.search(r'(\d{1,2}).+?hr.+?(\d{1,2}).+?min', length): - length = re.search(r'(\d{1,2}).+?hr.+?(\d{1,2}).+?min', length) - hour = int(length.group(1)) - minute = int(length.group(2)) - length = str((hour * 3600) + (minute * 60))  # hours/minutes to total seconds - return length - - def get_format(self, response): - movieformat = self.process_xpath(response, self.get_selector_map('format')) - if movieformat: - movieformat = list(map(lambda x: string.capwords(x.strip()), movieformat.getall())) - movieformat.sort() - movieformat = " / ".join(movieformat) - return movieformat - - return "Video on Demand" - - def clean_tags(self, tags): - cleanlist = [ - 'movie', - 'vod exclusive', - '4k', - 'hd', - 'feature', - '4k ultra hd', - ] - newlist = [] - for word in tags: - if word.lower() not in cleanlist: - newlist.append(word) - return newlist - - def get_image(self, response): - front = super().get_image(response) - if not front or ".com/" not in front: - front = response.xpath('//meta[@property="og:image"]/@content') - if front: - front = front.get() - if not front or ".com/" not in front: - front =
response.xpath('//a[contains(@href, "h.jpg") and not(contains(@href, "bh.jpg"))]/@href') - if front: - front = front.get() - if front: - if ".com/" in front: - return self.format_link(response, front) - return None - - def get_back_image(self, response): - back = super().get_back_image(response) - if not back: - back = response.xpath('//a[contains(@href, "bh.jpg")]/@href') - if back: - back = back.get() - if back: - if ".com/" in back: - return self.format_link(response, back) - return None - - def parse_movie(self, response): - item = SceneItem() - num_scenes = response.xpath('//h3/a[contains(@label, "Scene Title")]') - if len(num_scenes) > 1 or not len(num_scenes): - item['title'] = self.clean_text(self.get_title(response)) - item['description'] = self.clean_text(self.get_description(response)) - item['store'] = "Gay DVD Empire" - item['date'] = self.get_date(response) - item['image'] = self.get_image(response) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['back'] = self.get_back_image(response) - item['back_blob'] = self.get_image_blob_from_link(item['back']) - item['performers'] = self.get_performers(response) - item['tags'] = self.get_tags(response) - item['id'] = self.get_id(response) - item['trailer'] = self.get_trailer(response) - item['network'] = "Adult DVD Empire" - item['site'] = self.get_studio(response) - item['parent'] = self.get_studio(response) - item['director'] = self.get_director(response) - item['format'] = self.get_format(response) - item['duration'] = self.get_duration(response) - item['sku'] = self.get_sku(response) - item['type'] = 'Movie' - - item['url'] = self.get_url(response) - - if self.days > 27375: - filter_date = '0000-00-00' - else: - days = self.days - filter_date = date.today() - timedelta(days) - filter_date = filter_date.strftime('%Y-%m-%d') - - foundpointer = 0 - matches = ['bangbros', 'jeffsmodels', 'private', 'dorcel', 'bluebirdfilms', 'privateblack'] - if item['title'] and item['site'] and not any(x in re.sub(r'[^a-zA-Z0-9]', '', item['site']).lower().replace(" ", "") for x in matches): - year = re.search(r'(\d{4})-\d{2}-\d{2}', item['date']).group(1) - teststring = item['title'] + year + item['site'] - teststring = re.sub(r'[^A-Za-z0-9#]+', '', teststring).lower() - if not os.path.exists('adedupelist.txt'): - Path('adedupelist.txt').touch() - with open('adedupelist.txt', 'r', encoding="utf-8") as file1: - for i in file1.readlines(): - if teststring in i: - foundpointer = 1 - break - - if not foundpointer and "dvd" not in item['format'].lower(): - with open('adedupelist.txt', 'a', encoding="utf-8") as file1: - file1.write(teststring + "\n") - - if self.debug: - if not item['date'] > filter_date: - item['filtered'] = 'movie filtered due to date restraint' - print(item) - else: - if filter_date: - if item['date'] > filter_date: - yield item - else: - yield item - else: - if "dvd" in item['format'].lower(): - print(f"Skipping {item['title']} due to dvd format") - elif foundpointer: - print(f"Skipping {item['title']} due to duplicate") - else: - print(f"Skipping {item['title']} but not sure why") - else: - print(f"Skipping {item['title']} Due to blocked Studio: {item['site']}") - else: - urltitle = re.search(r'.*/(.*?)$', response.url).group(1) - print(f"Skipping Due to Low Scene Count: {len(num_scenes)} {urltitle} :: ({response.url})") - - def check_json(self, jsondata, arg1, arg2=None): - jsondata = json.loads(jsondata) - if jsondata: - if arg2: - result = jsondata[arg1][arg2] - else: - result = jsondata[arg1] - return 
result - return None - - def clean_text(self, textstring): - if textstring is not None: - textstring = textstring.strip() - textstring = unidecode.unidecode(textstring) - textstring = html.unescape(textstring) - textstring = re.sub('<[^<]+?>', '', textstring) - return textstring diff --git a/scenes/moviesJMElite.py b/scenes/moviesJMElite.py deleted file mode 100644 index 32ec4cb1..00000000 --- a/scenes/moviesJMElite.py +++ /dev/null @@ -1,132 +0,0 @@ -import re -import html -import unidecode -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class MoviesJMEliteSpider(BaseSceneScraper): - name = 'JMEliteMovies' - - start_urls = [ - 'https://www.jacquieetmichelelite.com' - ] - - custom_scraper_settings = { - 'USER_AGENT':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62', - 'AUTOTHROTTLE_ENABLED': True, - 'AUTOTHROTTLE_START_DELAY': 1, - 'AUTOTHROTTLE_MAX_DELAY': 120, - 'CONCURRENT_REQUESTS': 1, - 'RANDOMIZE_DOWNLOAD_DELAY': True, - 'CONCURRENT_REQUESTS_PER_DOMAIN': 1, - 'CONCURRENT_REQUESTS_PER_IP': 1, - } - - cookies = { - 'dscl': '1', - 'ppndr': '1', - 'promo-widget-head': '1', - 'force-my18pass-refresh': '0', - } - - selector_map = { - 'title': '//h1[contains(@class,"video-detail__title")]/text()', - 'description': '//div[contains(@class,"video-detail__description")]/text()', - 'date': '//script[contains(@type, "json")]/text()', - 're_date': r'datePublished.*?(\d{4}-\d{2}-\d{2})', - 'duration': '//script[contains(@type, "json")]/text()', - 're_duration': r'duration.*?T(.*?)\"', - 'image': '//img[contains(@class,"video-detail") and contains(@class, "poster")]/@src', - 'performers': '//p[contains(@class,"actor-item") and contains(@class,"title")]/text()', - 'tags': '', - 'studio': '//strong[contains(text(), "Studio:")]/following-sibling::a/text()', - 'director': '//ul[@class="video-detail__infos"]/li[3]/text()', - 'external_id': r'elite/(\d+)/', - # ~ 'pagination': '/en/porn-movies-p-%s.html' - 'pagination': '/en/porn-movies-jacquie-et-michel-elite-f-1354-p-%s.html' - } - - def get_scenes(self, response): - meta = response.meta - scenes = response.xpath( - '//a[@class="video-item"]/@href').getall() - for scene in scenes: - if re.search(self.get_selector_map('external_id'), scene): - yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_movie, cookies=self.cookies, headers=self.headers, meta=meta) - - def parse_movie(self, response): - meta = response.meta - scenes = response.xpath('//a[@class="scene-item"]/@href').getall() - item = SceneItem() - item['title'] = self.get_title(response) - item['date'] = self.get_date(response) - item['description'] = self.get_description(response) - item['image'] = self.get_image(response) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['director'] = self.get_director(response) - item['performers'] = self.get_performers(response) - item['duration'] = self.get_duration(response) - item['tags'] = [] - item['trailer'] = '' - item['type'] = 'Movie' - item['network'] = 'Jacquie et Michel' - item['parent'] = 'Jacquie et Michel Elite' - studio = response.xpath('//strong[contains(text(), "Studio:")]/following-sibling::a/text()') - if studio: - item['site'] = studio.get() - else: - item['site'] = 'Jacquie et Michel Elite' - item['store'] = 'Jacquie et Michel Elite' - item['url'] = response.url - item['id'] = re.search(r'elite/(\d+)/', response.url).group(1) - item['scenes'] = [] - 
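# Each entry appended to item['scenes'] pairs the site name with the numeric id parsed
# from the scene URL, which is how the backend links individual scenes to this movie.
# The resulting structure looks like (values hypothetical):
#     item['scenes'] == [{'site': 'Jacquie et Michel Elite', 'external_id': '12345'}]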
for sceneurl in scenes: - item['scenes'].append({'site': item['site'], 'external_id': re.search(r'show/(\d+)/', sceneurl).group(1)}) - meta['movie'] = item - yield item - for sceneurl in scenes: - yield scrapy.Request(self.format_link(response, sceneurl), callback=self.parse_scene, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_scene(self, response): - meta = response.meta - movie = meta['movie'] - item = SceneItem() - jsondata = response.xpath('//script[contains(text(), "datePublished")]/text()').get() - item['title'] = unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', response.xpath('//h1[contains(@class, "title")]/text()').get().strip()))) - item['date'] = re.search(r'datePublished.*?(\d{4}-\d{2}-\d{2})', jsondata).group(1) - description = response.xpath('//div[contains(@class,"description") and contains(@class,"video-detail")]/text()') - if description: - item['description'] = description.get().strip() - else: - item['description'] = '' - image = response.xpath('//video/@poster') - if image: - item['image'] = image.get() - else: - item['image'] = None - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['director'] = movie['director'] - item['performers'] = response.xpath('//p[contains(@class,"actor-item") and contains(@class,"title")]/text()').getall() - item['performers'] = list(map(lambda x: x.strip(), item['performers'])) - item['tags'] = [] - item['trailer'] = '' - item['type'] = 'Scene' - item['network'] = 'Jacquie et Michel' - item['parent'] = 'Jacquie et Michel Elite' - studio = response.xpath('//strong[contains(text(), "Studio:")]/following-sibling::a/text()') - if studio: - item['site'] = studio.get() - else: - item['site'] = 'Jacquie et Michel Elite' - duration = re.search(r'duration.*?(T\d.*?S)', jsondata) - if duration: - duration = duration.group(1) - item['duration'] = self.duration_to_seconds(duration) - else: - item['duration'] = None - - item['url'] = response.url - item['id'] = re.search(r'show/(\d+)/', response.url).group(1) - yield item diff --git a/scenes/moviesPinkLabel.py b/scenes/moviesPinkLabel.py deleted file mode 100644 index 037a386b..00000000 --- a/scenes/moviesPinkLabel.py +++ /dev/null @@ -1,90 +0,0 @@ -import dateparser -import scrapy -import json -import re -import scrapy -import string - -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem -import dateparser - -class PinkLabelSpider(BaseSceneScraper): - name = 'PinkLabel' - network = "PinkLabel" - parent = "PinkLabel" - - selector_map = { - 'external_id': 'film\\/(.+)\\/', - 'title': '//div[@class="col-md-12 col-sm-8"]/h1/text()', - 'description': '', - 'date': '', - 'image': '//img[@class="img-responsive wp-post-image"]/@src', - 'performers': '//a[contains(@href,"/performer/")]/text()', - 'tags': '//a[contains(@href,"/tag/")]/text()', - 'trailer': '', - } - - def start_requests(self): - yield scrapy.Request(url="https://pinklabel.tv/on-demand/studios/", callback=self.get_studios, headers=self.headers, cookies=self.cookies) - - - def get_title(self, response): - title = self.process_xpath( - response, self.get_selector_map('title')).get() - if title: - return string.capwords(title.strip()) - return '' - - def get_tags(self, response): - if self.get_selector_map('tags'): - tags = self.process_xpath( - response, self.get_selector_map('tags')).getall() - if tags: - return list(map(lambda x: x.strip().title(), tags)) - return [] - - def get_studios(self, response): - '''Request each individual studio page''' - studios = 
response.xpath("//div[@class='well']/a/@href") - for studio in studios: - yield scrapy.Request( - url=studio.get(), - callback=self.get_scenes) - - def get_scenes(self, response): - '''Request each individual scene page''' - scenes = response.xpath("//a[@class='epiLink']/@href") - for scene in scenes: - yield scrapy.Request( - url=scene.get().split("?")[0], - callback=self.parse_scene) - - def get_description(self, response): - description = response.xpath('//div[@class="ep-description"]/span/p').getall() - if not isinstance(description, str): - description = "\n\n".join(description) - description = re.sub('<[^<]+?>', '', description).strip() - return description - - def get_trailer(self, response): - trailer = re.search("http.*\.mp4", response.text) - if trailer: - return trailer.group(0) - return '' - - def get_date(self, response): - metadata = response.xpath("//script[@class='yoast-schema-graph']//text()").get() - metadata = json.loads(metadata)["@graph"] - for data in metadata: - if data["@type"] == "WebPage": - return dateparser.parse(data["datePublished"]).isoformat() - - def parse_scene(self, response): - '''Override studio with correct value''' - for item in super().parse_scene(response): - studio = response.xpath('//a[contains(@href,"/studio/")]/text()')[0].get() - item["parent"] = "PinkLabel" - item["network"] = "PinkLabel" - item["site"] = studio - yield item diff --git a/scenes/moviesR18.py b/scenes/moviesR18.py deleted file mode 100644 index 80a33fd9..00000000 --- a/scenes/moviesR18.py +++ /dev/null @@ -1,227 +0,0 @@ -import re -import html -import textwrap -import unidecode -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class MoviesR18Spider(BaseSceneScraper): - name = 'R18Movies' - network = 'R18' - parent = 'R18' - site = 'R18' - - start_urls = [ - 'https://www.r18.com', - ] - - custom_settings = {'AUTOTHROTTLE_ENABLED': 'True', 'AUTOTHROTTLE_DEBUG': 'False'} - - selector_map = { - 'external_id': '(\\d+)$', - 'pagination': '/videos/vod/movies/list/?page=%s' - } - - def get_scenes(self, response): - meta = response.meta - scenes = response.xpath('//li[@class="item-list"]/@data-content_id').getall() - for scene in scenes: - scene = f"https://www.r18.com/api/v4f/contents/{scene.strip()}?lang=en&unit=USD" - yield scrapy.Request(scene, callback=self.parse_scene, meta=meta) - - def parse_scene(self, response): - jsondata = response.json() - scene = jsondata['data'] - item = SceneItem() - - title = '' - if scene['dvd_id']: - item['id'] = scene['dvd_id'].upper().strip() - else: - item['id'] = scene['content_id'].upper().strip() - - if not title: - title = self.cleanup_title(scene['title']) - - item['title'] = unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', title)).strip()) - item['title'] = self.uncensor_title(item['title']) - - if len(item['title']) > 100: - item['title'] = textwrap.wrap(item['title'], 100)[0] + "..." 
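# textwrap.wrap() splits on whitespace, so element [0] is the longest prefix of at most
# 100 characters that still ends on a word boundary; the appended "..." marks the cut.
# For example (hypothetical title):
#     textwrap.wrap("An Extremely Long Compilation Title " * 5, 100)[0] + "..."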
- item['title'] = f"{self.cleanup_title(item['title'])} - {item['id']}" - item['description'] = self.uncensor_title(scene['title']) - if scene['release_date']: - item['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', scene['release_date']).group(1) - else: - item['date'] = self.parse_date('today').isoformat() - - images = scene['images'] - if "jacket_image" in images: - images = images['jacket_image'] - - item['back'] = None - item['back_blob'] = None - if 'medium' in images: - item['back'] = images['medium'] - elif 'small' in images: - item['back'] = images['small'] - else: - item['back'] = None - if item['back']: - item['back_blob'] = self.get_image_blob_from_link(item['back']) - else: - item['back_blob'] = None - - item['image'] = None - item['image_blob'] = None - if 'large' in images: - item['image'] = images['large'] - elif 'medium' in images: - item['image'] = images['medium'] - if item['image']: - item['image_blob'] = self.get_image_blob_from_link(item['image']) - - item['performers'] = [] - if scene['actresses']: - for performer in scene['actresses']: - item['performers'].append(self.cleanup_title(unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', performer['name'])).strip()))) - - item['tags'] = [] - if scene['categories']: - for category in scene['categories']: - item['tags'].append(self.cleanup_title(unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', category['name'])).strip()))) - if "Asian" not in item['tags']: - item['tags'].append("Asian") - if "JAV" not in item['tags']: - item['tags'].append("JAV") - - item['parent'] = "R18" - item['site'] = "R18" - if scene['maker']: - if scene['maker']['name']: - item['parent'] = scene['maker']['name'] - item['site'] = scene['maker']['name'] - - item['network'] = 'R18' - item['director'] = scene['director'] - item['format'] = "DVD/VOD" - item['duration'] = str(int(scene['runtime_minutes']) * 60) - item['sku'] = scene['content_id'] - if 'id' not in item: - item['id'] = item['sku'] - item['url'] = f"https://www.r18.com/videos/vod/movies/detail/-/id={item['sku']}/" - item['type'] = "JAV" - - item['trailer'] = None - if scene['sample']: - if "high" in scene['sample']: - item['trailer'] = scene['sample']['high'] - elif "medium" in scene['sample']: - item['trailer'] = scene['sample']['medium'] - elif "small" in scene['sample']: - item['trailer'] = scene['sample']['small'] - - yield self.check_item(item, self.days) - - def uncensor_title(self, title): - title = title.replace("A*****t", "Assault") - title = title.replace("A****p", "Asleep") - title = title.replace("A***e", "Abuse") - title = title.replace("B***d", "Blood") - title = title.replace("B**d", "Bled") - title = title.replace("C***d", "Child") - title = title.replace("C*ck", "Cock") - title = title.replace("D******e", "Disgrace") - title = title.replace("D***king", "Drinking") - title = title.replace("D***k", "Drunk") - title = title.replace("D**g", "Drug") - title = title.replace("F*****g", "Forcing") - title = title.replace("F***e", "Force") - title = title.replace("G*******g", "Gangbang") - title = title.replace("G******g", "Gang Bang") - title = title.replace("H*********n", "Humiliation") - title = title.replace("H*******e", "Hypnotize") - title = title.replace("H*******m", "Hypnotism") - title = title.replace("H**t", "Hurt") - title = title.replace("I****t", "Incest") - title = title.replace("K****p", "Kidnap") - title = title.replace("K****r", "Killer") - title = title.replace("K**l", "Kill") - title = title.replace("K*d", "Kid") - title = title.replace("L****a", "Lolita") 
- title = title.replace("M************n", "Mother And Son") - title = title.replace("M****t", "Molest") - title = title.replace("P********t", "Passed Out") - title = title.replace("P****h", "Punish") - title = title.replace("R****g", "Raping") - title = title.replace("R**e", "Rape") - title = title.replace("RStepB****************r", "Stepbrother and Sister") - title = title.replace("S*********l", "School Girl") - title = title.replace("S**********s", "School Girls") - title = title.replace("S********l", "Schoolgirl") - title = title.replace("S*********s", "Schoolgirls") - title = title.replace("S******g", "Sleeping") - title = title.replace("S*****t", "Student") - title = title.replace("S***e", "Slave") - title = title.replace("S**t", "Scat") - title = title.replace("Sch**l", "School") - title = title.replace("StepM************n", "Stepmother and Son") - title = title.replace("T******e", "Tentacle") - title = title.replace("T*****e", "Torture") - title = title.replace("U*********s", "Unconscious") - title = title.replace("V*****e", "Violate") - title = title.replace("V*****t", "Violent") - title = title.replace("Y********l", "Young Girl") - title = title.replace("A*****t", "Assault") - title = title.replace("a*****t", "assault") - title = title.replace("a****p", "asleep") - title = title.replace("a***e", "abuse") - title = title.replace("b***d", "blood") - title = title.replace("b**d", "bled") - title = title.replace("c***d", "child") - title = title.replace("c*ck", "cock") - title = title.replace("d******e", "disgrace") - title = title.replace("d***king", "drinking") - title = title.replace("d***k", "drunk") - title = title.replace("d**g", "drug") - title = title.replace("f*****g", "forcing") - title = title.replace("f***e", "force") - title = title.replace("g*******g", "gangbang") - title = title.replace("g******g", "gang bang") - title = title.replace("h*********n", "humiliation") - title = title.replace("h*******e", "hypnotize") - title = title.replace("h*******m", "hypnotism") - title = title.replace("h**t", "hurt") - title = title.replace("i****t", "incest") - title = title.replace("k****p", "kidnap") - title = title.replace("k****r", "killer") - title = title.replace("k**l", "kill") - title = title.replace("k*d", "kid") - title = title.replace("l****a", "lolita") - title = title.replace("m************n", "mother and son") - title = title.replace("m****t", "molest") - title = title.replace("p********t", "passed out") - title = title.replace("p****h", "punish") - title = title.replace("r****g", "raping") - title = title.replace("r**e", "rape") - title = title.replace("rstepb****************r", "stepbrother and sister") - title = title.replace("s*********l", "school girl") - title = title.replace("s********l", "schoolgirl") - title = title.replace("s**********s", "school girls") - title = title.replace("s*********s", "schoolgirls") - title = title.replace("s******g", "sleeping") - title = title.replace("s*****t", "student") - title = title.replace("s***e", "slave") - title = title.replace("s**t", "scat") - title = title.replace("sch**l", "school") - title = title.replace("stepm************n", "stepmother and son") - title = title.replace("t******e", "tentacle") - title = title.replace("t*****e", "torture") - title = title.replace("u*********s", "unconscious") - title = title.replace("v*****e", "violate") - title = title.replace("v*****t", "violent") - title = title.replace("y********l", "young girl") - - return title diff --git a/scenes/moviesTinoShop.py b/scenes/moviesTinoShop.py 
deleted file mode 100644 index b863a969..00000000 --- a/scenes/moviesTinoShop.py +++ /dev/null @@ -1,80 +0,0 @@ -import re -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class MoviesTinoShopSpider(BaseSceneScraper): - name = 'TinoShop' - network = 'Tino Media' - parent = 'Tino Media' - site = 'Tino Media' - - start_url = 'https://tinoshop.com' - paginations = [ - '/en/Teens?cat=1&next_page=%s', - '/en/Bi-DVDs?cat=1&next_page=%s', - '/en/Gay?cat=2&next_page=%s', - ] - - selector_map = { - 'title': '', - 'description': '', - 'date': '', - 'image': '', - 'performers': '', - 'tags': '', - 'duration': '', - 'trailer': '', - 'external_id': r'', - 'pagination': '', - 'type': 'Movie', - } - - - def start_requests(self): - meta = {} - meta['page'] = self.page - - for pagination in self.paginations: - meta['pagination'] = pagination - yield scrapy.Request(url=self.get_next_page_url(self.start_url, self.page, pagination), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse(self, response, **kwargs): - scenes = self.get_scenes(response) - count = 0 - for scene in scenes: - count += 1 - yield scene - - if count: - if 'page' in response.meta and response.meta['page'] < self.limit_pages: - meta = response.meta - meta['page'] = meta['page'] + 1 - print('NEXT PAGE: ' + str(meta['page'])) - yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - - def get_next_page_url(self, base, page, pagination): - return self.format_url(base, pagination % page) - - def get_scenes(self, response): - scenes = response.xpath('//table[@class="productPreview"]') - for scene in scenes: - item = SceneItem() - item['title'] = self.cleanup_title(scene.xpath('.//h2/a/text()').get()) - item['date'] = None - item['image'] = scene.xpath('.//a/img/@src').get() - item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['url'] = scene.xpath('.//h2/a/@href').get() - item['tags'] = ['European'] - item['performers'] = [] - item['description'] = "" - item['type'] = 'Movie' - item['id'] = scene.xpath('.//input[@type="hidden" and @name="product"]/@value').get() - item['trailer'] = None - item['site'] = 'Tino Media' - item['parent'] = 'Tino Media' - item['network'] = 'Tino Media' - yield item - - diff --git a/scenes/moviesTreasureIslandMedia.py b/scenes/moviesTreasureIslandMedia.py deleted file mode 100644 index 34168423..00000000 --- a/scenes/moviesTreasureIslandMedia.py +++ /dev/null @@ -1,55 +0,0 @@ -import re -import string -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper - - -class MoviesTreasureIslandMediaSpider(BaseSceneScraper): - name = 'TreasureIslandMediaMovies' - network = 'Treasure Island Media' - parent = 'Treasure Island Media' - site = 'Treasure Island Media' - - start_urls = [ - 'https://timstore.treasureislandmedia.com', - ] - - selector_map = { - 'title': '//h1[contains(@class,"ty-product-block-title")]//text()', - 'description': '//div[contains(@id,"content_description")]/div/p//text()', - 'date': '//span[@class="release-original-inner"]/text()', - 'date_formats': ['%B %Y'], - 'image': '//meta[@property="og:image"]/@content', - 'performers': '//div[contains(@class,"row model-timremoteapis-wrap-row")]//a[contains(@class,"thumbnail-subtitle")]/text()', - 'tags': '', - 'duration': '//span[@class="run-time-inner"]/text()', - 'trailer': '', - 'external_id': r'.*/(.*?)/', - 
'pagination': '/movies/page-%s/?features_hash=1677_24-5855', - 'type': 'Movie', - } - - def get_scenes(self, response): - meta = response.meta - scenes = response.xpath('//div[@class="ty-grid-list__image"]/a/@href').getall() - for scene in scenes: - if re.search(self.get_selector_map('external_id'), scene): - yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) - - def get_title(self, response): - title = super().get_title(response) - title = title.lower() - title = title.replace("(dvd)", "") - title = string.capwords(title) - return title - - def get_duration(self, response): - duration = super().get_duration(response) - duration = duration.lower() - duration = duration.replace(" ", "") - if "h" in duration and "m" in duration: - hours = re.search(r'(\d+)h', duration).group(1) - mins = re.search(r'(\d+)m', duration).group(1) - duration = str(((int(hours) * 60) * 60) + (int(mins) * 60)) - return duration - return None diff --git a/scenes/networkAdultCentro.py b/scenes/networkAdultCentro.py index 807782ad..fad35037 100644 --- a/scenes/networkAdultCentro.py +++ b/scenes/networkAdultCentro.py @@ -94,6 +94,13 @@ class NetworkAdultCentroSpider(BaseSceneScraper): ['https://idreamofjo.com', '&transitParameters[v1]=ykYa8ALmUD&transitParameters[v2]=ykYa8ALmUD', 'I Dream of Jo', 'Monica Sweet'], ['https://jenysmith.net', '&transitParameters[v1]=ykYa8ALmUD&transitParameters[v2]=ykYa8ALmUD', 'Jeny Smith', 'Jeny Smith'], ['https://oopsmodels.com', '&transitParameters[v1]=OhUOlmasXD&transitParameters[v2]=OhUOlmasXD', 'Oops Models (Official)', ''], + ['https://ginagerson.xxx', '&transitParameters[v1]=OhUOlmasXD&transitParameters[v2]=OhUOlmasXD', 'Gina Gerson', ''], + ['https://mugursworld.com', '&transitParameters[v1]=OhUOlmasXD&transitParameters[v2]=OhUOlmasXD', 'Mugur Porn', ''], + ['https://hunglow.org', '&transitParameters[v1]=OhUOlmasXD&transitParameters[v2]=OhUOlmasXD', 'Hung Low', 'Hung Lo'], + ['https://exploitedtalent.com', '&transitParameters[v1]=OhUOlmasXD&transitParameters[v2]=OhUOlmasXD', 'Exploited Talent', ''], + ['https://dripdropprod.net', '&transitParameters[v1]=OhUOlmasXD&transitParameters[v2]=OhUOlmasXD', 'DripDrop', ''], + ['https://thepervempire.com', '&transitParameters[v1]=ykYa8ALmUD&transitParameters[v2]=ykYa8ALmUD', 'The Perv Empire', ''], + ['https://sallydangeloxxx.com', '&transitParameters[v1]=OhUOlmasXD&transitParameters[v2]=OhUOlmasXD', 'Sally Dangelo XXX', ''], ] selector_map = { @@ -219,9 +226,15 @@ def parse_scene(self, response): if "vinasky" in response.url: meta['performer'] = "Vina Sky" + if "ginagerson" in response.url: + meta['performer'] = "Gina Gerson" + if "rydenarmani" in response.url: meta['performer'] = "Ryden Armani" + if "mugursworld" in response.url: + meta['performer'] = "Mugur" + if "backalleytoonz" in response.url: item['tags'].append("Animation") @@ -275,6 +288,8 @@ def get_performers_json(self, response): item['performers'] = [] if "lonelymeow" in response.url: item['performers'] = ['LonelyMeow'] + if "sallydangelo" in response.url: + item['performers'] = ['Sally DAngelo'] if "Don Whoe" in item['tags']: item['tags'].remove("Don Whoe") diff --git a/scenes/networkAdultEmpireCash.py b/scenes/networkAdultEmpireCash.py index 16ec5fa7..f006e34b 100644 --- a/scenes/networkAdultEmpireCash.py +++ b/scenes/networkAdultEmpireCash.py @@ -16,7 +16,7 @@ class AdultEmpireCashScraper(BaseSceneScraper): # ~ # 'https://www.mypervyfamily.com/', # Moved to AdulttimeAPI scraper 'https://www.conorcoxxx.com', 
'https://www.hornyhousehold.com', - 'https://jayspov.net', + # ~ # 'https://jayspov.net', Now Cloudflared # 'https://www.filthykings.com/', # Moved to AdulttimeAPI scraper 'https://thirdworldxxx.com', 'https://latinoguysporn.com', @@ -217,6 +217,8 @@ def get_site(self, response): return 'Smut Factor' if 'realgirlsfuck' in response.url: return 'Real Girls Fuck' + if 'thirdworld' in response.url: + return 'Third World Media' if 'wcpclub' in response.url: return 'West Coast Productions' @@ -234,7 +236,7 @@ def get_parent(self, response): if 'smutfactor' in response.url: return 'Smut Factor' if 'thirdworld' in response.url: - return 'Third World XXX' + return 'Third World Media' if 'wcpclub' in response.url: return 'West Coast Productions' diff --git a/scenes/networkAdultPrime.py b/scenes/networkAdultPrime.py index 2c974eec..b08621fb 100644 --- a/scenes/networkAdultPrime.py +++ b/scenes/networkAdultPrime.py @@ -68,9 +68,10 @@ class NetworkAdultPrimeSpider(BaseSceneScraper): selector_map = { 'title': '//h2[@class="update-info-title"]/text()', - 'description': '//p[contains(@class,"ap-limited-description-text")]/text()', - 'date': '//p[contains(@class,"update-info-line")]/i[@class="fa fa-calendar"][1]/following-sibling::b[1]/text()', - 'date_formats': ['%d.%m.%Y'], + # ~ 'description': '//p[contains(@class,"ap-limited-description-text")]/text()', # Blocked due to public scenes having generic site descriptions + 'description': '', + 'date': '//div[contains(@class, "player-wrapper")]//span[@class="description-releasedate"]/text()', + 'date_formats': ['%d.%m.%Y','%d-%m-%Y'], 'image': '//div[contains(@class,"update-video-wrapper")]/a/div/@style|//video/@poster', 're_image': r'(http.*\.jpg)', 'performers': '//p[contains(@class,"update-info-line")]/b[contains(text(), "Performer")]/following-sibling::a/text()', @@ -117,6 +118,6 @@ def get_tags(self, response): def get_image(self, response): image = super().get_image(response) - if len(image) < 10: + if image and len(image) < 10: image = "" return image diff --git a/scenes/networkAdulttimeAPI.py b/scenes/networkAdulttimeAPI.py index 1706954b..64f86d3e 100644 --- a/scenes/networkAdulttimeAPI.py +++ b/scenes/networkAdulttimeAPI.py @@ -72,7 +72,7 @@ def match_site(argument): 'girlstryanal': 'Girls Try Anal', 'girlsway': 'Girlsway', 'givemeteens': 'Give Me Teens', - 'gloryholesecreta': 'Gloryhole Secrets', + 'gloryholesecrets': 'Gloryhole Secrets', 'grandpasfuckteens': 'Grandpas Fuck Teens', 'hairyundies': 'Hairy Undies', 'homepornreality': 'Home Porn Reality', @@ -164,7 +164,7 @@ class AdultTimeAPISpider(BaseSceneScraper): network = 'Gamma Enterprises' start_urls = [ - # ~ # # 'https://www.agentredgirl.com', Disabled due to AdultTime being very protective + # # 'https://www.agentredgirl.com', Disabled due to AdultTime being very protective 'https://www.21naturals.com', 'https://www.21sextreme.com', 'https://www.21sextury.com', @@ -213,6 +213,7 @@ class AdultTimeAPISpider(BaseSceneScraper): 'https://www.mommysgirl.com', 'https://www.mypervyfamily.com', 'https://www.nextdoorstudios.com', + 'https://www.peternorth.com', 'https://www.povthis.com', 'https://www.prettydirty.com', 'https://www.pridestudios.com', @@ -390,7 +391,7 @@ def get_scenes(self, response): item['url'] = self.format_url(response.meta['url'], '/en/video/' + scene['url_title'] + '/' + str(scene['clip_id'])) if '21sextury' in referrerurl: item['parent'] = "21Sextury" - item['url'] = self.format_url(response.meta['url'], '/en/video/' + scene['url_title'] + '/' + str(scene['clip_id'])) + 
item['url'] = self.format_url(response.meta['url'], '/en/video/' + scene['sitename'] + '/' + scene['url_title'] + '/' + str(scene['clip_id'])) if '21naturals' in referrerurl: item['parent'] = "21Naturals" item['url'] = self.format_url(response.meta['url'], '/en/video/' + scene['sitename'] + '/' + scene['url_title'] + '/' + str(scene['clip_id'])) @@ -519,6 +520,9 @@ def get_scenes(self, response): if 'oopsie' in referrerurl: item['parent'] = "Oopsie" item['url'] = self.format_url(response.meta['url'], '/en/video/' + scene['sitename'] + '/' + scene['url_title'] + '/' + str(scene['clip_id'])) + if 'peternorth' in referrerurl: + item['parent'] = "Peter North" + item['url'] = self.format_url(response.meta['url'], '/en/video/' + scene['sitename'] + '/' + scene['url_title'] + '/' + str(scene['clip_id'])) if 'povthis' in referrerurl: item['parent'] = "POV This" item['url'] = self.format_url(response.meta['url'], '/en/video/' + scene['sitename'] + '/' + scene['url_title'] + '/' + str(scene['clip_id'])) @@ -722,6 +726,8 @@ def call_algolia(self, page, token, referrer): jbody = '{"requests":[{"indexName":"all_scenes_latest_desc","params":"query=&hitsPerPage=60&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Amypervyfamily%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=false&clickAnalytics=true&facets=%5B%22categories.name%22%2C%22channels.id%22%2C%22availableOnSite%22%2C%22upcoming%22%5D&tagFilters=&facetFilters=%5B%5B%22upcoming%3A0%22%5D%5D"},{"indexName":"all_scenes_latest_desc","params":"query=&hitsPerPage=1&maxValuesPerFacet=10&page=0&analytics=false&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Amypervyfamily%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=false&clickAnalytics=false&attributesToRetrieve=%5B%5D&attributesToHighlight=%5B%5D&attributesToSnippet=%5B%5D&tagFilters=&facets=upcoming"}]}' if 'naked-yoga-life' in referrer: jbody = '{"requests":[{"indexName":"all_scenes_latest_desc","params":"query=&hitsPerPage=24&page=' + str(page) + 
'&attributesToRetrieve=%5B%22action_tags%22%2C%22clip_id%22%2C%22title%22%2C%22url_title%22%2C%22pictures%22%2C%22categories%22%2C%22actors%22%2C%22release_date%22%2C%22sitename%22%2C%22download_sizes%22%2C%22clip_length%22%2C%22upcoming%22%2C%22network_name%22%2C%22length%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22rating_rank%22%2C%22clip_path%22%2C%22channels%22%2C%22mainChannel%22%2C%22views%22%2C%22award_winning%22%2C%22directors%22%2C%22download_file_sizes%22%2C%22trailers%22%2C%22subtitles%22%2C%22objectID%22%2C%22subtitle_id%22%2C%22source_clip_id%22%5D&clickAnalytics=true&facets=%5B%5D&tagFilters=&facetFilters=%5B%22upcoming%3A0%22%2C%5B%22availableOnSite%3Anakedyogalife%22%5D%5D"},{"indexName":"all_scenes_latest_desc","params":"query=&hitsPerPage=1&page=0&attributesToRetrieve=%5B%22action_tags%22%2C%22clip_id%22%2C%22title%22%2C%22url_title%22%2C%22pictures%22%2C%22categories%22%2C%22actors%22%2C%22release_date%22%2C%22sitename%22%2C%22download_sizes%22%2C%22clip_length%22%2C%22upcoming%22%2C%22network_name%22%2C%22length%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22rating_rank%22%2C%22clip_path%22%2C%22channels%22%2C%22mainChannel%22%2C%22views%22%2C%22award_winning%22%2C%22directors%22%2C%22download_file_sizes%22%2C%22trailers%22%2C%22subtitles%22%2C%22objectID%22%2C%22subtitle_id%22%2C%22source_clip_id%22%5D&clickAnalytics=false&attributesToHighlight=%5B%5D&attributesToSnippet=%5B%5D&tagFilters=&analytics=false&facets=availableOnSite&facetFilters=%5B%22upcoming%3A0%22%5D"}]}' + if 'peternorth' in referrer: + jbody = '{"requests":[{"indexName":"all_scenes_latest_desc","params":"query=&hitsPerPage=60&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Apeternorth%22%2C%22context%3Avideos%22%2C%22device%3Adesktop%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=&facets=%5B%22availableOnSite%22%2C%22upcoming%22%5D&tagFilters=&facetFilters=%5B%5B%22upcoming%3A0%22%5D%2C%5B%22availableOnSite%3Apeternorth%22%5D%5D"},{"indexName":"all_scenes_latest_desc"}]}' if 'clubinfernodungeon' in referrer: jbody = '{"requests":[{"indexName":"all_scenes_latest_desc","params":"query=&hitsPerPage=20&maxValuesPerFacet=1000&page=' + str(page) + '&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=false&facets=%5B%22studio_name%22%2C%22categories.name%22%2C%22actors.name%22%2C%22download_sizes%22%2C%22length_range_15min%22%2C%22availableOnSite%22%2C%22upcoming%22%5D&tagFilters=&facetFilters=%5B%5B%22upcoming%3A0%22%5D%5D"},{"indexName":"all_scenes_latest_desc","params":"query=&hitsPerPage=1&maxValuesPerFacet=1000&page=0&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=false&attributesToRetrieve=%5B%5D&attributesToHighlight=%5B%5D&attributesToSnippet=%5B%5D&tagFilters=&analytics=false&clickAnalytics=false&facets=upcoming"}]}' if 'nextdoorstudios' in referrer: diff --git a/scenes/networkAdulttimeMoviesAPI.py b/scenes/networkAdulttimeMoviesAPI.py deleted file mode 100644 index 8d14eb25..00000000 --- a/scenes/networkAdulttimeMoviesAPI.py +++ /dev/null @@ -1,253 +0,0 @@ -import re -import string -import scrapy - -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - -# NOTE! 
This scraper _ONLY_ pulls scenes from AdultTime sites with publicly available video index pages. -# It will not pull any scenes or images that are unavailable if you simply go to the specific site -# as a guest user in an incognito browser - - -def match_site(argument): - match = { - 'wicked': 'Wicked', - 'evilangel': 'Evil Angel', - } - return match.get(argument.lower(), argument) - - -class AdultTimeMoviesAPISpider(BaseSceneScraper): - name = 'AdulttimeMoviesAPI' - network = 'Gamma Enterprises' - - start_urls = [ - 'https://www.evilangel.com', - # ~ 'https://www.wicked.com', - ] - - custom_settings = {'AUTOTHROTTLE_ENABLED': 'True', 'AUTOTHROTTLE_DEBUG': 'False', 'CONCURRENT_REQUESTS': '2'} - - image_sizes = [ - '1920x1080', - '1280x720', - '960x544', - '638x360', - '201x147', - '406x296', - '307x224' - ] - - trailer_sizes = [ - '1080p', - '720p', - '4k', - '540p', - '480p', - '360p', - '240p', - '160p' - ] - - selector_map = { - 'external_id': '(\\d+)$', - 'pagination': '/en/movies/page/%s' - } - - def start_requests(self): - if not hasattr(self, 'start_urls'): - raise AttributeError('start_urls missing') - - if not self.start_urls: - raise AttributeError('start_urls selector missing') - page = int(self.page) - 1 - - for link in self.start_urls: - yield scrapy.Request(url=self.get_next_page_url(link, page + 1), callback=self.parse_token, meta={'page': page, 'url': link}) - - def parse_token(self, response): - match = re.search(r'\"apiKey\":\"(.*?)\"', response.text) - token = match.group(1) - return self.call_algolia_movie(response.meta['page'], token, response.meta['url']) - - def parse(self, response, **kwargs): - meta = response.meta - if response.status == 200: - movies = self.get_movies(response) - # ~ print(f"Movies Len: {len(list(movies))}") - count = 0 - for movie in movies: - if movie is not None: - count += 1 - # ~ scenecall = self.call_algolia_scene(meta['token'], meta["url"], movie['id']) - # ~ meta['movie'] = movie - # ~ yield scrapy.Request(url=scenecall['url'], method='post', body=scenecall['jbody'], meta=meta, callback=self.get_scenes, headers=scenecall['headers']) - yield movie - - if count: - if 'page' in response.meta and response.meta['page'] < self.limit_pages: - next_page = response.meta['page'] + 1 - yield self.call_algolia_movie(next_page, response.meta['token'], response.meta['url']) - - def get_movies(self, response): - meta = response.meta - for scene in response.json()['results'][0]['hits']: - # ~ print(scene) - item = SceneItem() - - item['image'] = '' - if scene['cover_path']: - item['image'] = f"https://images02-openlife.gammacdn.com/movies/{scene['cover_path']}_front_400x625.jpg" - item['image'] = item['image'].replace("movies//", "movies/") - - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = '' - - item['trailer'] = '' - for size in self.trailer_sizes: - if size in scene['trailers']: - item['trailer'] = scene['trailers'][size] - break - - item['id'] = scene['objectID'].split('-')[0] - item['title'] = string.capwords(scene['title']) - - if 'description' in scene: - item['description'] = scene['description'] - elif 'description' in scene['_highlightResult']: - item['description'] = scene['_highlightResult']['description']['value'] - if 'description' not in item: - item['description'] = '' - - if self.parse_date(scene['last_modified']): - item['date'] = self.parse_date(scene['last_modified']).isoformat() - else: - item['date'] = self.parse_date(scene['date_created']).isoformat() - - item['performers'] = list( - 
map(lambda x: x['name'], scene['actors'])) - if "directors" in scene: - if scene['directors']: - item['director'] = scene['directors'][0]['name'] - item['tags'] = list(map(lambda x: x['name'], scene['categories'])) - item['tags'] = list(filter(None, item['tags'])) - item['scenes'] = [] - item['duration'] = self.duration_to_seconds(scene['total_length']) - item['sku'] = scene['objectID'] - item['network'] = self.network - - if "wicked" in meta['url']: - item['site'] = scene['sitename_pretty'] - item['parent'] = scene['studio_name'] - item['url'] = f"https://www.wicked.com/en/movie/{scene['url_title']}/{item['id']}" - - if "evilangel" in meta['url']: - item['site'] = "Evil Angel" - item['parent'] = "Evil Angel" - item['url'] = f"https://www.evilangel.com/en/movie/{scene['url_title']}/{item['id']}" - - item['type'] = 'Movie' - - # ~ print(item['title'], item['id']) - yield self.check_item(item, self.days) - - def get_scenes(self, response): - meta = response.meta - movie = response.meta['movie'] - for scene in response.json()['results'][0]['hits']: - item = SceneItem() - - item['image'] = '' - for size in self.image_sizes: - if size in scene['pictures']: - item['image'] = 'https://images-fame.gammacdn.com/movies' + \ - scene['pictures'][size] - break - - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = None - - item['trailer'] = '' - for size in self.trailer_sizes: - if size in scene['trailers']: - item['trailer'] = scene['trailers'][size] - break - - item['id'] = scene['objectID'].split('-')[0] - - if 'title' in scene and scene['title']: - item['title'] = scene['title'] - else: - item['title'] = scene['movie_title'] - - item['title'] = string.capwords(item['title']) - - if 'description' in scene: - item['description'] = scene['description'] - elif 'description' in scene['_highlightResult']: - item['description'] = scene['_highlightResult']['description']['value'] - if 'description' not in item: - item['description'] = '' - if "director" in movie: - item['director'] = movie['director'] - - if self.parse_date(scene['release_date']): - item['date'] = self.parse_date(scene['release_date']).isoformat() - else: - item['date'] = self.parse_date('today').isoformat() - item['performers'] = list( - map(lambda x: x['name'], scene['actors'])) - item['tags'] = list(map(lambda x: x['name'], scene['categories'])) - item['tags'] = list(filter(None, item['tags'])) - - item['duration'] = scene['length'] - - item['site'] = scene['sitename_pretty'] - item['parent'] = scene['studio_name'] - item['network'] = self.network - movie['scenes'].append({'site': item['site'], 'external_id': item['id']}) - - if "wicked" in meta['url']: - item['url'] = f"https://www.wicked.com/en/video/{scene['url_title']}/{item['id']}" - - if "evilangel" in meta['url']: - item['url'] = f"https://www.evilangel.com/en/video/evilangel/{scene['url_title']}/{item['id']}" - - item['type'] = 'Scene' - - yield item - yield movie - - def call_algolia_movie(self, page, token, referrer): - # ~ algolia_url = 'https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia for vanilla JavaScript 3.27.1;JS Helper 2.26.0&x-algolia-application-id=TSMKFA364Q&x-algolia-api-key=%s' % token - algolia_url = 'https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20react%20(18.2.0)%3B%20react-instantsearch%20(5.7.0)%3B%20JS%20Helper%202.26.0&x-algolia-application-id=TSMKFA364Q&x-algolia-api-key=' + token - - headers = { - 
'Content-Type': 'application/json', - 'Referer': self.get_next_page_url(referrer, page) - } - - if 'wicked' in referrer: - jbody = '{"requests":[{"indexName":"all_movies_latest_desc","params":"query=&hitsPerPage=30&maxValuesPerFacet=10&page=' + str(page) + '&filters=&facets=%5B%22availableOnSite%22%2C%22nb_of_scenes%22%2C%22is_movie_upcoming%22%5D&tagFilters=&facetFilters=%5B%5B%22is_movie_upcoming%3A-1%22%5D%2C%5B%22nb_of_scenes%3A-1%22%5D%2C%5B%22availableOnSite%3Awicked%22%2C%22availableOnSite%3Awickedpartners%22%5D%5D"}]}' - - if 'evilangel' in referrer: - jbody = '{"requests":[{"indexName":"all_movies_latest_desc","params":"query=&hitsPerPage=60&maxValuesPerFacet=10&page=' + str(page) + '&analytics=true&analyticsTags=%5B%22component%3Asearchlisting%22%2C%22section%3Afreetour%22%2C%22site%3Aevilangel%22%2C%22context%3Advds%22%2C%22device%3Adesktop%22%5D&attributesToRetrieve=%5B%22movie_id%22%2C%22title%22%2C%22cover_path%22%2C%22last_modified%22%2C%22actors%22%2C%22url_title%22%2C%22total_length%22%2C%22views%22%2C%22ratings_up%22%2C%22ratings_down%22%2C%22award_winning%22%2C%22nb_of_scenes%22%2C%22categories%22%2C%22full_movie%22%2C%22description%22%2C%22has_trailer%22%2C%22trailers%22%2C%22objectID%22%5D&highlightPreTag=%3Cais-highlight-0000000000%3E&highlightPostTag=%3C%2Fais-highlight-0000000000%3E&facetingAfterDistinct=true&clickAnalytics=true&filters=&facets=%5B%22nb_of_scenes%22%2C%22is_movie_upcoming%22%5D&tagFilters=&facetFilters=%5B%5B%22is_movie_upcoming%3A-1%22%5D%2C%5B%22nb_of_scenes%3A-1%22%5D%5D"}]}' - - return scrapy.Request(url=algolia_url, method='post', body=jbody, meta={'token': token, 'page': page, 'url': referrer}, callback=self.parse, headers=headers) - - def call_algolia_scene(self, token, referrer, movie): - algolia_url = 'https://tsmkfa364q-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=Algolia%20for%20JavaScript%20(3.35.1)%3B%20Browser%20(lite)%3B%20react%20(16.14.0)%3B%20react-instantsearch%20(5.7.0)%3B%20JS%20Helper%202.26.0&x-algolia-application-id=TSMKFA364Q&x-algolia-api-key=' + token - headers = { - 'Content-Type': 'application/json', - 'Referer': referrer - } - - jbody = '{"requests":[{"indexName":"all_scenes_latest_desc","params":"query=&hitsPerPage=60&page=0&facets=%5B%5D&tagFilters=&facetFilters=%5B%22movie_id%3A' + movie + '%22%5D"}]}' - - - call_return = {} - call_return['url'] = algolia_url - call_return['jbody'] = jbody - call_return['headers'] = headers - return call_return diff --git a/scenes/networkAllHerLuv.py b/scenes/networkAllHerLuv.py index c5c84f40..6d0d9e63 100644 --- a/scenes/networkAllHerLuv.py +++ b/scenes/networkAllHerLuv.py @@ -1,5 +1,4 @@ import re -import dateparser import scrapy from tpdb.BaseSceneScraper import BaseSceneScraper @@ -16,8 +15,8 @@ class AllHerLuvSpider(BaseSceneScraper): selector_map = { 'title': '//meta[@name="twitter:title"]/@content', - 'description': '//div[@class="container"]/p[contains(@class,"text")]/strong/text()', - 'image': '//img[contains(@class,"update_thumb")]/@src0_1x', # Image is tokened + 'description': '//div[@class="container"]/p[contains(@class,"text")]/strong/text()|//p[contains(text(), "Video Description:")]/following-sibling::p//text()', + 'image': '//img[contains(@class,"update_thumb")]/@src0_4x', # Image is tokened 'image_blob': True, 'performers': '//p[@class="dvd-scenes__data"]/a[contains(@href,"/models/")]/text()', 'tags': '//p[@class="dvd-scenes__data"]/a[contains(@href,"/categories/")]/text()', @@ -32,11 +31,20 @@ def get_scenes(self, response): yield scrapy.Request(url=scene, 
callback=self.parse_scene) def get_date(self, response): - date = response.xpath('//p[@class="dvd-scenes__data" and contains(text(),"Added:")]').get() - if date: - date = re.search(r'(\d{2}\/\d{2}\/\d{4})', date).group(1) - if date: - return dateparser.parse(date).isoformat() + scenedate = response.xpath('//p[contains(@class,"dvd-scenes__data")]//text()[contains(., "Added:")]').get() + if scenedate: + scenedate = re.search(r'(\d{1,2}/\d{1,2}/\d{4})', scenedate).group(1) + if scenedate: + return self.parse_date(scenedate, date_formats=['%m/%d/%Y']).isoformat() + + def get_duration(self, response): + duration = response.xpath('//p[contains(@class,"dvd-scenes__data")]//text()[contains(., "Added:")]') + if duration: + duration = duration.get() + duration = re.search(r'((?:\d{1,2}\:)?\d{2}\:\d{2})', duration) + if duration: + return self.duration_to_seconds(duration.group(1)) + return None def get_site(self, response): if "allherluv" in response.url: @@ -46,8 +54,11 @@ def get_site(self, response): return "MissaX" def get_image(self, response): - image = self.process_xpath(response, self.get_selector_map('image')).get() + image = response.xpath(self.get_selector_map('image')) + if not image: + image = response.xpath('//img[contains(@class,"update_thumb")]/@src0_3x|//img[contains(@class,"update_thumb")]/@src0_2x|//img[contains(@class,"update_thumb")]/@src0_1x') if image: + image = image.get() return self.format_link(response, image) return '' diff --git a/scenes/networkAndomark.py b/scenes/networkAndomark.py index fd133dfe..837f6e7e 100644 --- a/scenes/networkAndomark.py +++ b/scenes/networkAndomark.py @@ -103,7 +103,7 @@ class AndomarkSpider(BaseSceneScraper): 'https://rionkingxxx.com', 'https://seanmichaelsxxx.com', 'https://secretsusan.com', - 'https://sheseducedme.com', + # ~ # 'https://sheseducedme.com', Need new scraper 'https://shinybound.com', 'https://www.shinysboundsluts.com', 'https://sofiemariexxx.com', @@ -223,6 +223,8 @@ def get_date(self, response): date = response.xpath('//span[contains(text(),"ADDED:")]/following-sibling::text()').get() else: date = self.process_xpath(response, self.get_selector_map('date')).get() + if not date: + date = response.xpath('//div[contains(@class, "gallery_info")]/div[@class="table"][1]//div[contains(@class, "update_date")]/text()').get() if not date: date = response.xpath('//span[@class="update_date"]/text()').get() if not date: @@ -233,7 +235,7 @@ def get_date(self, response): date = date.strip() date = re.search(r'(\d{2}\/\d{2}\/\d{4})', date).group(1) if date: - return self.parse_date(date).isoformat() + return self.parse_date(date).strftime('%Y-%m-%d') return '' def get_title(self, response): diff --git a/scenes/networkBadoinkVr.py b/scenes/networkBadoinkVr.py index 3a3ac999..b85cd7d9 100644 --- a/scenes/networkBadoinkVr.py +++ b/scenes/networkBadoinkVr.py @@ -1,16 +1,10 @@ -import warnings from datetime import datetime import dateparser import scrapy - +import re +from tpdb.items import SceneItem from tpdb.BaseSceneScraper import BaseSceneScraper -# Ignore dateparser warnings regarding pytz -warnings.filterwarnings( - "ignore", - message="The localize method is no longer necessary, as this time zone supports the fold attribute", -) - class BadoinkVrSpider(BaseSceneScraper): name = 'BadoinkVr' @@ -38,20 +32,20 @@ class BadoinkVrSpider(BaseSceneScraper): } def get_scenes(self, response): - scenes = response.xpath( - "//div[@class='tile-grid-item']//a[contains(@class, 'video-card-title')]/@href").getall() + scenes = 
response.xpath("//div[@class='tile-grid-item']//a[contains(@class, 'video-card-title')]/@href").getall() for scene in scenes: - yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) + scene = self.format_link(response, scene) + yield scrapy.Request(scene, callback=self.parse_scene) def get_next_page_url(self, base, page): - selector = '/vrpornvideos?page=%s' + selector = '/vrpornvideos/%s?order=newest' if 'vrbtrans' in base: - selector = '/videos?page=%s' + selector = '/videos/?category=all&sort=latest&page=%s' elif 'vrcosplay' in base: - selector = '/cosplaypornvideos?page=%s' + selector = '/cosplaypornvideos/%s?order=newest' elif 'kinkvr' in base: - selector = '/bdsm-vr-videos?page=%s' + selector = '/bdsm-vr-videos/%s?order=newest' return self.format_url(base, selector % page) @@ -61,3 +55,31 @@ def get_date(self, response): if date: return dateparser.parse(date.strip()).isoformat() return datetime.now().isoformat() + + def parse_scene(self, response): + item = SceneItem() + item['title'] = self.get_title(response) + item['description'] = self.get_description(response) + item['site'] = self.get_site(response) + item['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', self.get_date(response)).group(1) + item['image'] = self.get_image(response) + if item['image']: + item['image_blob'] = self.get_image_blob(response) + else: + item['image_blob'] = "" + + if item['image']: + if "?" in item['image'] and ("token" in item['image'].lower() or "expire" in item['image'].lower()): + item['image'] = re.search(r'(.*?)\?', item['image']).group(1) + + item['performers'] = self.get_performers(response) + item['tags'] = self.get_tags(response) + item['id'] = self.get_id(response) + item['trailer'] = self.get_trailer(response) + item['duration'] = self.get_duration(response) + item['url'] = self.get_url(response) + item['network'] = self.network + item['parent'] = item['site'] + + item['type'] = 'Scene' + yield self.check_item(item, self.days) diff --git a/scenes/networkBangNew.py b/scenes/networkBangNew.py index a4274109..75cd0b81 100644 --- a/scenes/networkBangNew.py +++ b/scenes/networkBangNew.py @@ -79,7 +79,7 @@ def convert_duration(self, duration): duration = duration.replace(":", "") if "H" in duration: duration = re.search(r'(\d{1,2})H(\d{1,2})M(\d{1,2})S', duration) - hours = int(duration.group(1)) * 3660 + hours = int(duration.group(1)) * 3600 minutes = int(duration.group(2)) * 60 seconds = int(duration.group(3)) duration = str(hours + minutes + seconds) diff --git a/scenes/networkCXWow.py b/scenes/networkCXWow.py index 97e9053a..7fb36242 100644 --- a/scenes/networkCXWow.py +++ b/scenes/networkCXWow.py @@ -28,45 +28,27 @@ class CXWowSpider(BaseSceneScraper): ] selector_map = { - 'title': '//div[contains(@class, "titlebox")]//h3/text()', - 'description': '//div[contains(@class, "aboutvideo")]//p/text()', - 'performers': '//ul[contains(@class, "featuredModels")]/li//span/text()', - 'date': '//div[contains(@class, "video_description")]//h4/text()', - 're_date': r'(\d{4}-\d{2}-\d{2})', - 'image': '//div[contains(@class, "videohere")]//img[contains(@src,"contentthumbs")]/@src', - 'tags': '//meta[@name="keywords"]/@content', - 'trailer': '', + 'title': '//h1/text()', + 'description': '//h4/following-sibling::p/text()', + 'performers': '//h5[contains(text(), "Featuring")]/following-sibling::ul/li/a/text()', + 'date': '//h5[contains(text(), "Added")]/following-sibling::p/text()', + 'date_formats': ['%B %d, %Y'], + 'image': '//div[contains(@class, 
"player-window-play")]/following-sibling::img[1]/@src0_4x', + 'duration': '//div[contains(@class, "player-time")]/text()', + 're_duration': r'/.*?(\d{1,2}:\d{2}(?::\d{2})?)', + 'tags': '//ul[@class="tags"]/li/a/text()', + 'trailer': '//script[contains(text(), "playsinline")]/text()', + 're_trailer': r'playsinline.*?(/.*?)[\'\"]', 'external_id': '/trailers/(.*).html', 'pagination': '/tour/updates/page_%s.html', } def get_scenes(self, response): - scenes = response.xpath('//body//section[2]//div[@class="empireimg"]/a/@href').getall() + scenes = response.xpath('//div[contains(@class,"iLScenePic")]/a/@href|//div[@class="mtVideoThumb"]/a/@href').getall() for scene in scenes: - yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) - - def get_tags(self, response): - performers = super().get_performers(response) - tags = self.process_xpath(response, self.get_selector_map('tags')).get() - if tags: - tags2 = [] - tags = tags.split(",") - for tag in tags: - addtag = True - for performer in performers: - if performer.lower().strip() in tag.lower().strip(): - addtag = False - if "movie" in tag.lower() or "photo" in tag.lower(): - addtag = False - if "photo" in tag.lower(): - addtag = False - if " id " in tag.lower(): - addtag = False - if addtag: - tags2.append(tag) - return tags2 - + if "join.php" not in scene: + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) def get_site(self, response): return match_site(super().get_site(response)) @@ -78,58 +60,3 @@ def get_performers(self, response): performers = super().get_performers(response) performers.append("Christian XXX") return performers - - def get_image(self, response): - image = super().get_image(response) - if image and image not in response.url: - return image - - image = response.xpath('//script[contains(text(), "playTrailer")]/text()') - if image: - image = image.get() - image = re.search(r'(?:[^\/])image.*?[\'\"](.*?)[\'\"]', image) - if image: - image = image.group(1) - return self.format_link(response, image) - return None - - def get_trailer(self, response): - trailer = response.xpath('//script[contains(text(), "playTrailer")]/text()') - if trailer: - trailer = trailer.get() - trailer = re.search(r'(?:[^\/])file.*?[\'\"](.*?)[\'\"]', trailer) - if trailer: - trailer = trailer.group(1) - return self.format_link(response, trailer) - return None - - def get_date(self, response): - date = response.xpath('//div[contains(@class, "video_description")]//h4/text()') - if date: - date = date.getall() - date = "".join(date) - date = date.replace(" ", "").strip() - date = re.search(r'(\d{4}-\d{2}-\d{2})', date) - if date: - date = date.group(1) - return date - return None - - def get_duration(self, response): - duration = response.xpath('//div[contains(@class, "video_description")]//h4/text()') - if duration: - total_duration = 0 - duration = duration.getall() - duration = "".join(duration) - duration = re.sub(r'[^a-z0-9-]', '', duration.lower()) - minutes = re.search(r'(\d+)min', duration) - if minutes: - minutes = int(minutes.group(1)) * 60 - total_duration = total_duration + minutes - seconds = re.search(r'(\d+)second', duration) - if seconds: - seconds = int(seconds.group(1)) - total_duration = total_duration + seconds - if total_duration: - return str(total_duration) - return None diff --git a/scenes/networkCherryPimps.py b/scenes/networkCherryPimps.py index a582af7e..6b26e194 100644 --- a/scenes/networkCherryPimps.py +++ b/scenes/networkCherryPimps.py @@ -1,7 +1,6 @@ import re 
import dateparser import scrapy - from tpdb.BaseSceneScraper import BaseSceneScraper @@ -12,21 +11,15 @@ class CherryPimpsSpider(BaseSceneScraper): start_urls = [ 'https://www.cherrypimps.com', 'https://www.wildoncam.com', - 'https://www.cherryspot.com', + # ~ 'https://www.cherryspot.com', ] selector_map = { 'title': '//*[@class="trailer-block_title"]/text() | //h1/text()', - 'description': '//div[@class="info-block"]//p[@class="text"]/text() | ' - '//div[@class="update-info-block"]//p/text()', - 'image': '//img[contains(@class, "update_thumb")]/@src | ' - '//img[contains(@class, "update_thumb")]/@src0_1x', - 'performers': '//div[contains(@class, "model-list-item")]' - '//a/span/text() | ' - '//p[contains(text(), "Featuring")]/a/text()', - 'tags': '//ul[@class="tags"]/li/a/text() | ' - '//p[@class="text" and contains(text()' - ',"Categories")]/a/text()', + 'description': '//div[@class="info-block"]//p[@class="text"]/text() | //div[@class="update-info-block"]//p/text()', + 'image': '//img[contains(@class, "update_thumb")]/@src | //img[contains(@class, "update_thumb")]/@src0_1x', + 'performers': '//div[contains(@class, "model-list-item")]//a/span/text() | //p[contains(text(), "Featuring")]/a/text()', + 'tags': '//ul[@class="tags"]/li/a/text() | //p[@class="text" and contains(text(),"Categories")]/a/text()', 'duration': '//div[@class="update-info-row"]/i[contains(@class, "play-circle")]/following-sibling::text()[1]', 're_duration': r'(\d{1,2}:\d{2}(?::\d{2})?)', 'external_id': 'trailers/(.+)\\.html', @@ -35,38 +28,39 @@ class CherryPimpsSpider(BaseSceneScraper): } def get_scenes(self, response): + meta = response.meta """ Returns a list of scenes @url https://cherrypimps.com/categories/movies.html @returns requests 10 50 """ if "cherrypimps" in response.url: - scenexpath = '//div[contains(@class,"item-update") and ' \ - 'not(contains(@class,"item-updates"))]' - if "wildoncam" in response.url or "cherryspot" in response.url: + scenexpath = '//div[contains(@class,"item-update") and not(contains(@class,"item-updates"))]' + if "wildoncam" in response.url: scenexpath = '//div[contains(@class,"video-thumb")]' scenes = response.xpath(scenexpath) for scene in scenes: - site = scene.xpath( - './/div[@class="item-sitename"]/a/text() | ' - './p[contains(@class, "text-thumb")]/a/@data-elx_site_name' - ) + image = scene.xpath('.//img[contains(@class, "update_thumb")]/@src0_1x|.//img[contains(@class, "video_placeholder")]/@src') + if image: + meta['origimage'] = image.get() + + site = scene.xpath('.//div[@class="item-sitename"]/a/text() | ./p[contains(@class, "text-thumb")]/a/@data-elx_site_name') if site: site = site.get().strip() else: site = False + meta['site'] = site + if "cherrypimps" in response.url: - urlxpath = './div[@class="item-footer"]/div' \ - '/div[@class="item-title"]/a/@href' + urlxpath = './div[@class="item-footer"]/div/div[@class="item-title"]/a/@href' else: - urlxpath = './div[contains(@class, "videothumb")]/a/@href' \ - '| ./a/@href' - scene = scene.xpath(urlxpath).get() - yield scrapy.Request( - url=scene, callback=self.parse_scene, meta={'site': site}) + urlxpath = './div[contains(@class, "videothumb")]/a/@href | ./a/@href' + scenelink = scene.xpath(urlxpath).get() + + if "/signup/" not in scenelink: + yield scrapy.Request(url=scenelink, callback=self.parse_scene, meta=meta) def get_date(self, response): - selector = '//div[@class="info-block_data"]//p[@class="text"]/text() '\ - '| //div[@class="update-info-row"]/text()' + selector = 
'//div[@class="info-block_data"]//p[@class="text"]/text() | //div[@class="update-info-row"]/text()' if "wildoncam" in response.url or "cherryspot" in response.url: date = response.xpath(selector).extract()[0] else: @@ -95,3 +89,10 @@ def get_duration(self, response): if duration: return str(int(duration.group(1)) * 60) return None + + def get_image(self, response): + meta = response.meta + image = super().get_image(response) + if "content" not in image and "cdn" not in image: + return meta['origimage'] + return image diff --git a/scenes/networkClips4Sale.py b/scenes/networkClips4Sale.py index f5dc0685..f5ffc1e7 100644 --- a/scenes/networkClips4Sale.py +++ b/scenes/networkClips4Sale.py @@ -1,4 +1,5 @@ import re +import string import scrapy import tldextract from tpdb.BaseSceneScraper import BaseSceneScraper @@ -86,7 +87,7 @@ class SiteClips4SaleSpider(BaseSceneScraper): ['Clips4Sale', 'SilverCherrys Handjobs With a Twist', 'SilverCherrys Handjobs With a Twist', '79', 'silvercherrys-handjobs-with-a-twist'], ['Clips4Sale', 'Eros Handjobs N Blowjobs', 'Eros Handjobs N Blowjobs', '105416', '105416-eros-handjobs-n-blowjobs'], ['Clips4Sale', 'Lexis Taboo Diaries', 'Lexis Taboo Diaries', '113974', 'lexis-taboo-diaries'], - ['Clips4Sale', 'TABOO', 'TABOO', '58471', 'taboo'], + ['Clips4Sale', 'Clips4Sale: TABOO', 'Clips4Sale: TABOO', '58471', 'taboo'], ['Clips4Sale', 'Old School Ties By Steve Villa', 'Old School Ties By Steve Villa', '17008', 'tied-up---gagged-by-steve-villa'], ['Clips4Sale', 'Hardcore Foot Sex', 'Hardcore Foot Sex', '28231', 'hardcore-foot-sex'], ['Clips4Sale', 'FM Concepts 1080p Men In Bondage', 'FM Concepts 1080p Men In Bondage', '117240', 'FM-Concepts-1080p-Men-In-Bondage'], @@ -106,6 +107,32 @@ class SiteClips4SaleSpider(BaseSceneScraper): ['Clips4Sale', 'AstroDomina', 'AstroDomina', '56587', 'astrodomina'], ['Clips4Sale', 'Custom Fetish Cumshots', 'Custom Fetish Cumshots', '104694', 'custom-fetish-cumshots'], ['Clips4Sale', 'Cruel Anettes Fetish Store', 'Cruel Anettes Fetish Store', '122893', 'cruel-anettes-fetish-store'], + ['Clips4Sale', 'Kenny Kong AMWF Porn', 'Kenny Kong AMWF Porn', '105418', 'kenny-kong-amwf-porn'], + ['Clips4Sale', 'Cruel Punishments - Severe Femdom', 'Cruel Punishments - Severe Femdom', '20885', 'cruel-punishments---severe-femdom-'], + ['Clips4Sale', 'Princess Camryn', 'Princess Camryn', '117722', 'princess-camryn'], + ['Clips4Sale', 'Eva de Vil', 'Eva de Vil', '122965', 'eva-de-vil'], + ['Clips4Sale', 'Mandy Flores', 'Mandy Flores', '33729', 'mandy-flores'], + ['Clips4Sale', 'Angel The Dreamgirl', 'Angel The Dreamgirl', '68591', 'angel-the-dreamgirl'], + ['Clips4Sale', 'Lelu Love - Cum Inside, Lets Play', 'Lelu Love - Cum Inside, Lets Play', '44611', 'lelu-love---cum-inside--let-s-play'], + ['Clips4Sale', 'Naughty Girls', 'Naughty Girls', '148381', '148381-naughty-girls'], + ['Clips4Sale', 'Bratty Bunny', 'Bratty Bunny', '35587', 'Bratty-Bunny'], + ['Clips4Sale', 'POV Central', 'POV Central', '15933', 'pov-central'], + ['Clips4Sale', 'Mistress - T - Fetish Fuckery', 'Mistress - T - Fetish Fuckery', '23869', 'mistress---t---fetish-fuckery'], + ['Clips4Sale', 'Princess Camryn', 'Princess Camryn', '117722', 'princess-camryn'], + ['Clips4Sale', 'Nathan Blake XXX', 'Nathan Blake XXX', '94243', 'nathan-blake-xxx'], + ['Clips4Sale', 'JBC Videos Pantyhose', 'JBC Videos Pantyhose', '32173', 'jbc-videos-pantyhose'], + ['Clips4Sale', 'Alex Mack Clip Store', 'Alex Mack Clip Store', '143621', 'alex-mack-clip-store'], + ['Clips4Sale', 'J Macs POV', 'J Macs POV', 
'151671', 'j-macs-pov'], + ['Clips4Sale', 'Queens of Kink', 'Queens of Kink', '74545', 'queens-of-kink'], + ['Clips4Sale', 'Natalie Wonder Clips', 'Natalie Wonder Clips', '79477', 'natalie-wonder-clips'], + ['Clips4Sale', 'Hoby Buchanon Facefucks Chicks', 'Hoby Buchanon Facefucks Chicks', '116032', 'hoby-buchanon-facefucks-chicks'], + # ~ ['Clips4Sale', '', '', '', ''], + # ~ ['Clips4Sale', '', '', '', ''], + # ~ ['Clips4Sale', '', '', '', ''], + # ~ ['Clips4Sale', '', '', '', ''], + # ~ ['Clips4Sale', '', '', '', ''], + # ~ ['Clips4Sale', '', '', '', ''], + # ~ ['Clips4Sale', '', '', '', ''], ] url = 'https://www.clips4sale.com' @@ -169,7 +196,14 @@ def get_scenes(self, response): item['tags'] = [] if "related_category_links" in scene and scene['related_category_links']: for tag in scene['related_category_links']: - item['tags'].append(tag['category']) + if "category" in tag: + item['tags'].append(tag['category']) + if "clean_name" in tag: + item['tags'].append(string.capwords(tag['clean_name'])) + if "keyword_links" in scene and scene['keyword_links']: + for tag in scene['keyword_links']: + if "keyword" in tag: + item['tags'].append(string.capwords(tag['keyword'])) if scene['duration']: item['duration'] = str(int(scene['duration']) * 60) item['site'] = self.get_site(response) @@ -217,4 +251,8 @@ def get_performers(self, response): return ['Addie Juniper'] if "mandy-marx" in response.url: return ['Mandy Marx'] + if "natalie-wonder" in response.url: + return ['Natalie Wonder'] + if "princess-camryn" in response.url: + return ['Princess Camryn'] return [] diff --git a/scenes/networkCombatZone.py b/scenes/networkCombatZone.py new file mode 100644 index 00000000..b45870dc --- /dev/null +++ b/scenes/networkCombatZone.py @@ -0,0 +1,74 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class NetworkCombatZoneSpider(BaseSceneScraper): + name = 'CombatZone' + network = 'Combat Zone' + + start_urls = [ + 'https://tour.blackmarketxxx.com', + 'https://tour.fillyfilms.com', + 'https://tour.smashpictures.com', + 'https://tour.combatzonexxx.com', + ] + + selector_map = { + 'title': '', + 'description': '//div[@class="description"]/p/text()', + 'date': '//div[@class="info"]/p[1]//text()[contains(., "Added")]', + 're_date': r'(\w+ \d{1,2}, \d{4})', + 'image': '//img[contains(@id, "set-target")]/@src0_3x|//img[contains(@id, "set-target")]/@src0_2x|//img[contains(@id, "set-target")]/@src0_1x', + 'performers': '//div[@class="info"]/p[1]//a[contains(@href, "/models/")]/text()', + 'tags': '//ul[@class="tags"]/li/a/text()', + 'duration': '//div[@class="info"]/p[1]//text()[contains(., "Runtime")]', + 're_duration': r'((?:\d{1,2}\:)?\d{2}\:\d{2})', + 'trailer': '', + 'external_id': r'', + 'pagination': '/categories/movies_%s_d.html', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@class,"item-video")]') + for scene in scenes: + meta['title'] = self.cleanup_title(scene.xpath('./div[1]/a/span/text()').get()) + duration = scene.xpath('.//div[@class="timeDate"]/text()[1]').get() + duration = re.search(r'((?:\d{1,2}\:)?\d{2}\:\d{2})', duration) + if duration: + meta['duration'] = self.duration_to_seconds(duration.group(1)) + + scenedate = scene.xpath('.//div[@class="timeDate"]//text()').getall() + scenedate = "".join(scenedate).replace(" ", "") + scenedate = re.search(r'(\d{4}-\d{2}-\d{2})', scenedate) + if scenedate: + meta['date'] = scenedate.group(1) + + meta['performers'] = 
scene.xpath('.//a[contains(@href, "/models/")]/text()').getall() + + image = scene.xpath('.//img[contains(@id, "set-target")]/@src0_3x|.//img[contains(@id, "set-target")]/@src0_2x|.//img[contains(@id, "set-target")]/@src0_1x') + if image: + meta['image'] = self.format_link(response, image.get()) + meta['image_blob'] = self.get_image_blob_from_link(meta['image']) + + scene = scene.xpath('.//div[@class="item-thumb"]/a/@href').get() + scene = self.format_link(response, scene) + + sceneid = re.search(r'/(trailers/.*?)\.htm', scene).group(1) + meta['id'] = sceneid.lower().replace("/", "-") + + if "fillyfilms" in scene: + meta['site'] = "Filly Films" + elif "blackmarketxxx" in scene: + meta['site'] = "Black Market" + elif "combatzonexxx" in scene: + meta['site'] = "Combat Zone" + elif "smashpictures" in scene: + meta['site'] = "Smash Pictures" + + meta['parent'] = meta['site'] + + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(scene, callback=self.parse_scene, meta=meta) diff --git a/scenes/networkDMEMediaV2.py b/scenes/networkDMEMediaV2.py index 57b3e801..24977fc8 100644 --- a/scenes/networkDMEMediaV2.py +++ b/scenes/networkDMEMediaV2.py @@ -127,6 +127,8 @@ def get_image(self, response): image = response.xpath('//script[contains(text(), "poster")]/text()') if image: image = re.search(r'poster=\"(http.*?)\"', image.get()).group(1) + if not image: + image = response.meta['image2'] return image def get_image_blob(self, response): diff --git a/scenes/networkFanCentro.py b/scenes/networkFanCentro.py new file mode 100644 index 00000000..e5d7ad45 --- /dev/null +++ b/scenes/networkFanCentro.py @@ -0,0 +1,106 @@ +import re +import json +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class NetworkFanCentroSpider(BaseSceneScraper): + name = 'FanCentro' + network = 'FanCentro' + + start_urls = [ + ['Just Lucy', True, 'justlucy94'], + ['Mdemma', True, 'mdemma'], + ] + + selector_map = { + 'title': '', + 'description': '', + 'date': '', + 'image': '', + 'performers': '', + 'tags': '', + 'duration': '', + 'trailer': '', + 'external_id': r'', + 'pagination': '', + 'type': 'Scene', + } + + def start_requests(self): + meta = {} + + for link in self.start_urls: + meta['page'] = self.page + meta['siteid'] = link[2] + meta['site'] = link[0] + meta['parse_performer'] = link[1] + yield scrapy.Request(url=self.get_next_page_url(self.page, meta), callback=self.parse, meta=meta, headers=self.headers) + + def parse(self, response): + meta = response.meta + scenes = self.get_scenes(response) + count = 0 + for scene in scenes: + count += 1 + yield scene + if count: + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta['page'] = meta['page'] + 1 + print('NEXT PAGE: ' + str(meta['page'])) + yield scrapy.Request(url=self.get_next_page_url(meta['page'], meta), callback=self.parse, meta=meta, headers=self.headers) + + def get_next_page_url(self, page, meta): + link = 
f"https://fancentro.com/lapi/feed?filter%5BprofileAlias%5D={meta['siteid']}&filter%5BwithInactive%5D=1&filter%5BcontentSection%5D=clip&thumbnailSizes%5Bclip.cover%5D=Ewe1mwK819%2CQYHojytoKM%2Cwv8oBuQHGy%2CAgZ4mIATPd%2CuDrXbF6zst%2CuaKV7hnDcF%2CuaKV7hnDc2&thumbnailSizes%5Bprofile.avatar%5D=wv8oBuQHGy&thumbnailSizes%5BpostResource.image%5D=uaKV7hnDc2%2CvRDNthbb3h%2CRS8qPJQXEw%2CuDrXbF6zst&fields%5Bprofiles%5D=alias%2Cavatar%2CprofileData&fields%5Bposts%5D=title%2Cbody%2CpublishDate%2CisDateHidden%2Cprice%2Cpolls%2CisPinned%2Cresources%2CpostAttachments%2CpostTags&page%5Bnumber%5D={page}&page%5Bsize%5D=10&sort=-publishDate" + return link + + def get_scenes(self, response): + meta = response.meta + jsondata = json.loads(response.text) + if "included" in jsondata: + jsondata = jsondata['included'] + cliptags = [] + for entry in jsondata: + if entry['type'] == "clipTags": + cliptags.append(entry) + for entry in jsondata: + if entry['type'] == "clips": + scene = entry + item = SceneItem() + + item['id'] = scene['id'] + item['title'] = scene['attributes']['title'] + item['description'] = scene['attributes']['description'] + item['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', scene['attributes']['publishDate']).group(1) + item['duration'] = scene['attributes']['length'] + if meta['parse_performer']: + item['performers'] = [meta['site']] + else: + item['performers'] = [] + + item['tags'] = [] + if "relationships" in entry and "clipTags" in entry['relationships'] and entry['relationships']['clipTags']['data']: + for tag in entry['relationships']['clipTags']['data']: + for clipentry in cliptags: + if tag['id'] == clipentry['id']: + item['tags'].append(clipentry['attributes']['alias']) + break + + for imagekey in scene['attributes']['coverUrl']: + image = imagekey + item['image'] = scene['attributes']['coverUrl'][image] + if item['image']: + item['image_blob'] = self.get_image_blob_from_link(item['image']) + + item['trailer'] = "" + + item['type'] = "Scene" + item['site'] = f"FanCentro: {meta['site']}" + item['parent'] = f"FanCentro: {meta['site']}" + item['network'] = "FanCentro" + + item['url'] = f"https://fancentro.com/{meta['siteid']}/clips/{item['id']}/" + + if item['id'] and item['title']: + yield self.check_item(item, self.days) diff --git a/scenes/networkManojob.py b/scenes/networkManojob.py index d804db5e..5cfbd7c1 100644 --- a/scenes/networkManojob.py +++ b/scenes/networkManojob.py @@ -23,7 +23,7 @@ class networkManojobSpider(BaseSceneScraper): 'title': '//h1/text()', 'description': '//div[@class="row"]/div/div[@class="text-center"]/preceding-sibling::p[1]/text()', 'date': '//meta[@itemprop="uploadDate"]/@content', - 'image': '//div[@class="video"]//video/@poster', + 'image': '//div[@class="video"]//video/@poster|//div[contains(@class,"video")]//video/@poster', 'performers': '//h3[contains(text(),"Starring")]/a/text()', 'tags': '//p[contains(text(),"Categories")]/a/text()', 'external_id': '.*\/(.*)', @@ -35,7 +35,7 @@ def start_requests(self): for link in self.start_urls: yield scrapy.Request(url=self.get_next_page_url(link[0], self.page, link[1]), callback=self.parse, - meta={'page': self.page, 'pagination':link[1], 'site':link[2], 'url':link[0]}, + meta={'page': self.page, 'pagination':link[1], 'site':link[2], 'siteurl':link[0]}, headers=self.headers, cookies=self.cookies) @@ -52,7 +52,7 @@ def parse(self, response, **kwargs): meta = response.meta meta['page'] = meta['page'] + 1 print('NEXT PAGE: ' + str(meta['page'])) - url = meta['url'] + url = meta['siteurl'] yield 
scrapy.Request(url=self.get_next_page_url(url, meta['page'], meta['pagination']), callback=self.parse, meta=meta, diff --git a/scenes/networkManyVidsV2.py b/scenes/networkManyVidsV2.py index b9decd1c..317463a5 100644 --- a/scenes/networkManyVidsV2.py +++ b/scenes/networkManyVidsV2.py @@ -40,6 +40,7 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Manyvids: Alexxxis89', True, '1002515519'], ['Manyvids: Alice Bong', True, '1002715079'], ['Manyvids: Alice Red', True, '1007208921'], + ['Manyvids: Alice Stoner', True, '1005082413'], ['Manyvids: AliceNZ', False, '493690'], ['Manyvids: Alissa Foxy', True, '1005955692'], ['Manyvids: Alix Lynx', True, '30313'], @@ -94,10 +95,12 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Manyvids: Brea Rose', True, '1000024307'], ['Manyvids: Brett TylerXXX', False, '1004700026'], ['Manyvids: Britney Amber', True, '1003387859'], + ['Manyvids: Brittanythingoz', True, '1006754881'], ['Manyvids: Brooke Dillinger', True, '376796'], ['Manyvids: Brooklyn Springvalley', True, '1001853938'], ['Manyvids: BuniBun', True, '49000'], ['Manyvids: CallMeBabyBlue', True, '1004603307'], + ['Manyvids: Carmita Bonita', True, '30655'], ['Manyvids: Carol Cox', True, '130540'], ['Manyvids: Casey Calvert', True, '1003999309'], ['Manyvids: Cassie Clarke', True, '1001062063'], @@ -108,17 +111,20 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Manyvids: Chad Alva', True, '1000107977'], ['Manyvids: Chad Diamond', True, '577547'], ['Manyvids: Chanel Santini', True, '1000344210'], + ['Manyvids: Chantal Owens', True, '1002667100'], ['Manyvids: Charlette Webb', True, '35990'], ['Manyvids: Cherry Crush', True, '32539'], ['Manyvids: Cherry Fae', False, '110767'], ['Manyvids: Chezza Luna', True, '177172'], ['Manyvids: Chris And Mari', False, '1004131603'], + ['Manyvids: Ciren Verde', True, '1002613557'], ['Manyvids: Cmbprod', True, '1003696960'], ['Manyvids: Codi Vore', True, '574802'], ['Manyvids: Courtney Scott', True, '273124'], ['Manyvids: CrazyBella', True, '327770'], ['Manyvids: CreamBerryFairy', True, '1002527905'], ['Manyvids: CuckoldingMILF', False, '1002431767'], + ['Manyvids: CumSlutJenna', True, '1006212286'], ['Manyvids: CutieElly', False, '1002778789'], ['Manyvids: Daddys Rozay', True, '1002023399'], ['Manyvids: Daisy Haze', True, '261301'], @@ -157,11 +163,13 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Manyvids: Eva Long', True, '1002576385'], ['Manyvids: Evie Rees', True, '1000468027'], ['Manyvids: Facial King Pro', False, '1001204864'], + ['Manyvids: Fay Valentine', True, '1001627467'], ['Manyvids: Felicia Vox', True, '321754'], ['Manyvids: Fell On Productions', False, '456897'], ['Manyvids: FFeZine', False, '1000045578'], ['Manyvids: Fiery Redhead', False, '527041'], ['Manyvids: Fiona Dagger', True, '759730'], + ['Manyvids: Fishnet Housepet', True, '222290'], ['Manyvids: FitSid', False, '1002419479'], ['Manyvids: Florida Milf', True, '1003205269'], ['Manyvids: Forbidden Perversions', False, '599647'], @@ -172,9 +180,11 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Manyvids: Funsizedcumslut', True, '1007113302'], ['Manyvids: Gala MV', True, '830429'], ['Manyvids: Gia OhMy', True, '1003519826'], + ['Manyvids: Gin Lustig', True, '1006293001'], ['Manyvids: Ginger Banks', True, '37055'], ['Manyvids: GirlOnTop880', True, '1004830774'], ['Manyvids: Goddess Tangent', False, '427284'], + ['Manyvids: Godmother The Great', True, '228889'], ['Manyvids: Gogofukmexxx', False, '1002219117'], ['Manyvids: Golden Lace', True, '1002587264'], ['Manyvids: Goldie 
Blair', True, '380752'], @@ -191,11 +201,13 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Manyvids: Heatherbby', True, '88086'], ['Manyvids: Hello Alice', True, '1005562667'], ['Manyvids: Hidori', True, '97815'], + ['Manyvids: Holland of Chicago', True, '1003631211'], ['Manyvids: Hope Penetration', True, '1004890226'], ['Manyvids: Hornnstudio', False, '1003146686'], ['Manyvids: Horny Lily', True, '1000862654'], ['Manyvids: Hottalicia1', False, '1000447453'], ['Manyvids: HouseholdFantasy', False, '1007157741'], + ['Manyvids: HugeBoobsErin', True, '1001183502'], ['Manyvids: Icy Winters', True, '697815'], ['Manyvids: Im Heather Harmon', False, '1003667583'], ['Manyvids: ImMeganLive', True, '491714'], @@ -203,12 +215,16 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Manyvids: InkedMonster', True, '1001576946'], ['Manyvids: ItsReeseRobins', False, '1005302009'], ['Manyvids: Ivy Starshine', True, '362540'], + ['Manyvids: Izzy Rosse', True, '1005964496'], ['Manyvids: Jack and Jill', False, '1001495638'], ['Manyvids: Jack Blaque', True, '536056'], ['Manyvids: Jack Ripher', True, '1001692458'], + ['Manyvids: Jackie Synn', True, '194026'], ['Manyvids: Jada Kai', True, '1000722201'], ['Manyvids: Jade Vow', True, '1002042328'], + ['Manyvids: JadedJuneBug', True, '1007315738'], ['Manyvids: Jane Cane', True, '1000718761'], + ['Manyvids: Jane Judge', True, '1000197274'], ['Manyvids: Jasper Nyx', True, '1003787164'], ['Manyvids: Jaybbgirl', True, '1001317123'], ['Manyvids: JaySmoothXXX', False, '525062'], @@ -226,6 +242,7 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Manyvids: Kalina Ryu', True, '815701'], ['Manyvids: Karabella', False, '1000456360'], ['Manyvids: KarmannAndJosie', False, '1006611324'], + ['Manyvids: Kate Kuray', True, '1002855322'], ['Manyvids: Kathia Nobili', True, '1003527333'], ['Manyvids: Kati3kat', True, '354103'], ['Manyvids: Katie Cummings', True, '1000489998'], @@ -512,7 +529,39 @@ class NetworkManyVidsV2Spider(BaseSceneScraper): ['Sloppy Toppy', False, '1002638751'], ['Undercover Sluts', False, '1001483477'], ['YouthLust', False, '1001216419'], - ['Manyvids: Helly Rite', True, '1002625980', "v2"], + ['Manyvids: Helly Rite', True, '1002625980'], + ['Manyvids: Samantha Flair', True, '1001379073'], + ['Manyvids: DrKInLA', True, '1003399990'], + ['Manyvids: JazminTorresBBW', True, '1001552380'], + ['Manyvids: juicyxjaden', True, '1004101467'], + ['Manyvids: KatesKurves', True, '1001897961'], + ['Manyvids: LilyLoveles', True, '1002028291'], + ['Manyvids: Melonie Kares', True, '1003030823'], + ['Manyvids: MissGothBooty', True, '33842'], + ['Manyvids: MZ NORMA STITZ', True, '1001723300'], + ['Manyvids: Nixlynka', True, '1002349390'], + ['Manyvids: QueenRhaena', True, '1004676688'], + ['Manyvids: Rebeca Cross', True, '1004469370'], + ['Manyvids: Rem Sequence', True, '1001345701'], + ['Manyvids: Rice Bunny', True, '410732'], + ['Manyvids: Shemeatress', True, '1000243328'], + ['Manyvids: SugarSweetmeatBBW', True, '1003902752'], + ['Manyvids: SuzyQ44ks', True, '1001155424'], + ['Manyvids: SweetheartMiaBBW', True, '1001145696'], + ['Manyvids: xPrincessAura', True, '1001967166'], + ['Manyvids: AriaNicoleXXX', True, '1006336627'], + ['Manyvids: Ayumi Anime', True, '1000907145'], + ['Manyvids: Bubblebumbutt', True, '1001007459'], + ['Manyvids: QueenieSteph', True, '1006752181'], + ['Manyvids: Kimswallows', True, '1001125267'], + ['Manyvids: PinkMaggit', True, '418469'], + ['Manyvids: Layndare', True, '1002480074'], + ['Manyvids: Ninadoll', True, '113109'], + ['Manyvids: 
Lolliepopxxx', True, '1004243105'], + ['Manyvids: Ema Lee', True, '1004698572'], + ['Manyvids: Evelin Stone', True, '1000691850'], + ['Manyvids: Throat GOAT', True, '1003432123'], + ['Manyvids: Mikaela_tx', True, '1007908784'], ] custom_settings = {'AUTOTHROTTLE_ENABLED': 'True', 'AUTOTHROTTLE_DEBUG': 'False', 'HTTPERROR_ALLOWED_CODES': [403, 404]} @@ -541,7 +590,6 @@ def start_requests(self): def start_requests2(self, response): meta = response.meta - meta['mvtoken'] = response.xpath('//html/@data-mvtoken').get() self.headers['referer'] = 'https://www.manyvids.com/Profile/1003004427/Sweetie-Fox/Store/Videos/' for link in self.start_urls: @@ -549,11 +597,7 @@ def start_requests2(self, response): meta['siteid'] = link[2] meta['site'] = link[0] meta['parse_performer'] = link[1] - meta['v2'] = False - if len(link) == 4: - if link[3].lower() == "v2": - meta['v2'] = True - yield scrapy.Request(url=self.get_next_page_url(self.page, meta), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + yield scrapy.Request(url=self.get_next_page_url(self.page, meta), callback=self.parse, meta=meta, headers=self.headers) def parse(self, response): # ~ print(response.text) @@ -567,24 +611,20 @@ def parse(self, response): if 'page' in response.meta and response.meta['page'] < self.limit_pages: meta['page'] = meta['page'] + 1 print('NEXT PAGE: ' + str(meta['page'])) - yield scrapy.Request(url=self.get_next_page_url(meta['page'], meta), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + yield scrapy.Request(url=self.get_next_page_url(meta['page'], meta), callback=self.parse, meta=meta, headers=self.headers) def get_next_page_url(self, page, meta): - offset = str((int(page) - 1) * 30) - link = f"https://www.manyvids.com/api/model/{meta['siteid']}/videos?category=all&offset={offset}&sort=0&limit=30&mvtoken={meta['mvtoken']}" + link = f"https://www.manyvids.com/bff/store/videos/{meta['siteid']}/?page={page}" return link def get_scenes(self, response): meta = response.meta jsondata = json.loads(response.text) - data = jsondata['result']['content']['items'] + data = jsondata['data'] for jsonentry in data: meta['id'] = jsonentry['id'] meta['title'] = string.capwords(html.unescape(jsonentry['title'])) - if meta['v2']: - scenelink = f"https://www.manyvids.com/bff/store/video/{meta['id']}" - else: - scenelink = f"https://video-player-bff.estore.kiwi.manyvids.com/videos/{meta['id']}" + scenelink = f"https://www.manyvids.com/bff/store/video/{meta['id']}" if meta['id']: # ~ print(meta) yield scrapy.Request(scenelink, callback=self.parse_scene, meta=meta) @@ -625,6 +665,8 @@ def get_performers(self, response): return ['Cattie Candescent'] if "Cherry Fae" in meta['site']: return ['Krystal Orchid'] + if "JazminTorresBBW" in meta['site']: + return ['Jazmin Torres'] if "CuckoldingMILF" in meta['site']: return ['Mila Rose'] if "CutieElly" in meta['site']: @@ -763,8 +805,7 @@ def parse_scene(self, response): item = SceneItem() meta = response.meta jsondata = json.loads(response.text) - if meta['v2']: - jsondata = jsondata['data'] + jsondata = jsondata['data'] item['title'] = meta['title'] item['id'] = meta['id'] if 'description' in jsondata: @@ -775,6 +816,10 @@ def parse_scene(self, response): item['tags'] = jsondata['tags'] else: item['tags'] = [] + if "tagList" in jsondata and jsondata['tagList']: + for tag in jsondata['tagList']: + item['tags'].append(tag['label']) + if "screenshot" in jsondata: item['image'] = jsondata['screenshot'].replace(" ", "%20") item['image_blob'] = 
self.get_image_blob_from_link(item['image']) @@ -790,9 +835,14 @@ def parse_scene(self, response): item['site'] = self.get_site(response) item['parent'] = self.get_parent(response) item['url'] = "https://www.manyvids.com" + jsondata['url'] - if "videoDuration" in jsondata: - duration = re.search(r'(\d{1,2}:\d{1,2}:?\d{1,2}?)', jsondata['videoDuration']) - item['duration'] = self.duration_to_seconds(duration.group(1)) + if "videoDuration" in jsondata and jsondata['videoDuration']: + if ":" in jsondata['videoDuration']: + duration = re.search(r'(\d{1,2}:\d{1,2}:?\d{1,2}?)', jsondata['videoDuration']) + item['duration'] = self.duration_to_seconds(duration.group(1)) + elif jsondata['videoDuration']: + duration = int(jsondata['videoDuration']) + if duration: + item['duration'] = str(duration * 60) else: item['duration'] = "" parse_scene = True diff --git a/scenes/networkNubiles.py b/scenes/networkNubiles.py index 6c6f0372..5e3d3e2a 100644 --- a/scenes/networkNubiles.py +++ b/scenes/networkNubiles.py @@ -19,6 +19,7 @@ class NubilesSpider(BaseSceneScraper): "https://cheatingsis.com", "https://cumswappingsis.com", "https://daddyslilangel.com", + "https://datingmystepson.com", "https://deeplush.com", "https://detentiongirls.com", "https://driverxxx.com", diff --git a/scenes/networkPOVR.py b/scenes/networkPOVR.py index 75f19f5a..95df5dfc 100644 --- a/scenes/networkPOVR.py +++ b/scenes/networkPOVR.py @@ -238,8 +238,8 @@ def parse_scene(self, response): shortsite = re.sub(r'[^a-z0-9]', '', item['site'].lower()) matches = ['vr-bangers', 'vrconk', 'vrbtrans', 'vrbgay', 'sinsvr', 'realjamvr', 'baberoticavr', 'fuckpassvr', 'czechvr', 'stripzvr', 'badoink', 'realvr', 'kinkvr', 'babevr', 'vrcosplayx', '18vr', 'wankzvr', 'vrhush', 'naughtyamerica'] if not any(x in shortsite for x in matches): - matches = ['virtualtaboo', 'virtualrealporn', 'virtualrealtrans', 'virtualrealpassion', 'virtualrealamateur', 'realjamvr', 'only3x', 'wankzvr', 'naughtyamerica', 'vrhush'] + matches = ['virtualtaboo', 'virtualrealporn', 'virtualrealtrans', 'virtualrealpassion', 'virtualrealamateur', 'realjamvr', 'only3x', 'wankzvr', 'naughtyamerica', 'vrhush', 'realitylovers'] if not any(x in shortsite for x in matches): - matches = ['swallowbay', 'wankitnowvr', 'baberoticavr', 'vr-bangers', 'vrconk', 'vrbtrans', 'vrbgay', 'sinsvr', 'realjamvr', 'baberoticavr', 'stripzvr', 'badoink', 'slr-milfvr', 'milfvr'] + matches = ['swallowbay', 'wankitnowvr', 'baberoticavr', 'vr-bangers', 'vrconk', 'vrbtrans', 'vrbgay', 'sinsvr', 'realjamvr', 'baberoticavr', 'stripzvr', 'badoink', 'slr-milfvr', 'milfvr', 'tranzvr'] if not any(x in shortsite for x in matches): yield self.check_item(item, self.days) diff --git a/scenes/networkPornFidelity.py b/scenes/networkPornFidelity.py index 8d583de8..d682bacd 100644 --- a/scenes/networkPornFidelity.py +++ b/scenes/networkPornFidelity.py @@ -11,7 +11,7 @@ class PornFidelitySpider(BaseSceneScraper): network = 'pornfidelity' start_urls = [ - # 'https://www.teenfidelity.com', + # ~ # 'https://www.teenfidelity.com', 'https://www.pornfidelity.com', # 'https://www.kellymadison.com' ] @@ -21,7 +21,8 @@ class PornFidelitySpider(BaseSceneScraper): 'title': '//div[@class="level-item"]/text()', 'description': '//div[@class="column is-three-fifths"]/text()', 'date': "", - 'image': '', + 'image': '//script[contains(text(), ".jpg")]/text()', + 're_image': r'poster.*?(http.*?)[\'\"]', 'performers': '//a[@class="is-underlined"]/text()', 'tags': "", 'duration': '//li//text()[contains(., "mins")]', @@ -32,16 +33,14 @@ class 
PornFidelitySpider(BaseSceneScraper): } def get_scenes(self, response): - rsp = HtmlResponse(url=response.url, body=response.json()[ - 'html'], encoding='utf-8') + rsp = HtmlResponse(url=response.url, body=response.json()['html'], encoding='utf-8') scenes = rsp.css('.episode .card-link::attr(href)').extract() for scene in scenes: yield scrapy.Request(url=scene, callback=self.parse_scene, cookies=self.cookies) - def get_image(self, response): - res = re.search(self.get_selector_map('external_id'), response.url) - return 'https://tour-cdn.kellymadisonmedia.com/content/episode/poster_image/%s/poster.jpg' % res.group( - 1) + # ~ def get_image(self, response): + # ~ res = re.search(self.get_selector_map('external_id'), response.url) + # ~ return 'https://tour-cdn.kellymadisonmedia.com/content/episode/poster_image/%s/poster.jpg' % res.group(1) def get_date(self, response): search = re.search('Published: (\\d+-\\d+-\\d+)', response.text) @@ -54,7 +53,7 @@ def get_title_full(self, response): return response.xpath(self.get_selector_map('title'))[1].get().strip() def get_title(self, response): - print(response) + # ~ print(response) title = self.get_title_full(response) search = re.search('(.+) - .+ \\#(\\d+)', title) if not search: diff --git a/scenes/networkPornMegaLoad.py b/scenes/networkPornMegaLoad.py new file mode 100644 index 00000000..d5c9f45d --- /dev/null +++ b/scenes/networkPornMegaLoad.py @@ -0,0 +1,134 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +def match_site(argument): + match = { + '18eighteen': "18Eighteen", + '40somethingmag': "40Somethingmag", + '50plusmilfs': "50Plus MILFs", + '60plusmilfs': "60Plus MILFs", + 'bigtithooker': "Big Tit Hooker", + 'bootyliciousmag': "Bootyliciousmag", + 'legsex': "Leg Sex", + 'mickybells': "Micky Bells", + 'naughtymag': "Naughtymag", + 'pornmegaload': "PornMegaLoad", + 'scoreland': "Scoreland", + 'scoreland2': "Scoreland2", + 'scorevideos': "Score Videos", + 'xlgirls': "XL Girls", + } + return match.get(argument, argument) + + +class NetworkPornMegaLoadPlaywrightSpider(BaseSceneScraper): + name = 'PornMegaLoadPlaywright' + network = 'ScorePass' + + start_urls = [ + 'https://www.pornmegaload.com', + # ----------------------------------- + # 'https://www.18eighteen.com', + # 'https://www.40somethingmag.com', + # 'https://www.50plusmilfs.com', + # 'https://www.60plusmilfs.com', + # 'https://www.bigtithooker.com', + # 'https://www.bootyliciousmag.com', + # 'https://www.legsex.com', + # 'https://www.mickybells.com', + # 'https://www.naughtymag.com', + # 'https://www.pornmegaload.com', + # 'https://www.scoreland.com', + # 'https://www.scorevideos.com', + # 'https://www.xlgirls.com' + ] + + custom_scraper_settings = { + 'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor', + # ~ 'AUTOTHROTTLE_ENABLED': True, + # ~ 'AUTOTHROTTLE_START_DELAY': 1, + # ~ 'AUTOTHROTTLE_MAX_DELAY': 120, + 'CONCURRENT_REQUESTS': 1, + # 'DOWNLOAD_DELAY': 60, + # 'RANDOMIZE_DOWNLOAD_DELAY': True, + 'CONCURRENT_REQUESTS_PER_DOMAIN': 1, + 'CONCURRENT_REQUESTS_PER_IP': 1, + 'SPIDERMON_ENABLED': False, + 'DOWNLOAD_FAIL_ON_DATALOSS': True, + 'RETRY_ENABLED': True, + 'RETRY_TIMES': 10, + 'RETRY_HTTP_CODES': [500, 503, 504, 400, 408, 307, 403], + 'HANDLE_HTTPSTATUS_LIST': [500, 503, 504, 400, 408, 307, 403], + 'DOWNLOAD_HANDLERS': { + "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", + "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler", + }, + 
'DOWNLOADER_MIDDLEWARES': { + 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, + 'scrapy.downloadermiddlewares.retry.RetryMiddleware': 500, + 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 300, + 'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 301, + 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 100, + } + } + + selector_map = { + 'title': '//main/div/section/div[@class="row"]/div/h1/text()|//section[@id="videos_page-page"]/div[contains(@class,"ali-center")]//h2/text()', + 'description': '//div[contains(@class, "p-desc")]//text()', + 'date': '//div[contains(@class,"p-info")]//span[contains(text(), "Date:")]/following-sibling::span/text()', + 'date_formats': ['%B %d, %Y'], + 'image': '//video/@poster', + 'performers': '//div[contains(@class,"p-info")]//span[contains(text(), "Featuring:")]/following-sibling::span/a/text()', + 'tags': '//h3[contains(text(), "Tags")]/following-sibling::a/text()', + 'duration': '//div[contains(@class,"p-info")]//span[contains(text(), "Duration:")]/following-sibling::span/text()', + 're_duration': r'((?:\d{1,2}\:)?\d{2}\:\d{2})', + 'external_id': r'.*/(\d+)/', + 'trailer': '//div[contains(@class, "mr-lg")]//video/source[1]/@src', + 'pagination': '/hd-porn-scenes/?page=%s' + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + meta['playwright'] = True + + for link in self.start_urls: + yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@class, "li-item")]/div/div/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=scene, callback=self.parse_scene, meta=meta) + + def get_site(self, response): + site = response.xpath('//div[contains(@class, "d-lg-block")]/a/div[1]/img[1]/@src') + if site: + site = re.search(r'\.com/(.*?)/', site.get()) + if site: + return match_site(site.group(1)) + return "PornMegaLoad" + + def parse_scene(self, response): + item = SceneItem() + item['title'] = self.get_title(response) + item['description'] = self.get_description(response) + item['site'] = self.get_site(response) + item['date'] = self.get_date(response) + item['image'] = self.get_image(response) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + item['performers'] = self.get_performers(response) + item['tags'] = self.get_tags(response) + item['id'] = self.get_id(response) + item['trailer'] = self.get_trailer(response) + item['duration'] = self.get_duration(response) + item['url'] = self.get_url(response) + if "?"
in item['url']: + item['url'] = re.search(r'(.*?)\?', item['url']).group(1) + item['network'] = self.get_network(response) + item['parent'] = item['site'] + item['type'] = 'Scene' + + yield self.check_item(item, self.days) diff --git a/scenes/networkPornbox.py b/scenes/networkPornbox.py index 2159209a..1f89ecb6 100644 --- a/scenes/networkPornbox.py +++ b/scenes/networkPornbox.py @@ -52,6 +52,8 @@ def parse_scene(self, response): else: item['description'] = "" item['site'] = scene['studio'] + if item['site'].lower().replace(" ", "") == "familysinners": + item['site'] = "FAMILY Sinners (Pornbox)" item['date'] = self.parse_date(scene['publish_date']).isoformat() item['image'] = scene['player_poster'] item['image_blob'] = self.get_image_blob_from_link(item['image']) @@ -77,10 +79,10 @@ def parse_scene(self, response): item['network'] = 'Legal Porno' item['parent'] = 'Legal Porno' - matches = ['bangbros', 'jeffsmodels', 'private', 'antoniosuleiman', 'bradmontana', 'richardmannsworld', 'only3xnetwork', 'privateblack', 'pornforce', 'immorallive', 'girlfriendsfilms', + matches = ['bangbros', 'jeffsmodels', 'private', 'exposedlatinas', 'antoniosuleiman', 'bradmontana', 'richardmannsworld', 'only3xnetwork', 'privateblack', 'pornforce', 'immorallive', 'girlfriendsfilms', 'hentaied', 'vipissy', 'justanal', 'hussiepass', 'filthykings', 'puffynetwork', 'fit18', 'cuckhunter', 'bruceandmorgan', 'privateclassics', 'seehimfuck', 'filthyfamily', 'ukpornparty', 'jayspov', 'only3xgirls', 'parasited', 'hazeher', 'collegerules', 'abuseme', 'only3xvr', 'justpov', 'girlsgonewild', 'plumperpassstudio', 'only3xlost', 'onlygolddigger', 'wetandpuffy', 'mypervyfamily', 'mykebrazil', 'mylifeinmiami', 'claudiamarie', 'rawwhitemeat', 'industryinvaders', 'cockyboys', 'touchmywife', 'blackbullchallenge', 'topwebmodels', 'realsexpass', 'riggsfilms', 'pervfect', 'mollyredwolf', 'bluepillmen', 'blacksonmoms', 'peter\'skingdom', - 'pornmuschimovie', 'chickpass', 'grooby', 'pornpros', 'lubed', 'povd', 'facials4k', 'girlcum', 'exotic4k', 'nannyspy', 'castingcouch-x', 'mom4k', 'bluebirdfilms', 'dreamtranny', 'pornworld', 'randyblue'] + 'pornmuschimovie', 'chickpass', 'grooby', 'pornpros', 'lubed', 'povd', 'facials4k', 'girlcum', 'exotic4k', 'nannyspy', 'castingcouch-x', 'mom4k', 'bluebirdfilms', 'dreamtranny', 'pornworld', 'randyblue', 'plantsvscunts', 'mugurporn'] if not any(x in item['site'].lower().replace(" ", "") for x in matches): yield self.check_item(item, self.days) diff --git a/scenes/networkPornhub.py b/scenes/networkPornhub.py index 5ac50c3d..02014c72 100644 --- a/scenes/networkPornhub.py +++ b/scenes/networkPornhub.py @@ -8,6 +8,7 @@ class NetworkPornhubSpider(BaseSceneScraper): network = 'Pornhub' performers = [ + ["/channels/pornhub-originals-vr?o=mr&page=%s", "", "Pornhub: Pornhub Originals VR"], ["/model/404hotfound/videos?o=mr&page=%s", "404HotFound", "Pornhub: 404HotFound"], ["/model/aestra-azure/videos/upload?o=mr&page=%s", "Aestra Azure", "Pornhub: Aestra Azure"], ["/model/agataruiz/videos?page=%s", "Agata Ruiz", "Pornhub: Agata Ruiz"], @@ -41,6 +42,7 @@ class NetworkPornhubSpider(BaseSceneScraper): ["/model/fuckforeverever/videos?page=%s", "Fuckforeverever", "Pornhub: Fuckforeverever"], ["/model/gentlyperv/videos?o=mr&page=%s", "GentlyPerv", "Pornhub: GentlyPerv"], ["/model/harperthefox/videos?o=mr&page=%s", "HarperTheFox", "Pornhub: HarperTheFox"], + ["/model/helloelly/videos?o=mr&page=%s", "HelloElly", "Pornhub: HelloElly"], ["/model/joey-lee/videos?o=mr&page=%s", "Joey Lee", "Pornhub: Joey Lee"], 
["/model/kelly-aleman/videos?o=mr&page=%s", "Kelly Aleman", "Pornhub: Kelly Aleman"], ["/model/loly-lips/videos?o=mr&page=%s", "Loly Lips", "Pornhub: Loly Lips"], @@ -146,7 +148,10 @@ def parse(self, response, **kwargs): def get_scenes(self, response): meta = response.meta - scenes = response.xpath('//div[contains(@class,"videoUList")]//div[@class="phimage"]/a/@href').getall() + if "channels" in response.url: + scenes = response.xpath('//ul[contains(@id, "showAllChanelVideos")]//li[contains(@class, "VideoListItem")]/div/div[@class="phimage"]/a/@href').getall() + else: + scenes = response.xpath('//div[contains(@class,"videoUList")]//div[@class="phimage"]/a/@href').getall() for scene in scenes: if re.search(self.get_selector_map('external_id'), scene): yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) @@ -154,10 +159,11 @@ def get_scenes(self, response): def get_performers(self, response): meta = response.meta performers = [] - new_perf = response.xpath('//div[contains(@class,"pornstarsWrapper")]/a/@data-mxptext') + new_perf = response.xpath('//div[contains(@class,"pornstarsWrapper")]/a/@data-mxptext|//div[contains(@class,"pornstarsWrapper")]/a/img/following-sibling::text()[1]') if new_perf: new_perf = new_perf.getall() performers = new_perf - if meta['initial_performers'][0] not in performers: - performers.append(meta['initial_performers'][0]) - return performers + if meta['initial_performers'][0]: + if meta['initial_performers'][0] not in performers: + performers.append(meta['initial_performers'][0]) + return list(map(lambda x: self.cleanup_title(x.strip()), performers)) diff --git a/scenes/networkProjectOneService.py b/scenes/networkProjectOneService.py index f78b23cb..657b58cd 100644 --- a/scenes/networkProjectOneService.py +++ b/scenes/networkProjectOneService.py @@ -190,6 +190,7 @@ class ProjectOneServiceSpider(BaseSceneScraper): # 'https://www.wivesinpantyhose.com', 'https://www.seancody.com', + 'https://www.sexselector.com', 'https://www.sexyhub.com', # 'https://www.danejones.com', # 'https://www.fitnessrooms.com', @@ -228,6 +229,7 @@ class ProjectOneServiceSpider(BaseSceneScraper): # 'https://www.twistyshard.com', # 'https://www.whengirlsplay.com', + 'https://www.voyr.com', 'https://www.whynotbi.com', ] @@ -365,6 +367,9 @@ def get_scenes(self, response): if brand == "bangbros" and item['date'] < "2023-06-21": yield_item = False + if item['site'] == "Sex Selector" and item['date'] < "2024-01-13": + yield_item = False + if self.check_item(item, self.days) and yield_item: scene_count = scene_count + 1 yield item diff --git a/scenes/networkScorePass.py b/scenes/networkScorePass.py index dabc70a1..b5015bb1 100644 --- a/scenes/networkScorePass.py +++ b/scenes/networkScorePass.py @@ -176,7 +176,7 @@ class ScorePassSpider(BaseSceneScraper): # 'https://www.yourmomsgotbigtits.com', # 'https://www.yourwifemymeat.com', - 'https://www.pornmegaload.com', + # 'https://www.pornmegaload.com', Moved to PornMegaLoad Playwright Scraper # ----------------------------------- # 'https://www.18eighteen.com', # 'https://www.40somethingmag.com', diff --git a/scenes/networkSexLikeRealAPI.py b/scenes/networkSexLikeRealAPI.py index 1d5606d4..02147260 100644 --- a/scenes/networkSexLikeRealAPI.py +++ b/scenes/networkSexLikeRealAPI.py @@ -25,6 +25,7 @@ class SexLikeRealSpider(BaseSceneScraper): 'image': '//meta[@name="twitter:image1"]/@content or //meta[@name="twitter:image2"]/@content or //meta[@name="twitter:image3"]/@content or 
//meta[@name="twitter:image"]/@content', 'trailer': '', 'pagination': '/scenes?type=premium&sort=most_recent&page=%s' + # ~ 'pagination': '/trans/studios/transexvr?page=%s' } def get_scenes(self, response): @@ -38,6 +39,7 @@ def get_scenes(self, response): except Exception: print(f"Failed on scene: {scene}") url = f"https://api.sexlikereal.com/virtualreality/video/id/{idnum}" + print(url) if idnum: yield scrapy.Request(url, callback=self.parse_scene, meta=meta) @@ -88,8 +90,8 @@ def parse_scene(self, response): item['tags'] = list(map(lambda x: string.capwords(x.strip()), list(set(item['tags'])))) matches = ['vr-bangers', 'vrconk', 'vrbtrans', 'vrbgay', 'sinsvr', 'realjamvr', 'baberoticavr', 'fuckpassvr', 'czechvr', 'stripzvr','badoink','realvr','kinkvr','babevr','vrcosplayx','18vr','wankzvr','vrhush','naughtyamerica'] if not any(x in item['id'] for x in matches) and not any(x in shortsite for x in matches): - matches = ['virtualtaboo', 'virtualrealporn', 'virtualrealtrans', 'virtualrealpassion', 'virtualrealamateur', 'realjamvr', 'only3x', 'wankzvr', 'naughtyamerica', 'vrhush'] + matches = ['virtualtaboo', 'virtualrealporn', 'virtualrealtrans', 'virtualrealpassion', 'virtualrealamateur', 'realjamvr', 'only3x', 'wankzvr', 'naughtyamerica', 'vrhush', 'realitylovers'] if not any(x in item['id'] for x in matches) and not any(x in shortsite for x in matches): - matches = ['swallowbay', 'wankitnowvr', 'baberoticavr', 'vr-bangers', 'vrconk', 'vrbtrans', 'vrbgay', 'sinsvr', 'realjamvr', 'baberoticavr', 'stripzvr','badoink', 'slr-milfvr', 'milfvr'] + matches = ['swallowbay', 'wankitnowvr', 'baberoticavr', 'vr-bangers', 'vrconk', 'vrbtrans', 'vrbgay', 'sinsvr', 'realjamvr', 'baberoticavr', 'stripzvr','badoink', 'slr-milfvr', 'milfvr', 'tranzvr'] if not any(x in item['site'].lower() for x in matches) and not any(x in shortsite for x in matches): yield self.check_item(item, self.days) diff --git a/scenes/networkTeamSkeetPlaywright.py b/scenes/networkTeamSkeetPlaywright.py index b3b472c4..fabb6586 100644 --- a/scenes/networkTeamSkeetPlaywright.py +++ b/scenes/networkTeamSkeetPlaywright.py @@ -139,6 +139,7 @@ def parse_scene(self, response): else: data = '' item = SceneItem() + # ~ print(data) if ('isUpcoming' in data and not data['isUpcoming']) or 'isUpcoming' not in data: is_v2 = "store2" in response.url @@ -147,6 +148,8 @@ def parse_scene(self, response): item['title'] = data['title'] item['description'] = data['description'] item['image'] = data['img'] + if "med.jpg" in item['image']: + item['image'] = item['image'].replace("med.jpg", "hi.jpg") item['image_blob'] = self.get_image_blob_from_link(item['image']) if 'tags' in data: @@ -182,6 +185,8 @@ def parse_scene(self, response): if is_v2: if "Say Uncle" in response.meta['site']: item['url'] = "https://www.sayuncle.com/movies/" + data['id'] + elif "MYLF" in response.meta['site']: + item['url'] = "https://www.mylf.com/movies/" + data['id'] else: item['url'] = "https://www.teamskeet.com/movies/" + data['id'] diff --git a/scenes/networkVRLife.py b/scenes/networkVRLife.py index 1d67b623..3d7e4b08 100644 --- a/scenes/networkVRLife.py +++ b/scenes/networkVRLife.py @@ -49,7 +49,8 @@ def get_scenes(self, response): scene_id = self.process_xpath(scene, self.get_selector_map('id')).get() title = self.process_xpath(scene, self.get_selector_map('title')).get() url = self.process_xpath(scene, self.get_selector_map('url')).get() - yield scrapy.Request(url=self.format_link(response, url), callback=self.parse_scene, meta={'id': scene_id, 'title': title}) + if url: 
+ yield scrapy.Request(url=self.format_link(response, url), callback=self.parse_scene, meta={'id': scene_id, 'title': title}) def parse_scene(self, response): jslde = JsonLdExtractor() diff --git a/scenes/networkVRNetwork.py b/scenes/networkVRNetwork.py index 15d80125..01023df3 100644 --- a/scenes/networkVRNetwork.py +++ b/scenes/networkVRNetwork.py @@ -17,7 +17,7 @@ class NetworkVRNetworkSpider(BaseSceneScraper): selector_map = { 'title': '//h1[contains(@class,"page-title")]/text()', 'description': '//div[contains(@class,"second-text")]/div/p//text()', - 'date': '//div[contains(@class, "info-item") and contains(.//text(), "Release")]//text()', + 'date': '//div[contains(@class, "info-item") and contains(.//text(), "Release")]//text()|//span[contains(text(), "Release date")]/following-sibling::text()', 're_date': r'(\w{2,4} \d{1,2}, \d{4})', 'date_formats': ['%b %d, %Y'], 'duration': '//span[contains(text(), "Duration")]/following-sibling::span[1]/text()', @@ -64,7 +64,7 @@ def get_duration(self, response): hour = re.search(r'(\d+) h', duration) if hour: hour = hour.group(1) - hour = int(hour) * 3660 + hour = int(hour) * 3600 totalduration = totalduration + hour return str(totalduration) return None diff --git a/scenes/networkVixen.py b/scenes/networkVixen.py index 541d707b..0f5e689b 100644 --- a/scenes/networkVixen.py +++ b/scenes/networkVixen.py @@ -80,66 +80,70 @@ def parse(self, response, **kwargs): def parse_scene(self, response): data = response.json()['data']['findOneVideo'] + # ~ print(data) scene = SceneItem() - try: - scene['id'] = data['id'] + # ~ try: + scene['id'] = data['id'] - scene['title'] = self.cleanup_title(data['title']) - scene['description'] = self.cleanup_description(data['description']) if 'description' in data else '' + scene['title'] = self.cleanup_title(data['title']) + scene['description'] = self.cleanup_description(data['description']) if 'description' in data else '' - site = data['site'] - if site.upper() in self.sites: - site = self.sites[site.upper()] - scene['site'] = site + site = data['site'] + if site.upper() in self.sites: + site = self.sites[site.upper()] + scene['site'] = site - scene['network'] = 'Vixen' - scene['parent'] = site + scene['network'] = 'Vixen' + scene['parent'] = site - scene['date'] = self.parse_date(data['releaseDate']).isoformat() - scene['url'] = self.format_link(response, '/videos/' + data['slug']) + scene['date'] = self.parse_date(data['releaseDate']).isoformat() + scene['url'] = self.format_link(response, '/videos/' + data['slug']) - scene['performers'] = [] - for model in data['models']: - scene['performers'].append(model['name']) + if "directors" in data and len(data['directors']): + scene['director'] = data['directors'][0]['name'] - scene['tags'] = [] - if data['tags']: - for tag in data['tags']: - scene['tags'].append(tag) + scene['performers'] = [] + for model in data['models']: + scene['performers'].append(model['name']) - scene['markers'] = [] - if 'chapters' in data: - if data['chapters']: - for timetag in data['chapters']['video']: - timestamp = {} - timestamp['name'] = self.cleanup_title(timetag['title']) - timestamp['start'] = str(timetag['seconds']) - scene['markers'].append(timestamp) - scene['tags'].append(timestamp['name']) + scene['tags'] = [] + if data['tags']: + for tag in data['tags']: + scene['tags'].append(tag) - scene['tags'] = list(map(lambda x: string.capwords(x.strip()), list(set(scene['tags'])))) + scene['markers'] = [] + if 'chapters' in data: + if data['chapters']: + for timetag in 
data['chapters']['video']: + timestamp = {} + timestamp['name'] = self.cleanup_title(timetag['title']) + timestamp['start'] = str(timetag['seconds']) + scene['markers'].append(timestamp) + scene['tags'].append(timestamp['name']) - largest = 0 - for image in data['images']['poster']: - if image['width'] > largest: - scene['image'] = image['src'] - largest = image['width'] + scene['tags'] = list(map(lambda x: string.capwords(x.strip()), list(set(scene['tags'])))) - largest = 0 - scene['image_blob'] = self.get_image_blob_from_link(scene['image']) + largest = 0 + for image in data['images']['poster']: + if image['width'] > largest: + scene['image'] = image['src'] + largest = image['width'] - for trailer in data['previews']['poster']: - if trailer['width'] > largest: - scene['trailer'] = trailer['src'] - largest = trailer['width'] + largest = 0 + scene['image_blob'] = self.get_image_blob_from_link(scene['image']) - scene['trailer'] = '' if 'trailer' not in scene or not scene['trailer'] else scene['trailer'] + for trailer in data['previews']['poster']: + if trailer['width'] > largest: + scene['trailer'] = trailer['src'] + largest = trailer['width'] - yield self.check_item(scene, self.days) + scene['trailer'] = '' if 'trailer' not in scene or not scene['trailer'] else scene['trailer'] - except Exception: - print(f"Failed Request on: {response.url}") + yield self.check_item(scene, self.days) + + # ~ except Exception: + # ~ print(f"Failed Request on: {response.url}") def get_graphql_search_body(self, per_page, page, link): site_name = urlparse(link).hostname.replace('www.', '').replace('.com', '').upper() @@ -223,6 +227,9 @@ def get_grapgql_query(self): seconds } } + directors { + name + } models { name slug diff --git a/scenes/networkWankz.py b/scenes/networkWankz.py index d1fd2bb7..cf0b9907 100644 --- a/scenes/networkWankz.py +++ b/scenes/networkWankz.py @@ -76,8 +76,12 @@ class NetworkWankzSpider(BaseSceneScraper): ] paginations = [ - '/sites/teen-girls?p=%s', - '/videos?p=%s#', + # ~ '/sites/bubbly-massage?p=%s', + # ~ '/sites/cfnm-exposed?p=%s', + '/sites/my-milf-boss?p=%s', + # ~ '/sites/wankz-3d?p=%s', + # ~ '/sites/teen-girls?p=%s', + # ~ '/videos?p=%s#', ] selector_map = { @@ -89,7 +93,7 @@ class NetworkWankzSpider(BaseSceneScraper): 'image': '//a[@class="noplayer"]/img/@src', 'performers': '//div[@class="models-wrapper actors"]/a/span/text()', 'tags': "//a[@class='cat']/text()", - 'external_id': '-(\\d+)$', + 'external_id': r'-(\d+)$', 'trailer': '', 'pagination': '/videos?p=%s#' } diff --git a/scenes/site2Poles1Hole.py b/scenes/site2Poles1Hole.py new file mode 100644 index 00000000..91705813 --- /dev/null +++ b/scenes/site2Poles1Hole.py @@ -0,0 +1,101 @@ +import re +import json +import scrapy +import requests +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class Site2Poles1HoleSpider(BaseSceneScraper): + name = '2Poles1Hole' + network = '2Poles1Hole' + parent = '2Poles1Hole' + site = '2Poles1Hole' + + start_urls = [ + 'https://2poles1hole.com', + ] + + cookies = [{"name": "consent", "value": "true"}] + + headers = { + 'X-Nats-Cms-Area-Id': 2, + 'X-Nats-Entity-Decode': 1, + } + + selector_map = { + 'title': '', + 'description': '', + 'date': '', + 'image': '', + 'performers': '', + 'tags': '', + 'duration': '', + 'trailer': '', + 'external_id': r'', + 'pagination': '/videos?page=%s', + 'type': 'Scene', + } + + def get_next_page_url(self, base, page): + page = str((int(page) - 1) * 16) + pagination =
'https://azianistudios.com/tour_api.php/content/sets?cms_set_ids=&data_types=1&content_count=1&count=16&start=%s&cms_area_id=2&cms_block_id=100086&orderby=published_desc&content_type=video&status=enabled&text_search=' + link = pagination % page + return link + + def start_requests(self): + meta = {} + meta['page'] = self.page + + link = "https://2poles1hole.com/videos" + yield scrapy.Request(link, callback=self.start_requests_2, meta=meta, headers=self.headers, cookies=self.cookies) + + def start_requests_2(self, response): + meta = response.meta + for link in self.start_urls: + yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers) + + def get_scenes(self, response): + scenes = json.loads(response.text) + for scene in scenes['sets']: + item = SceneItem() + item['title'] = self.cleanup_title(scene['name']) + item['id'] = scene['cms_set_id'] + item['description'] = self.cleanup_description(re.sub('<[^<]+?>', '', scene['description'])) + + for thumb in scene['preview_formatted']['thumb']: + scenethumb = thumb + scenethumb = scene['preview_formatted']['thumb'][scenethumb][0] + image = "https://y2y8k2k4.ssl.hwcdn.net/" + scenethumb['fileuri'] + "?" + scenethumb['signature'] + item['image'] = image.replace(" ", "%20") + item['image_blob'] = self.get_image_blob_from_link(item['image']) + item['image'] = re.search(r'(.*?)\?', image).group(1) + item['trailer'] = "" + + item['date'] = scene['added_nice'] + + item['url'] = f"https://2poles1hole.com/video/{item['id']}" + item['tags'] = [] + for dataset in scene['data_types']: + if dataset['data_type'] == 'Tags': + for tag in dataset['data_values']: + item['tags'].append(tag['name']) + + item['duration'] = scene['lengths']['total'] + item['site'] = '2Poles1Hole' + item['parent'] = '2Poles1Hole' + item['network'] = '2Poles1Hole' + item['performers'] = [] + for dataset in scene['data_types']: + if dataset['data_type'] == 'Models': + for model in dataset['data_values']: + item['performers'].append(model['name']) + + yield self.check_item(item, self.days) + + def get_image_from_link(self, image): + if image: + req = requests.get(image) + if req and req.ok: + return req.content + return None diff --git a/scenes/siteAdultAllStars.py b/scenes/siteAdultAllStars.py new file mode 100644 index 00000000..c04b5798 --- /dev/null +++ b/scenes/siteAdultAllStars.py @@ -0,0 +1,40 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteAdultAllStarsSpider(BaseSceneScraper): + name = 'AdultAllStars' + network = 'Adult All Stars' + parent = 'Adult All Stars' + site = 'Adult All Stars' + + start_urls = [ + 'https://www.adultallstars.com', + ] + + selector_map = { + 'title': '//div[@class="update_table_left"]//span[contains(@class, "update_title")]/text()', + 'description': '//div[@class="update_table_left"]//span[contains(@class, "latest_update")]/text()', + 'date': '//div[@class="update_table_left"]//span[contains(@class, "availdate")]/text()', + 'date_formats': ['%d/%m/%Y'], + 'image': '//div[@class="update_table_right"]/div[contains(@class, "update_image")]/a/img[1]/@src', + 'performers': '//div[@class="update_table_left"]//span[contains(@class, "update_models")]/a/text()', + 'tags': '//div[@class="update_table_left"]//span[contains(@class, "update_tags")]/a/text()', + 'duration': '', + 'trailer': '', + 'external_id': r'.*/(.*?)\.htm', + 'pagination': '/categories/movies_%s_d.html', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + 
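+ # each listing page exposes its updates as "updateItem" cards; only hrefs matching the external_id pattern are followed into parse_scene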
scenes = response.xpath('//div[@class="updateItem"]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_id(self, response): + sceneid = super().get_id(response) + return sceneid.lower() diff --git a/scenes/siteAngelaWhiteScenes.py b/scenes/siteAngelaWhiteScenes.py index e5d8f6a0..749dbad9 100644 --- a/scenes/siteAngelaWhiteScenes.py +++ b/scenes/siteAngelaWhiteScenes.py @@ -26,6 +26,26 @@ class SiteAngelaWhiteScenesSpider(BaseSceneScraper): 'type': 'Scene', } + custom_scraper_settings = { + 'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0', + 'AUTOTHROTTLE_ENABLED': True, + 'AUTOTHROTTLE_START_DELAY': 1, + 'AUTOTHROTTLE_MAX_DELAY': 10, + 'CONCURRENT_REQUESTS': 1, + 'RANDOMIZE_DOWNLOAD_DELAY': True, + 'CONCURRENT_REQUESTS_PER_DOMAIN': 1, + 'CONCURRENT_REQUESTS_PER_IP': 1, + 'DOWNLOADER_MIDDLEWARES': {}, + # ~ 'DOWNLOAD_DELAY': 30, + 'DOWNLOAD_MAXSIZE': 0, + 'DOWNLOAD_TIMEOUT': 100000, + 'DOWNLOAD_WARNSIZE': 0, + 'HTTPCACHE_ENABLED': False, + 'RETRY_ENABLED': True, + "MEDIA_ALLOW_REDIRECTS": True, + "HTTPERROR_ALLOWED_CODES": [404], + } + def start_requests(self): meta = {} meta['page'] = 1 diff --git a/scenes/siteAngeloGodshackOfficial.py b/scenes/siteAngeloGodshackOfficial.py new file mode 100644 index 00000000..8e6e1fa2 --- /dev/null +++ b/scenes/siteAngeloGodshackOfficial.py @@ -0,0 +1,35 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteAngeloGodshackOfficialSpider(BaseSceneScraper): + name = 'AngeloGodshackOfficial' + network = 'Angelo Godshack' + parent = 'Angelo Godshack Official' + site = 'Angelo Godshack Official' + + start_urls = [ + 'https://angelogodshackxxx.com', + ] + + selector_map = { + 'title': '//div[@class="video-detail"]//div[contains(@class, "header")]/h1/text()', + 'description': '//div/strong[contains(text(), "Description")]/../following-sibling::p/text()', + 'date': '', + 'image': '//video-js/@data-poster', + 'performers': '//div[contains(@class,"video-detail__description")]//div[@class="title"]/text()', + 'tags': '', + 'duration': '', + 'trailer': '//video/source/@src', + 'external_id': r'.*/(.*?)$', + 'pagination': '/newest?page=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@class, "library-item")]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) diff --git a/scenes/siteBaitBuddies.py b/scenes/siteBaitBuddies.py new file mode 100644 index 00000000..fba0f68f --- /dev/null +++ b/scenes/siteBaitBuddies.py @@ -0,0 +1,39 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteBaitBuddiesSpider(BaseSceneScraper): + name = 'BaitBuddies' + network = 'Bait Buddies' + parent = 'Bait Buddies' + site = 'Bait Buddies' + + start_urls = [ + 'https://www.baitbuddies.com', + ] + + selector_map = { + 'description': '//div[@class="TabbedPanelsContentWrap"]//text()', + 'image': '//div[@class="main_video"]/a[1]/img/@src', + 'performers': '//div[@class="header_txt"]/strong/following-sibling::a/text()', + 'tags': '//div[@id="tags"]/a/text()', + 'external_id': r'contentId=(.*?)_', + 'pagination': '/?page=preview&p=%s', + 
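+ # note: no title/date selectors here; get_scenes below reads the release date off each listing thumb and get_title builds the title from the performer names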
'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="videos-thumb"]') + for scene in scenes: + scenedate = scene.xpath('.//strong[contains(text(), "Release")]/following-sibling::text()') + if scenedate: + meta['date'] = self.parse_date(scenedate.get(), date_formats=['%m/%d/%Y']).strftime('%Y-%m-%d') + scene = scene.xpath('./a/@href').get() + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_title(self, response): + performers = self.get_performers(response) + return " and ".join(performers) diff --git a/scenes/siteBananaFever.py b/scenes/siteBananaFever.py index f68cb0a9..a4c9ae50 100644 --- a/scenes/siteBananaFever.py +++ b/scenes/siteBananaFever.py @@ -57,12 +57,12 @@ def get_scenes(self, response): jsondata = json.loads(response.text) for scene in jsondata: item = SceneItem() - if 'og_image' in scene["yoast_head_json"]: - item['image'] = scene["yoast_head_json"]['og_image'][0]['url'] - item['image_blob'] = self.get_image_blob_from_link(item['image']) - else: - item['image'] = "" - item['image_blob'] = "" + # ~ if 'og_image' in scene["yoast_head_json"]: + # ~ item['image'] = scene["yoast_head_json"]['og_image'][0]['url'] + # ~ item['image_blob'] = self.get_image_blob_from_link(item['image']) + # ~ else: + # ~ item['image'] = "" + # ~ item['image_blob'] = "" item['id'] = str(scene['id']) item['date'] = scene['date'] @@ -85,20 +85,26 @@ def get_scenes(self, response): item['network'] = 'Banana Fever' item['url'] = scene['link'] - # ~ meta['item'] = item + meta['item'] = item - # ~ if image_url: - # ~ req = requests.get(image_url) - # ~ if req and len(req.text) > 5: - # ~ imagerow = json.loads(req.text) - # ~ else: - # ~ imagerow = None + if "wp:featuredmedia" in scene['_links'] and scene['_links']['wp:featuredmedia'][0]['href']: + image_url = scene['_links']['wp:featuredmedia'][0]['href'] + else: + image_url = None - # ~ item['image'] = imagerow['guid']['rendered'] - # ~ item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ else: - # ~ item['image'] = None - # ~ item['image_blob'] = None + item['image'] = None + item['image_blob'] = None + if image_url: + req = requests.get(image_url) + if req and len(req.text) > 5: + imagerow = json.loads(req.text) + else: + imagerow = None + + if imagerow and 'guid' in imagerow: + if 'rendered' in imagerow['guid'] and imagerow['guid']['rendered']: + item['image'] = imagerow['guid']['rendered'] + item['image_blob'] = self.get_image_blob_from_link(item['image']) if " - Demo" not in item['title'] and " - Trailer" not in item['title']: yield item diff --git a/scenes/siteBlueBirdFilmsMovie.py b/scenes/siteBlueBirdFilmsMovie.py deleted file mode 100644 index d2df51e2..00000000 --- a/scenes/siteBlueBirdFilmsMovie.py +++ /dev/null @@ -1,181 +0,0 @@ -import re -import requests -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class DorcelClubSpider(BaseSceneScraper): - name = 'BlueBirdFilmsMovie' - network = 'Blue Bird Films' - parent = 'Blue Bird Films' - - start_urls = [ - 'https://www.bluebirdfilms.com' - ] - - headers = { - 'Accept-Language': 'en-US,en', - 'x-requested-with': 'XMLHttpRequest', - } - - selector_map = { - 'title': '//h1/text()', - 'description': '///span[@class="full"]/p/text()', - 'image': 'picture img.thumbnail::attr(data-src)', - 'performers': '//div[@class="actress"]/a/text()',
'date': '//span[@class="publish_date"]/text()', - 'tags': '', - 'external_id': 'scene/(\\d+)', - 'trailer': '', - 'pagination': '/vod/dvds/dvds_page_%s.html' - } - - cookies = { - 'disclaimer2': 'xx' - } - - def parse(self, response, **kwargs): - meta = response.meta - movies = self.get_movies(response) - count = 0 - for movie in movies: - count += 1 - meta['movie'] = movie - yield movie - - if count: - if 'page' in response.meta and response.meta['page'] < self.limit_pages: - meta['page'] = meta['page'] + 1 - print('NEXT PAGE: ' + str(meta['page'])) - yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - - def get_movies(self, response): - meta = response.meta - movies = response.xpath('//figure') - for movie in movies: - movieurl = self.format_link(response, movie.xpath('./a[1]/@href').get()) - yield scrapy.Request(movieurl, callback=self.parse_movie, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_movie(self, response): - meta = response.meta - item = SceneItem() - item['title'] = self.cleanup_title(response.xpath('//h2/text()').get().strip()) - item['description'] = '' - image = response.xpath('//img[contains(@class, "dvd_box")]/@src0_2x').get() - item['image'] = self.format_link(response, image.replace("//", "/")) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = '' - director = response.xpath('//div[@class="director"]/text()') - if director: - director = director.get() - if ":" in director: - director = re.search(r'\:\s+?(.*)', director).group(1).strip() - item['director'] = '' - item['performers'] = response.xpath('//div[contains(@class, "modelnames") and contains(text(), "Featuring")]/a/text()').getall() - item['performers'] = list(map(lambda x: self.cleanup_title(x.strip()), item['performers'])) - item['tags'] = [] - item['trailer'] = '' - item['date'] = None - item['type'] = 'Movie' - item['network'] = 'Blue Bird Films' - item['parent'] = 'Blue Bird Films' - item['site'] = 'Blue Bird Films' - item['store'] = 'Blue Bird Films' - item['url'] = response.url - item['id'] = re.search(r'.*/(.*)\.htm', response.url).group(1) - item['scenes'] = [] - meta['scenes'] = [] - scenes = response.xpath('//figure[contains(@class, "setVideoThumb")]') - item['duration'] = '0' - - date_url = "https://www.bluebirdfilms.com/vod/" + response.xpath('//figure[contains(@class, "setVideoThumb")]/a/@href').get() - date_text = requests.get(date_url) - date_text = date_text.text.replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "") - scene_date = re.search(r'ReleaseDate.*?(\d{1,2}/\d{1,2}/\d{4})', date_text) - if scene_date: - item['date'] = self.parse_date(scene_date.group(1), date_formats=['%d/%m/%Y']).isoformat() - - for scene in scenes: - scene_item = {} - scene_item['scene_url'] = "https://www.bluebirdfilms.com/vod/" + scene.xpath('./a/@href').get() - scene_item['scene_id'] = scene.xpath('./@id').get() - scene_duration = scene.xpath('.//i[contains(@class, "glyphicon-time")]/following-sibling::comment()') - if scene_duration: - scene_duration = scene_duration.get() - scene_duration = scene_duration.replace("s", "").replace("m", "") - scene_duration = re.search(r'(\d{1,2}:\d{2}(?::\d{2})?)', scene_duration).group(1) - scene_item['scene_duration'] = self.duration_to_seconds(scene_duration) - item['duration'] = str(int(item['duration']) + int(scene_item['scene_duration'])) - scene_item['scene_trailer'] = 
scene.xpath('.//source/@src').get() - - scene_item['scene_image'] = None - scene_image = scene.xpath('.//picture/img/@src0_2x') - if scene_image: - scene_image = self.format_link(response, scene_image.get()).replace("content//", "content/") - if not scene_image: - scene_image = scene.xpath('.//picture/img/@src0_1x') - if scene_image: - scene_image = self.format_link(response, scene_image.get()).replace("content//", "content/") - if not scene_image: - scene_image = scene.xpath('.//picture/img/@src0_3x') - if scene_image: - scene_image = self.format_link(response, scene_image.get()).replace("content//", "content/") - if not scene_image: - scene_image = scene.xpath('.//picture/img/@src') - if scene_image: - scene_image = self.format_link(response, scene_image.get()).replace("content//", "content/") - - if scene_image: - scene_item['scene_image'] = scene_image - - item['scenes'].append({'site': item['site'], 'external_id': scene_item['scene_id']}) - meta['scenes'].append(scene_item) - meta['movie'] = item - yield self.check_item(item, self.days) - - for sceneurl in meta['scenes']: - meta['currscene'] = sceneurl - yield scrapy.Request(self.format_link(response, sceneurl['scene_url']), callback=self.parse_scene, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_scene(self, response): - meta = response.meta - currscene = meta['currscene'] - item = SceneItem() - - item['url'] = currscene['scene_url'] - item['trailer'] = currscene['scene_trailer'] - item['duration'] = currscene['scene_duration'] - item['id'] = currscene['scene_id'] - if currscene['scene_image']: - item['image'] = currscene['scene_image'] - item['image_blob'] = self.get_image_blob_from_link(item['image']) - else: - item['image'] = None - item['image_blob'] = None - - item['type'] = 'Scene' - - title = response.xpath('//h3[@class="mb-5"]//text()').getall() - item['title'] = self.cleanup_title(" ".join(title).replace(" ", " ")) - item['title'] = item['title'].replace("( ", "(").replace(" )", ")") - - scenedate = response.xpath('//td[contains(text(), "Date:")]/following-sibling::td/text()') - if scenedate: - scenedate = scenedate.get() - item['date'] = self.parse_date(scenedate, date_formats=['%d/%m/%Y']).isoformat() - else: - item['date'] = '' - - item['description'] = item['title'] - item['director'] = '' - item['performers'] = response.xpath('//span[contains(@class, "update_models")]/a/text()').getall() - item['performers'] = list(map(lambda x: self.cleanup_title(x.strip()), item['performers'])) - - item['tags'] = response.xpath('//td[contains(text(), "Tags:")]/following-sibling::td/a/text()').getall() - item['tags'] = list(map(lambda x: self.cleanup_title(x.strip()), item['tags'])) - - item['network'] = 'Blue Bird Films' - item['parent'] = 'Blue Bird Films' - item['site'] = 'Blue Bird Films' - yield self.check_item(item, self.days) diff --git a/scenes/siteBratPrincess.py b/scenes/siteBratPrincess.py new file mode 100644 index 00000000..f4480fa4 --- /dev/null +++ b/scenes/siteBratPrincess.py @@ -0,0 +1,54 @@ +import slugify +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteBratPrincessSpider(BaseSceneScraper): + name = 'BratPrincess' + site = 'Brat Princess' + parent = 'Brat Princess' + network = 'Brat Princess' + + start_urls = [ + 'https://www.bratprincess.us', + ] + + selector_map = { + 'title': './/h6/text()', + 'description': './following-sibling::img[1]/following-sibling::div[contains(@class, "summary")]//p/text()', + 'date': '', + 'image': 
'./following-sibling::div[contains(@class, "poster")]//img/@src', + 'performers': '', + 'tags': '', + 'duration': './following-sibling::div[contains(@class, "runtime")]/div[not(contains(text(), "Runtime"))]/div/text()', + 'trailer': '', + 'external_id': r'', + 'pagination': '/video-list?page=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + scenes = response.xpath('//div[contains(@class, "field-name-title")]') + for scene in scenes: + item = SceneItem() + item['title'] = self.get_title(scene) + item['description'] = self.get_description(scene) + item['date'] = "" + item['image'] = self.format_link(response, scene.xpath('./following-sibling::div[contains(@class, "poster")]//img/@src').get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + item['performers'] = [] + item['tags'] = ['Female Domination', 'Domination'] + item['id'] = slugify.slugify(item['title']) + item['trailer'] = "" + item['duration'] = self.get_duration(scene) + sceneurl = scene.xpath('./following-sibling::div[contains(@class, "links")]//a[contains(@href, "gallery")]/@href').get() + if sceneurl: + item['url'] = self.format_link(response, sceneurl) + else: + item['url'] = f"https://www.bratprincess.us/gallery-view-videos/{item['id']}" + item['network'] = self.network + item['parent'] = self.parent + item['site'] = self.site + item['type'] = 'Scene' + + yield self.check_item(item, self.days) diff --git a/scenes/siteChristianWilde.py b/scenes/siteChristianWilde.py new file mode 100644 index 00000000..57683bbd --- /dev/null +++ b/scenes/siteChristianWilde.py @@ -0,0 +1,56 @@ +import re +import html +import unidecode +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteChristianWildeSpider(BaseSceneScraper): + name = 'ChristianWilde' + site = 'Christian Wilde' + parent = 'Christian Wilde' + network = 'Christian Wilde' + + start_urls = [ + 'https://christianwilde.com', + ] + + selector_map = { + 'title': '//span[contains(@class,"update_title")]/text()', + 'description': '//span[contains(@class,"update_description")]/text()', + 'date': '//span[contains(@class,"availdate")]/text()[1]', + 're_date': r'(\d{1,2}/\d{1,2}/\d{4})', + 'date_formats': ['%m/%d/%Y'], + 'image': '//div[contains(@class,"update_image")]/a[contains(@href, "updates")][1]/img/@src0_4x|//div[contains(@class,"update_image")]/a[contains(@href, "updates")][1]/img/@src0_3x|//div[contains(@class,"update_image")]/a[contains(@href, "updates")][1]/img/@src0_2x|//div[contains(@class,"update_image")]/a[contains(@href, "updates")][1]/img/@src0_1x', + 'performers': '//span[contains(@class,"update_models")]/a/text()', + 'tags': '//span[contains(@class,"update_tags")]/a/text()', + 'duration': '', + 'trailer': '//div[contains(@class,"update_image")]/a[contains(@href, "updates")][1]/@onclick', + 're_trailer': r'\'(/trailer.*?)\'', + 'external_id': r'.*/(.*?)\.htm', + 'pagination': '/categories/movies_%s_d.html', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="updateItem"]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_id(self, response): + return super().get_id(response).lower() + + def get_duration(self, response): + duration = response.xpath('//div[@class="update_counts_preview_table"]/text()') + if duration: + duration = duration.get() + duration = 
unidecode.unidecode(html.unescape(duration.lower().replace(" ", " ").replace("\xa0", " "))).replace(" ", "") + duration = re.search(r'(\d+)min', duration) + if duration: + return str(int(duration.group(1)) * 60) + return None + + def get_description(self, response): + return super().get_description(response).replace("\r", " ").replace("\n", " ").replace("\t", " ") diff --git a/scenes/siteCosmid.py b/scenes/siteCosmid.py index f4d12e2a..297a7278 100644 --- a/scenes/siteCosmid.py +++ b/scenes/siteCosmid.py @@ -82,54 +82,39 @@ def parse_model_scenes(self, response): if scenedate: item['date'] = self.parse_date(scenedate, date_formats=['%Y-%m-%d']).isoformat() else: - item['date'] = self.parse_date('today').isoformat() + item['date'] = self.parse_date('today').strftime('%Y-%m-%d') - image = scene.xpath('.//div[contains(@class,"videothumb")]/img/@src').get() - if image: - item['image'] = "https://cosmid.net" + image.replace('//', '/').replace('#id#', '').strip() - else: - item['image'] = None - - item['image_blob'] = self.get_image_blob_from_link(item['image']) + if self.check_item(item, self.days): + image = scene.xpath('.//div[contains(@class,"videothumb")]/img/@src').get() + if image: + item['image'] = "https://cosmid.net" + image.replace('//', '/').replace('#id#', '').strip() + else: + item['image'] = None - trailer = scene.xpath('.//div[contains(@class,"videothumb")]/video/source/@src').get() - if trailer: - item['trailer'] = "https://cosmid.net" + trailer.replace(" ", "%20").replace('#id#', '').strip() - else: - item['trailer'] = '' - - duration = scene.xpath('.//div[@class="time"]/text()') - if duration: - item['duration'] = self.duration_to_seconds(duration.get()) - - externalid = title.replace("_", "-").strip().lower() - externalid = externalid.replace(" ", " ") - externalid = externalid.replace(" ", "-") - externalid = re.sub('[^a-zA-Z0-9-]', '', externalid) - if externalid: - item['id'] = externalid - else: - item['id'] = '' + item['image_blob'] = self.get_image_blob_from_link(item['image']) - item['tags'] = [] + trailer = scene.xpath('.//div[contains(@class,"videothumb")]/video/source/@src').get() + if trailer: + item['trailer'] = "https://cosmid.net" + trailer.replace(" ", "%20").replace('#id#', '').strip() + else: + item['trailer'] = '' + + duration = scene.xpath('.//div[@class="time"]/text()') + if duration: + item['duration'] = self.duration_to_seconds(duration.get()) + + externalid = title.replace("_", "-").strip().lower() + externalid = externalid.replace(" ", " ") + externalid = externalid.replace(" ", "-") + externalid = re.sub('[^a-zA-Z0-9-]', '', externalid) + if externalid: + item['id'] = externalid + else: + item['id'] = '' - item['url'] = response.url + item['tags'] = [] - if item['id'] and item['title'] and item['date']: - days = int(self.days) - if days > 27375: - filterdate = "0000-00-00" - else: - filterdate = date.today() - timedelta(days) - filterdate = filterdate.strftime('%Y-%m-%d') + item['url'] = response.url - if self.debug: - if not item['date'] > filterdate: - item['filtered'] = "Scene filtered due to date restraint" - print(item) - else: - if filterdate: - if item['date'] > filterdate: - yield item - else: - yield item + if item['id'] and item['title'] and item['date']: + yield self.check_item(item, self.days) diff --git a/scenes/siteCustomFetishVideos.py b/scenes/siteCustomFetishVideos.py new file mode 100644 index 00000000..70b7b909 --- /dev/null +++ b/scenes/siteCustomFetishVideos.py @@ -0,0 +1,48 @@ +import re +import unidecode +import html +import 
json +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteCustomFetishVideosSpider(BaseSceneScraper): + name = 'CustomFetishVideos' + site = 'Custom Fetish Videos' + parent = 'Anatomik Media' + network = 'Gamma Enterprises' + + start_urls = [ + 'https://customfetishvideos.com', + ] + + selector_map = { + 'date': '//script[contains(text(), "datePublished")]/text()', + 're_date': r'datePublished.*?(\d{4}-\d{2}-\d{2})', + 'performers': '//span[contains(text(), "Models:")]/a/text()|//span[contains(text(), "Model:")]/a/text()', + 'tags': '//span[contains(text(), "Fetish:")]/a/text()', + 'trailer': '//div[contains(@class, "video_slide")]/video/source/@src', + 'external_id': r'', + 'pagination': '', + 'type': 'Scene', + } + + def get_next_page_url(self, base, page): + link = f"https://customfetishvideos.com/wp-json/wc/store/v1/products?orderby=date&order=desc&catalog_visibility=catalog&per_page=18&page={page}&_locale=user" + return link + + def get_scenes(self, response): + meta = response.meta + jsondata = json.loads(response.text) + + for scene in jsondata: + meta['id'] = scene['id'] + meta['title'] = scene['name'] + if "description" in scene: + meta['description'] = unidecode.unidecode(html.unescape(re.sub(r'<[^<]+?>', '', scene['description']))) + else: + meta['description'] = unidecode.unidecode(html.unescape(re.sub(r'<[^<]+?>', '', scene['short_description']))) + meta['image'] = scene['images'][0]['src'] + meta['image_blob'] = self.get_image_blob_from_link(meta['image']) + link = scene['permalink'] + yield scrapy.Request(link, callback=self.parse_scene, meta=meta) diff --git a/scenes/siteDanni.py b/scenes/siteDanni.py new file mode 100644 index 00000000..47a1cf9c --- /dev/null +++ b/scenes/siteDanni.py @@ -0,0 +1,39 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteDanniSpider(BaseSceneScraper): + name = 'Danni' + network = 'Sexual Prime' + parent = 'Danni' + site = 'Danni' + + start_urls = [ + 'https://www.danni.com', + ] + + selector_map = { + 'title': '//div[@class="scene-title"]/text()', + 'description': '', + 'date': '', + 'image': '//script[contains(text(), "vJSPlayer")]/text()', + 're_image': r'poster.*?(http.*?)[\'\"]', + 'performers': '//div[@class="scene-title"]/following-sibling::div[contains(@class, "model-list")]/a/text()', + 'tags': '//div[@class="scene-title"]/following-sibling::div[contains(@class, "scene-tags")]/a/text()', + 'duration': '//div[contains(@class, "danni-clock")]/following-sibling::span/text()', + 'trailer': '', + 'external_id': r'.*/(.*?)_vid', + 'pagination': '/categories/videos_%s_d', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="danni-card-name-wrapper"]/div/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_id(self, response): + return super().get_id(response).lower() diff --git a/scenes/siteDarkroomVR.py b/scenes/siteDarkroomVR.py index 888b5301..d4f288b0 100644 --- a/scenes/siteDarkroomVR.py +++ b/scenes/siteDarkroomVR.py @@ -30,7 +30,7 @@ class SiteDarkroomVRSpider(BaseSceneScraper): } def get_scenes(self, response): - scenes = response.xpath('//a[@class="video-card__item"]/@href').getall() + scenes = response.xpath('//div[@class="video-card__item"]/a/@href').getall() for scene in scenes: if re.search(self.get_selector_map('external_id'), 
scene): yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) diff --git a/scenes/siteDeepLush.py b/scenes/siteDeepLush.py index 83150fcb..8327d4e8 100644 --- a/scenes/siteDeepLush.py +++ b/scenes/siteDeepLush.py @@ -7,7 +7,9 @@ class DeepLushSpider(BaseSceneScraper): network = 'Deep Lush' parent = 'Deep Lush' - start_urls = ["https://deeplush.com"] + start_urls = [ + # ~ "https://deeplush.com" + ] selector_map = { 'title': '//h2/text()', diff --git a/scenes/siteDorcelClub.py b/scenes/siteDorcelClub.py index 16a8ebc6..9864f5c9 100644 --- a/scenes/siteDorcelClub.py +++ b/scenes/siteDorcelClub.py @@ -4,6 +4,7 @@ true = True false = False + class DorcelClubSpider(BaseSceneScraper): name = 'DorcelClub' network = 'Dorcel Club' diff --git a/scenes/siteDorcelClubMovie.py b/scenes/siteDorcelClubMovie.py deleted file mode 100644 index bcaf45ee..00000000 --- a/scenes/siteDorcelClubMovie.py +++ /dev/null @@ -1,218 +0,0 @@ -import re -import scrapy -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class DorcelClubSpider(BaseSceneScraper): - name = 'DorcelClubMovie' - network = 'Dorcel Club' - parent = 'Dorcel Club' - - start_urls = [ - 'https://www.dorcelclub.com' - ] - - headers = { - 'Accept-Language': 'en-US,en', - 'x-requested-with': 'XMLHttpRequest', - } - - selector_map = { - 'title': '//h1/text()', - 'description': '///span[@class="full"]/p/text()', - 'image': 'picture img.thumbnail::attr(data-src)', - 'performers': '//div[@class="actress"]/a/text()', - 'date': '//span[@class="publish_date"]/text()', - 'tags': '', - 'external_id': 'scene/(\\d+)', - 'trailer': '', - 'pagination': '/movies/more?lang=en&page=%s' - } - - cookies = { - 'disclaimer2': 'xx' - } - - def start_requests(self): - yield scrapy.Request("https://www.dorcelclub.com/en/", callback=self.start_requests_2, headers=self.headers, cookies=self.cookies) - - def start_requests_2(self, response): - for link in self.start_urls: - yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta={'page': self.page}, headers=self.headers, cookies=self.cookies) - - def parse(self, response, **kwargs): - meta = response.meta - movies = self.get_movies(response) - count = 0 - for movie in movies: - count += 1 - meta['movie'] = movie - yield movie - # ~ for sceneurl in movie['sceneurls']: - # ~ yield scrapy.Request(self.format_link(response, sceneurl), meta=meta, callback=self.parse_scene, headers=self.headers, cookies=self.cookies) - - if count: - if 'page' in response.meta and response.meta['page'] < self.limit_pages: - meta['page'] = meta['page'] + 1 - print('NEXT PAGE: ' + str(meta['page'])) - yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - - def get_movies(self, response): - meta = response.meta - movies = response.xpath('//a[@class="movie thumbnail"]/@href').getall() - for movie in movies: - movieurl = self.format_link(response, movie) - yield scrapy.Request(movieurl, callback=self.parse_movie, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_movie(self, response): - meta = response.meta - scenes = response.xpath('//span[@class="scenes"]/text()').get() - scenes = int(re.search(r'\:\s+?(\d+)', scenes).group(1)) - if scenes > 1: - item = SceneItem() - item['title'] = self.cleanup_title(response.xpath('//h1/text()').get().strip()) - scenedate = response.xpath('//span[@class="out_date"]/text()').get() - scenedate = 
re.search(r'(\d{4})', scenedate).group(1).strip() - scenedate = scenedate + "-01-01" - item['date'] = self.parse_date(scenedate).isoformat() - item['description'] = '' - images = response.xpath('//div[@class="header"]//source[contains(@media, "max-width") and contains(@data-srcset, "cover")]/@data-srcset').getall() - images = sorted(images, reverse=True) - image = images[0] - item['image'] = re.search(r'(.*?) 1x', image).group(1) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = '' - director = response.xpath('//div[@class="director"]/text()') - if director: - director = director.get() - if ":" in director: - director = re.search(r'\:\s+?(.*)', director).group(1).strip() - if director: - item['director'] = director - else: - item['director'] = '' - item['performers'] = response.xpath('//div[contains(@class, "actor thumbnail")]/a/div/text()').getall() - item['performers'] = list(map(lambda x: x.strip(), item['performers'])) - duration = response.xpath('//span[@class="duration"]/text()') - if duration: - duration = duration.get().lower() - hours = '' - minutes = '' - seconds = '' - if "h" in duration: - hours = (int(re.search(r'(\d{1,2})h', duration).group(1)) * 3600) - if "m" not in duration: - minutes = re.search(r'h\s+?(\d{1,2})', duration) - if minutes: - minutes = minutes.group(1) - else: - hours = 0 - if "m" in duration and not minutes: - minutes = re.search(r'(\d{1,2})m', duration) - if minutes: - minutes = minutes.group(1) - else: - minutes = 0 - seconds = re.search(r'm(\d{1,2})', duration) - if seconds: - seconds = seconds.group(1) - else: - seconds = 0 - if minutes: - minutes = int(minutes) * 60 - else: - minutes = 0 - if seconds: - seconds = int(seconds) - else: - seconds = 0 - item['duration'] = str(hours + minutes + seconds) - else: - item['duration'] = None - item['tags'] = [] - item['trailer'] = '' - item['type'] = 'Movie' - item['network'] = 'Dorcel Club' - item['parent'] = 'Dorcel Club' - item['site'] = 'Dorcel Club' - item['url'] = response.url - item['id'] = re.search(r'movie/.*?/.*?/(\d+)', item['image']).group(1) - sceneurls = response.xpath('//div[@class="scenes"]/div/div/a/@href').getall() - item['scenes'] = [] - for sceneurl in sceneurls: - item['scenes'].append({'site': item['site'], 'external_id': re.search(r'scene/(\d+)/', sceneurl).group(1)}) - meta['movie'] = item - yield item - for sceneurl in sceneurls: - yield scrapy.Request(self.format_link(response, sceneurl), callback=self.parse_scene, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_scene(self, response): - meta = response.meta - movie = meta['movie'] - item = SceneItem() - item['title'] = self.cleanup_title(response.xpath('//h1[@class="title"]/text()').get().strip()) - scenedate = response.xpath('//span[@class="publish_date"]/text()') - if scenedate: - scenedate = scenedate.get() - item['date'] = self.parse_date(scenedate, date_formats=['%B %d, %Y']).isoformat() - else: - item['date'] = self.parse_date('today').isoformat() - description = response.xpath('//div[@class="content-text"]/span[@class="full"]//text()|//div[@class="content-text"]/span[@class="small"]//text()|//div[@class="content-text"]/p/text()') - if description: - item['description'] = description.get().strip() - else: - item['description'] = '' - item['image'] = '' - images = response.xpath('//div[contains(@class,"player_container")]//source[contains(@media, "max-width")]/@data-srcset') - if images: - images = images.getall() - images = sorted(images, reverse=True) - image = 
images[0] - item['image'] = re.search(r'(.*?) 1x', image).group(1) - else: - image = response.xpath('//script[contains(text(), "VodPlayer")]/text()') - if image: - image = re.search(r'image\:.*?(http.*?)\"', image.get()) - if image: - item['image'] = self.get_image(image.group(1)) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = '' - item['director'] = movie['director'] - item['performers'] = response.xpath('//div[@class="actress"]/a/text()').getall() - item['performers'] = list(map(lambda x: x.strip(), item['performers'])) - item['tags'] = [] - item['trailer'] = '' - item['type'] = 'Scene' - item['network'] = 'Dorcel Club' - item['parent'] = 'Dorcel Club' - item['site'] = 'Dorcel Club' - duration = response.xpath('//span[@class="duration"]/text()') - if duration: - duration = duration.get().lower() - if "h" in duration: - hours = (int(re.search(r'(\d{1,2})h', duration).group(1)) * 3600) - else: - hours = 0 - minutes = re.search(r'(\d{1,2})m', duration) - if minutes: - minutes = minutes.group(1) - else: - minutes = 0 - seconds = re.search(r'm(\d{1,2})', duration) - if seconds: - seconds = seconds.group(1) - else: - seconds = 0 - item['duration'] = str(hours + (int(minutes) * 60) + int(seconds)) - else: - item['duration'] = None - - item['url'] = response.url - item['id'] = re.search(r'scene/(\d+)', response.url).group(1) - yield item - - def get_image(self, image): - trash = '_' + image.split('_', 3)[-1].rsplit('.', 1)[0] - image = image.replace(trash, '', 1) - return image diff --git a/scenes/siteDownBlouseWow.py b/scenes/siteDownBlouseWow.py new file mode 100644 index 00000000..8bc032a5 --- /dev/null +++ b/scenes/siteDownBlouseWow.py @@ -0,0 +1,53 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteDownBlouseWowSpider(BaseSceneScraper): + name = 'DownblouseWow' + site = 'Downblouse Wow' + parent = 'Downblouse Wow' + network = 'Downblouse Wow' + + start_urls = [ + 'https://downblousewow.com/?videos', + ] + + selector_map = { + 'title': '//h2[@class="name"]/text()', + 'description': '', + 'date': '//div[@class="info"]//p[contains(text(), "Added")]/text()', + 're_date': r'(\w+ \d{1,2}, \d{4})', + 'image': '//div[@class="container"]//div[@class="row"]/p[1]/a[1]/img/@src', + 'performers': '//h2[@class="name"]/text()', + 'tags': '//div[@class="info"]//p[@class="tags"]/a/text()', + 'duration': '', + 'trailer': '', + 'external_id': r'lid=(\d+)', + 'pagination': '', + 'type': 'Scene', + } + + def start_requests(self): + for link in self.start_urls: + yield scrapy.Request(link, callback=self.get_scenes, headers=self.headers, cookies=self.cookies) + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="itemminfo"]/p[1]/a/@href').getall() + for scene in scenes: + if "join" not in scene: + meta['id'] = re.search(r'lid=(\d+)', scene).group(1) + if meta['id']: + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_duration(self, response): + duration = response.xpath('//div[@class="info"]//p[contains(text(), "Added")]/text()') + if duration: + duration = duration.get() + duration = re.search(r'(\d+\.\d+)min', duration) + if duration: + duration = duration.group(1) + duration = str(int(float(duration) * 60)) + return duration + return None diff --git a/scenes/siteDownBlouseWow_ByPerformer.py b/scenes/siteDownBlouseWow_ByPerformer.py new file mode 100644 index 00000000..6fb38fe9 --- /dev/null +++ 
b/scenes/siteDownBlouseWow_ByPerformer.py
@@ -0,0 +1,61 @@
+import re
+import scrapy
+from tpdb.BaseSceneScraper import BaseSceneScraper
+from tpdb.items import SceneItem
+
+
+class SiteDownBlouseWow_ByPerformerSpider(BaseSceneScraper):
+    name = 'siteDownBlouseWow_ByPerformer'
+    site = 'Downblouse Wow'
+    parent = 'Downblouse Wow'
+    network = 'Downblouse Wow'
+
+    start_urls = [
+        'https://downblousewow.com',
+    ]
+
+    selector_map = {
+        'pagination': r'/show.php?a=147_%s',
+        'external_id': r'lid=(\d+)',
+        'type': 'Scene',
+    }
+
+    def get_scenes(self, response):
+        meta = response.meta
+        performers = response.xpath('//div[@class="itemminfo"]/p/a/@href').getall()
+        for performer in performers:
+            yield scrapy.Request(url=self.format_link(response, performer), callback=self.get_performer_scenes, meta=meta)
+
+    def get_performer_scenes(self, response):
+        scenes = response.xpath('//div[contains(@class,"vidblock")]')
+        for scene in scenes:
+            item = SceneItem()
+            item['title'] = self.cleanup_title(scene.xpath('.//p[@class="vidname"]/a/text()').get())
+            item['description'] = ""
+            item['date'] = ""
+            scenedate = scene.xpath('.//p[@class="date"]/text()')
+            if scenedate:
+                scenedate = scenedate.get()
+                item['date'] = self.parse_date(scenedate).strftime('%Y-%m-%d')
+            item['image'] = ""
+            item['image_blob'] = ""
+            image = scene.xpath('.//img/@src')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            performer = scene.xpath('.//p[@class="vidname"]/a/text()')
+            item['performers'] = []
+            if performer:
+                performer = performer.get()
+                item['performers'].append(performer.strip())
+            item['tags'] = ['Downblouse', 'Voyeur']
+            item['trailer'] = ''
+            sceneid = re.search(r'/(DB\d+.*?)_\d+', item['image'])
+            if not sceneid:
+                sceneid = re.search(r'.*/(.*?)\.', item['image'])
+            item['id'] = sceneid.group(1)
+            item['network'] = "Downblouse Wow"
+            item['parent'] = "Downblouse Wow"
+            item['site'] = "Downblouse Wow"
+            item['url'] = f"https://downblousewow.com/join.html?g=content/DBW/movies/{item['id']}"
+            yield self.check_item(item, self.days)
diff --git a/scenes/siteEnjoyX.py b/scenes/siteEnjoyX.py
new file mode 100644
index 00000000..a59d50bb
--- /dev/null
+++ b/scenes/siteEnjoyX.py
@@ -0,0 +1,48 @@
+import re
+import scrapy
+from tpdb.BaseSceneScraper import BaseSceneScraper
+
+
+class SiteEnjoyxSpider(BaseSceneScraper):
+    name = 'Enjoyx'
+    site = 'Enjoyx'
+    parent = 'Enjoyx'
+    network = 'Enjoyx'
+
+    start_urls = [
+        'https://enjoyx.com',
+    ]
+
+    selector_map = {
+        'title': '//div[contains(@class, "video-detail__title")]/text()',
+        'description': '',
+        'date': '//div[contains(@class,"desktop-sidebar")]//div[contains(@class, "video-info__time")]/text()',
+        're_date': r'(\d{1,2} \w+, \d{4})',
+        'date_formats': ['%d %B, %Y'],
+        'image': '//script[contains(text(), "poster") and contains(text(), "coreSettings")]/text()',
+        're_image': r'poster.*?url.*?(http.*?)[\'\"]',
+        'performers': '//div[contains(@class,"desktop-sidebar")]//div[@class="video-info__text"]/a/text()',
+        'tags': '//div[contains(@class, "tags__container")]/a/text()',
+        'duration': '//div[contains(@class,"desktop-sidebar")]//div[contains(@class, "video-info__time")]/text()',
+        're_duration': r'((?:\d{1,2}\:)?\d{2}\:\d{2})',
+        'trailer': '',
+        'external_id': r'.*/(.*?)$',
+        'pagination': '/video?page=%s',
+        'type': 'Scene',
+    }
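+    # NOTE: the poster URL here lives inside the page's coreSettings player
+    # script rather than in an <img> tag; re_image above and get_image() below
+    # both regex it back out, get_image() after flattening whitespace first.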
+
+    def get_scenes(self, response):
+        meta = response.meta
+        scenes = response.xpath('//a[contains(@class, "videos-item")]/@href').getall()
+        for scene in scenes:
+            if re.search(self.get_selector_map('external_id'), scene):
+                yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta)
+
+    def get_image(self, response):
+        image = response.xpath('//script[contains(text(), "poster") and contains(text(), "coreSettings")]/text()')
+        if image:
+            image = "".join(image.get()).replace("\r", "").replace("\n", "").replace("\t", "")
+            image = re.search(r'poster.*?url.*?(http.*?)[\'\"]', image)
+            if image:
+                return image.group(1)
+        return ""
diff --git a/scenes/siteExposedNurses.py b/scenes/siteExposedNurses.py
new file mode 100644
index 00000000..1f6fec79
--- /dev/null
+++ b/scenes/siteExposedNurses.py
@@ -0,0 +1,62 @@
+import re
+import html
+import string
+import slugify
+import scrapy
+from tpdb.BaseSceneScraper import BaseSceneScraper
+from tpdb.items import SceneItem
+
+
+class SiteExposedNursesSpider(BaseSceneScraper):
+    name = 'ExposedNurses'
+    network = 'Apollo Cash'
+
+    start_urls = [
+        'https://www.exposednurses.com',
+    ]
+
+    selector_map = {
+        'external_id': r'',
+        'pagination': '/index.php?updates=&page=%s',
+        'type': 'Scene',
+    }
+
+    def get_scenes(self, response):
+        meta = response.meta
+        scenes = response.xpath('//th/h1/..')
+        for scene in scenes:
+            item = SceneItem()
+            item['title'] = self.cleanup_title(scene.xpath('./h1/text()').get())
+            item['description'] = ""
+            item['date'] = ''
+            item['image'] = ""
+            item['image_blob'] = ""
+            image = scene.xpath('.//img[@class="image1" and contains(@src, "1_3")]/@src')
+            if image:
+                item['image'] = self.format_link(response, image.get())
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            performers = scene.xpath('.//b[contains(text(), "Nurse")]/following-sibling::text()[1]')
+            item['performers'] = []
+            if performers:
+                performers = performers.getall()
+                for performer in performers:
+                    performer = html.unescape(performer.replace("&nbsp;", " ").replace("\xa0", " "))
+                    if "(" in performer:
+                        performer = re.search(r'(.*?) \(', performer).group(1)
+                    item['performers'].append(performer.strip())
+            tags = scene.xpath('.//b[contains(text(), "Tags")]/following-sibling::text()[1]')
+            item['tags'] = []
+            if tags:
+                tags = tags.get()
+                tags = tags.split(",")
+                item['tags'] = list(map(lambda x: string.capwords(x.strip()), tags))
+                # iterate over a copy: removing from the live list would
+                # otherwise skip the entry after each hit
+                for tag in list(item['tags']):
+                    if "..." 
in tag: + item['tags'].remove(tag) + item['trailer'] = '' + item['id'] = slugify.slugify(item['title'].lower()) + item['network'] = "Apollo Cash" + item['parent'] = "Exposed Nurses" + item['site'] = "Exposed Nurses" + item['url'] = f"https://www.exposednurses.com/{item['id']}" + yield self.check_item(item, self.days) diff --git a/scenes/siteFamilyCuckolds.py b/scenes/siteFamilyCuckolds.py new file mode 100644 index 00000000..87dbcf8c --- /dev/null +++ b/scenes/siteFamilyCuckolds.py @@ -0,0 +1,64 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteFamilyCuckoldsSpider(BaseSceneScraper): + name = 'FamilyCuckolds' + + start_url = 'https://familycuckolds.com/' + + selector_map = { + 'title': './/div[@class="name"]/span/text()', + 'description': '', + 'date': './/div[@class="date"]/text()', + 'date_formats': ['%b %d, %Y'], + 'image': './following-sibling::script[1]/text()', + 're_image': r'show_poster.*?(http.*?)[\'\"]', + 'performers': '', + 'tags': '', + 'duration': '', + 'trailer': '', + 'external_id': r'.*/(.*?)\.htm', + 'pagination': '/tour/categories/movies_%s_d.html', + 'type': 'Scene', + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + yield scrapy.Request('https://familycuckolds.com/', callback=self.get_scenes, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_scenes(self, response): + scenes = response.xpath('//div[@class="episode-item"]') + for scene in scenes: + item = SceneItem() + + item['title'] = self.get_title(scene) + item['description'] = "" + item['site'] = "Family Cuckolds" + item['date'] = self.get_date(scene) + item['image'] = self.get_image(scene, response) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + item['performers'] = [] + item['tags'] = ['Cuckold'] + sceneid = scene.xpath('./@id').get() + item['id'] = re.search(r'(\d+)', sceneid).group(1) + item['trailer'] = "" + item['url'] = self.get_url(response) + "video/" + sceneid + item['network'] = "Family Cuckolds" + item['parent'] = "Family Cuckolds" + item['type'] = 'Scene' + + yield self.check_item(item, self.days) + + def get_image(self, scene, response, path=None): + if 'image' in self.get_selector_map(): + image = self.get_element(scene, 'image', 're_image') + if isinstance(image, list): + image = image[0] + image = image.strip() + image = image.replace(" ", "%20") + return self.format_link(response, image) + return '' diff --git a/scenes/siteFapHouse.py b/scenes/siteFapHouse.py new file mode 100644 index 00000000..f9d7fa6e --- /dev/null +++ b/scenes/siteFapHouse.py @@ -0,0 +1,115 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteFapHouseSpider(BaseSceneScraper): + name = 'FapHouse' + + start_url = 'https://faphouse.com' + + paginations = [ + ['https://faphouse.com/studios/private-society?page=%s', 'Private Society'], + ] + + selector_map = { + 'title': '//h1[contains(@class, "title")]/text()', + 'description': '//div[contains(@class, "description")]//p/text()', + 'date': '//span[@class="video-publish-date"]/text()', + 're_date': r'(\d{1,2}\.\d{1,2}\.\d{4})', + 'date_formats': ['%d.%m.%Y'], + 'image': '//meta[@property="og:image"]/@content', + 'performers': '//div[@data-el="RelatedTags"]//a[contains(@href, "/pornstars/")]/span[2]/text()', + 'tags': '//div[@data-el="RelatedTags"]//a[contains(@class, "__category") and contains(@href, "/videos")]/text()', + 'duration': '//span[contains(@class, 
"video-duration")]/text()', + 'external_id': r'.*/(.*?)$', + 'pagination': '', + 'type': 'Scene', + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + + for pagination in self.paginations: + meta['pagination'] = pagination[0] + meta['site'] = pagination[1] + meta['parent'] = pagination[1] + meta['network'] = pagination[1] + yield scrapy.Request(url=self.get_next_page_url(self.start_url, self.page, meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def parse(self, response, **kwargs): + scenes = self.get_scenes(response) + count = 0 + for scene in scenes: + count += 1 + yield scene + + if count: + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta = response.meta + meta['page'] = meta['page'] + 1 + print('NEXT PAGE: ' + str(meta['page'])) + yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_next_page_url(self, base, page, pagination): + return self.format_url(base, pagination % page) + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="thumb__main"]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def parse_scene(self, response): + item = SceneItem() + + item['title'] = self.get_title(response) + item['description'] = self.get_description(response) + item['site'] = response.meta['site'] + item['date'] = self.get_date(response) + item['image'] = self.get_image(response) + + if 'image' not in item or not item['image']: + item['image'] = None + + if item['image']: + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = '' + item['image_blob'] = '' + + if item['image']: + if "?" 
in item['image'] and ("token" in item['image'].lower() or "expire" in item['image'].lower()): + item['image'] = re.search(r'(.*?)\?', item['image']).group(1) + + item['performers'] = self.get_performers(response) + item['tags'] = self.get_tags(response) + item['id'] = self.get_id(response) + item['trailer'] = self.get_trailer(response) + item['duration'] = self.get_duration(response) + item['url'] = self.get_url(response) + item['network'] = response.meta['network'] + item['parent'] = response.meta['parent'] + item['type'] = 'Scene' + + allow_site = True + if 'ignore_sites' in response.meta: + ignore_sites = response.meta['ignore_sites'] + ignore_sites = ignore_sites.split(",") + for ignore_site in ignore_sites: + ignore_site = re.sub('[^0-9a-zA-Z]', '', ignore_site.lower()) + site = re.sub('[^0-9a-zA-Z]', '', item['site'].lower()) + if site == ignore_site: + allow_site = False + + if "Private Society" in item['site']: + if item['date'] < '2022-09-22': + allow_site = False + + if allow_site: + yield self.check_item(item, self.days) + else: + print(f"*** Not processing item due to disallowed site or date: {item['site']}") diff --git a/scenes/siteFemjoy.py b/scenes/siteFemjoy.py index a38e911b..087d2f03 100644 --- a/scenes/siteFemjoy.py +++ b/scenes/siteFemjoy.py @@ -50,8 +50,10 @@ def get_scenes(self, response): item = SceneItem() item['title'] = self.cleanup_title(scene.xpath('./div/h1/a[1]/text()').get()) - item['date'] = self.parse_date(scene.xpath('./div//span[@class="posted_on"]/text()').get(), date_formats=['%b %d, %Y']).isoformat() - item['duration'] = self.duration_to_seconds(scene.xpath('./div//span[@class="posted_on"]/following-sibling::span/text()').get()) + item['date'] = self.parse_date(scene.xpath('./div//span[@class="posted_on"]/text()').get(), date_formats=['%b %d, %Y']).strftime('%Y-%m-%d') + duration = scene.xpath('./div//span[@class="posted_on"]/following-sibling::span/text()').get() + if duration: + item['duration'] = self.duration_to_seconds(duration) item['director'] = scene.xpath('.//h2/a[contains(@href, "/director/")]/text()').get() item['performers'] = scene.xpath('.//h2/a[contains(@href, "/models/")]/text()').getall() item['site'] = 'Femjoy' diff --git a/scenes/siteFrolicMe.py b/scenes/siteFrolicMe.py index 2e337b60..3be00ed3 100644 --- a/scenes/siteFrolicMe.py +++ b/scenes/siteFrolicMe.py @@ -1,7 +1,6 @@ import re import scrapy from tpdb.BaseSceneScraper import BaseSceneScraper -from scrapy.http import FormRequest class FrolicMeSpider(BaseSceneScraper): @@ -11,40 +10,50 @@ class FrolicMeSpider(BaseSceneScraper): site = 'Frolic Me' start_urls = [ - 'https://www.frolicme.com/', + 'https://www.frolicme.com', ] title_trash = ['- film', '- Film'] selector_map = { - 'title': '//div[@class="film-entry-title"]/text()', - 'description': '//div[@class="film-content"]/p/text()|//div[@class="film-content"]/p/span/text()|//div[@class="film-content"]/div/p/text()', + 'title': '//div[@class="entry-title"]/text()', + 'description': '//div[@class="entry-content"]/p/span//text()', 'date': '//script[contains(text(), "datePublished")]/text()', - 're_date': r'datePublished\": ?\"(\d{4}-\d{2}-\d{2}.*?)\"', + 're_date': r'datePublished[\'\"]:.*?(\d{4}-\d{2}-\d{2}.*?)[\'\"]', 'image': '//meta[@property="og:image"]/@content', - 'performers': '//h3/a[contains(@href,"/models/")]/text()', - 'tags': '//i[contains(@class,"fa-tags")]/following-sibling::a/text()', + 'duration': '//span[contains(@class,"inline-flex")]//i[contains(@class, "clock")]/following-sibling::text()', + 're_duration': 
r'((?:\d{1,2}\:)?\d{2}\:\d{2})', + 'performers': '//span[contains(@class,"inline-flex")]/a[contains(@href, "/models/")]/text()', + 'tags': '//span[contains(@class,"inline-flex")]//i[contains(@class, "tag")]/following-sibling::a/text()', 'external_id': r'.*\/(.*?)\/$', 'trailer': '', - 'pagination': '/publications/page/%s/' + 'pagination': '/films/page/%s/?order_by=date_desc' } def start_requests(self): - frmheaders = {} - frmheaders['Content-Type'] = 'application/x-www-form-urlencoded' - frmdata = {"dob": "1995-05-09", "country": "RU"} - url = "https://www.frolicme.com/wp-json/frolic/v1/verify" - yield FormRequest(url, headers=frmheaders, formdata=frmdata) + meta = {} + meta['page'] = self.page + yield scrapy.Request('https://www.frolicme.com', callback=self.age_verify, meta=meta, headers=self.headers, cookies=self.cookies) + + def age_verify(self, response): + meta = response.meta + yield scrapy.FormRequest(url="https://www.frolicme.com/wp-json/frolic/v1/verify", meta=meta, formdata={"dob": "1985-05-02", "country": "US", "search_terms": ""}, callback=self.start_requests_2) + + def start_requests_2(self, response): + meta = response.meta for link in self.start_urls: - yield scrapy.Request(url=self.get_next_page_url(link, self.page), - callback=self.parse, - meta={'page': self.page}, - headers=self.headers, - cookies=self.cookies) + yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) def get_scenes(self, response): - scenes = response.xpath('//article[contains(@class,"cpt_films")]/a/@href').getall() + meta = response.meta + scenes = response.xpath('//article[contains(@class, "post")]/a/@href').getall() for scene in scenes: if re.search(self.get_selector_map('external_id'), scene): - yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_title(self, response): + title = super().get_title(response) + for trash in self.title_trash: + title = title.replace(trash, "").strip() + return title diff --git a/scenes/siteFutanariXXX.py b/scenes/siteFutanariXXX.py index 3b19fdbe..b7c21c11 100644 --- a/scenes/siteFutanariXXX.py +++ b/scenes/siteFutanariXXX.py @@ -5,7 +5,7 @@ class SiteFutanariXXXSpider(BaseSceneScraper): name = 'FutanariXXX' - network = 'Futanari XXX' + network = 'Hentaied' parent = 'Futanari XXX' site = 'Futanari XXX' @@ -19,7 +19,7 @@ class SiteFutanariXXXSpider(BaseSceneScraper): 'date': '//meta[@property="article:published_time"]/@content', 'image': '//meta[@property="og:image"]/@content', 'duration': '//div[contains(@class,"duration")]/img/following-sibling::text()', - 'performers': '//div[contains(@class,"tagsmodels")]/a/text()', + 'performers': '//div[contains(@class,"taglist")]/a/text()', 'tags': '//ul[@class="post-categories"]/li/a/text()', 'director': '//div[contains(@class,"director")]/span/a/text()', 'external_id': '.*\/(.*?)\/$', diff --git a/scenes/siteGapeMyPussy.py b/scenes/siteGapeMyPussy.py new file mode 100644 index 00000000..43b1ed27 --- /dev/null +++ b/scenes/siteGapeMyPussy.py @@ -0,0 +1,56 @@ +import re +import html +import string +import slugify +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteGapeMyPussySpider(BaseSceneScraper): + name = 'GapeMyPussy' + network = 'Apollo Cash' + + start_urls = [ + 'https://www.gapemypussy.com', + ] + + selector_map = { + 
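+        # sparse map: get_scenes() below builds each SceneItem by hand from
+        # the gallery tables, so only pagination and type are read by the
+        # base class here.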
'external_id': r'', + 'pagination': '/index.php?updates=&page=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//table[@width="400"]') + for scene in scenes: + item = SceneItem() + item['title'] = self.cleanup_title(scene.xpath('.//td[@height="36"]/text()').get()) + item['description'] = "" + item['date'] = '' + item['image'] = "" + item['image_blob'] = "" + image = scene.xpath('.//img[contains(@src, "1_1")]/@src') + if image: + item['image'] = self.format_link(response, image.get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + performers = scene.xpath('.//span[contains(text(), "Model")]/b[1]/preceding-sibling::text()') + item['performers'] = [] + if performers: + performers = performers.getall() + for performer in performers: + performer = html.unescape(performer.replace(" ", " ").replace("\xa0", " ")) + if "(" in performer: + performer = re.search(r'(.*?) \(', performer).group(1) + if ":" in performer: + performer = re.search(r': (.*)', performer).group(1) + item['performers'].append(performer.strip()) + item['tags'] = ['Gaping'] + item['trailer'] = '' + item['id'] = slugify.slugify(item['title'].lower()) + item['network'] = "Apollo Cash" + item['parent'] = "Gape My Pussy" + item['site'] = "Gape My Pussy" + item['url'] = f"https://www.gapemypussy.com/{item['id']}" + yield self.check_item(item, self.days) diff --git a/scenes/siteGirlsDeep.py b/scenes/siteGirlsDeep.py new file mode 100644 index 00000000..8f50639a --- /dev/null +++ b/scenes/siteGirlsDeep.py @@ -0,0 +1,58 @@ +import re +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class siteGirlsDeepSpider(BaseSceneScraper): + name = 'GirlsDeep' + site = 'Girls Deep' + parent = 'Girls Deep' + network = 'Girls Deep' + + start_urls = [ + 'https://girlsdeep.com', + ] + + selector_map = { + 'title': './/h3/a/text()', + 'description': './/div[contains(@class,"entry-summary")]/p[1]/text()', + 'date': './/time/@datetime', + 're_date': r'(\d{4}-\d{2}-\d{2})', + 'performers': './/span[contains(text(), "MODEL:")]/following-sibling::a[1]/strong/text()', + 'tags': './/span[contains(@class,"entry-categories-inner")]/a/text()', + 'duration': './/span[contains(@class,"video-duration")]/text()', + 'trailer': '', + 'external_id': r'', + 'pagination': '/page/%s/', + 'type': 'Scene', + } + + def get_scenes(self, response): + scenes = response.xpath('//li[contains(@class,"ax-collection-item-1of2")]/article[contains(@class, "post-format-video")]') + for scene in scenes: + item = SceneItem() + item['title'] = self.get_title(scene) + item['description'] = self.get_description(scene) + item['date'] = self.get_date(scene) + image = scene.xpath('.//div[contains(@class,"entry-featured-media")]//img[contains(@data-src, "uploads")]/@data-src') + if image: + item['image'] = self.format_link(response, image.get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = '' + item['image_blob'] = '' + item['duration'] = self.get_duration(scene) + item['type'] = 'Scene' + item['url'] = self.format_link(response, scene.xpath('./div[1]/a[1]/@href').get()) + item['id'] = re.search(r'.*/(.*?)/', item['url']).group(1) + item['site'] = "Girls Deep" + item['parent'] = "Girls Deep" + item['network'] = "Girls Deep" + item['tags'] = self.get_tags(scene) + if 'ArianaBright' in item['tags']: + item['tags'].remove('ArianaBright') + item['performers'] = self.get_performers(scene) + item['trailer'] = '' + + if 
"TRAILERS" not in item['tags']: + yield self.check_item(item, self.days) diff --git a/scenes/siteHeatwaveScene.py b/scenes/siteHeatwaveScene.py new file mode 100644 index 00000000..64d265c6 --- /dev/null +++ b/scenes/siteHeatwaveScene.py @@ -0,0 +1,73 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteHeatwaveSceneSpider(BaseSceneScraper): + name = 'HeatwaveScene' + network = 'Heatwave' + parent = 'Heatwave' + site = 'Heatwave' + + start_urls = [ + 'http://www.heatwavepass.com', + ] + + selector_map = { + 'title': '//h1[@class="title"]/text()', + 'description': '', + 'date': '//div[contains(@id,"info_container")]//span[contains(text(), "Added")]/following-sibling::text()', + 'date_formats': ['%B %d, %Y'], + 'image': '//div[@id="promo-shots"]/div[1]/@style', + 're_image': r'(http.*?)\)', + 'performers': '//div[@class="cast"]//div[@class="name"]/a/text()', + 'tags': '//span[contains(text(), "Tags")]/following-sibling::a/text()', + 'trailer': '', + 'external_id': r'.*-(\d+)\.htm', + 'pagination': '/scenes.html?p=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//ul[contains(@class,"scene-list")]/li/h3/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_duration(self, response): + + duration = response.xpath('//span[contains(text(), "Duration")]/following-sibling::text()') + if duration: + duration = duration.get().lower().replace(" ", "") + hours = '' + minutes = '' + if "h" in duration: + hours = (int(re.search(r'(\d{1,2})h', duration).group(1)) * 3600) + else: + hours = 0 + if "m" in duration: + minutes = (int(re.search(r'(\d{1,2})m', duration).group(1)) * 60) + else: + minutes = 0 + if "s" in duration: + seconds = int(re.search(r'(\d{1,2})s', duration).group(1)) + else: + seconds = 0 + tot_duration = str(hours + minutes + seconds) + else: + tot_duration = None + + return tot_duration + + def get_date(self, response): + scenedate = super().get_date(response) + if not scenedate: + scenedate = "2012-01-01" + return scenedate + + def get_image(self, response): + image = super().get_image(response) + if "/images/" in image: + image = image.replace("/images/", "/sc/") + return image diff --git a/scenes/siteHookupHotshot.py b/scenes/siteHookupHotshot.py index efcb3afd..a5243f5e 100644 --- a/scenes/siteHookupHotshot.py +++ b/scenes/siteHookupHotshot.py @@ -3,7 +3,7 @@ from tpdb.BaseSceneScraper import BaseSceneScraper -class CumPerfectionSpider(BaseSceneScraper): +class SiteHookupHotshotSpider(BaseSceneScraper): name = 'HookupHotshot' network = "Hookup Hotshot" parent = "Hookup Hotshot" diff --git a/scenes/siteHookupHotshotByPerformer.py b/scenes/siteHookupHotshotByPerformer.py new file mode 100644 index 00000000..54a9ae13 --- /dev/null +++ b/scenes/siteHookupHotshotByPerformer.py @@ -0,0 +1,84 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteHookupHotshotByPerformerSpider(BaseSceneScraper): + name = 'HookupHotshotByPerformer' + network = "Hookup Hotshot" + parent = "Hookup Hotshot" + + start_urls = [ + 'https://hookuphotshot.com/' + ] + + selector_map = { + 'title': '//div[contains(@class,"videoDetails")]/h3/text()', + 'description': '', + 'date': '//span[contains(text(),"Date Added")]/following-sibling::text()', + 'image': '//script[contains(text(),"video_content")]/text()', + 
'performers': '//li[@class="update_models"]/a/text()', + 'tags': '//li[@class="label"]/following-sibling::li/a[contains(@href,"categories")]/text()', + 'external_id': '.*\\/(.*?)\\.html', + 'trailer': '//script[contains(text(),"video_content")]/text()', + 'pagination': '/models/%s/latest/?g=' + } + + def get_scenes(self, response): + models = response.xpath('//div[contains(@class, "item-portrait")]/a/@href').getall() + for model in models: + if "/models/" in model: + yield scrapy.Request(url=self.format_link(response, model), callback=self.get_model_scenes) + + def get_model_scenes(self, response): + scenes = response.xpath('//div[@class="item-thumb"]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) + + def get_trailer(self, response): + trailer = self.process_xpath( + response, self.get_selector_map('trailer')).get() + if trailer: + trailer = re.search('src=\"(.*.mp4)\"', trailer) + if trailer: + trailer = trailer.group(1) + trailer = trailer.replace(" ", "%20") + trailer = "https://hookuphotshot.com" + trailer + return trailer + return '' + + + def get_image(self, response): + image = self.process_xpath(response, self.get_selector_map('image')).get() + if image: + image = re.search('poster=\"(.*.jpg)\"', image) + if image: + image = image.group(1) + else: + image = response.xpath('//img[contains(@class,"update_thumb")]/@src0_1x').get() + + if image: + image = image.replace(" ", "%20") + image = image.replace("https//", "https://") + image = image.replace("http//", "http://") + if "https://" not in image: + image = "https://hookuphotshot.com" + image + return image + else: + return '' + + + def get_tags(self, response): + if self.get_selector_map('tags'): + tags = self.process_xpath( + response, self.get_selector_map('tags')).getall() + if tags: + return list(map(lambda x: x.strip().title(), tags)) + return [] + + def get_description(self,response): + return '' + + def get_site(self, response): + return "Hookup Hotshot" diff --git a/scenes/siteHotMovies.py b/scenes/siteHotMovies.py index 6ea8e3b4..e40d353f 100644 --- a/scenes/siteHotMovies.py +++ b/scenes/siteHotMovies.py @@ -267,7 +267,7 @@ def convert_duration(self, duration): if "PT" in duration: if "H" in duration: duration = re.search(r'(\d{1,2})H(\d{1,2})M(\d{1,2})S', duration) - hours = int(duration.group(1)) * 3660 + hours = int(duration.group(1)) * 3600 minutes = int(duration.group(2)) * 60 seconds = int(duration.group(3)) duration = str(hours + minutes + seconds) diff --git a/scenes/siteIWantClipsSpecific.py b/scenes/siteIWantClipsSpecific.py index d94f95f0..249808b5 100644 --- a/scenes/siteIWantClipsSpecific.py +++ b/scenes/siteIWantClipsSpecific.py @@ -34,7 +34,7 @@ def start_requests(self): yield scrapy.Request(url=self.get_next_page_url(link, page + 1), callback=self.parse_token, meta={'page': page, 'url': link}, cookies=self.cookies) def get_next_page_url(self, base, page): - return f"https://iwantclips.com/store/537048/K8-Morgan?page={page}" + return f"https://iwantclips.com/store/1425368/CarlaCute?page={page}" def parse_token(self, response): match = re.search(r'searchClient.*?, \'(.*?)\'', response.text) diff --git a/scenes/siteIbicella.py b/scenes/siteIbicella.py new file mode 100644 index 00000000..9e1f34a5 --- /dev/null +++ b/scenes/siteIbicella.py @@ -0,0 +1,72 @@ +import re +import string +import unidecode +import html +import json +import scrapy +from deep_translator import 
GoogleTranslator
+from tpdb.BaseSceneScraper import BaseSceneScraper
+from tpdb.items import SceneItem
+
+
+class SiteIbicellaSpider(BaseSceneScraper):
+    name = 'Ibicella'
+    network = 'Ibicella'
+    site = 'Ibicella'
+
+    start_urls = [
+        'https://ibicella.com',
+    ]
+
+    selector_map = {
+        'external_id': r'',
+        'pagination': '/api/product?slug=ibicellastudio&page=%s&per_page=12',
+        'type': 'Scene',
+    }
+
+    def get_scenes(self, response):
+        meta = response.meta
+        scenes = json.loads(response.text)
+        scenes = scenes['data']
+        for scene in scenes:
+            meta['id'] = scene['id']
+            meta['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', scene['published']).group(1)
+            meta['title'] = GoogleTranslator(source='fr', target='en').translate(unidecode.unidecode(self.cleanup_title(scene['title'])))
+            meta['duration'] = scene['file_duration']
+            meta['trailer'] = scene['file_preview_video']
+            meta['image'] = scene['file_preview_image']
+            meta['image_blob'] = self.get_image_blob_from_link(meta['image'])
+            meta['description'] = GoogleTranslator(source='fr', target='en').translate(unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', scene['description'])).replace("\n", " ").strip()))
+            if meta['id']:
+                # ~ link = f"https://ibicella.com/shop/{scene['perma_name']}"
+                link = f"https://ibicella.com/api/product/{scene['perma_name']}?name=1"
+                yield scrapy.Request(link, callback=self.parse_scene, meta=meta)
+
+    def parse_scene(self, response):
+        meta = response.meta
+        scene = json.loads(response.text)
+        item = SceneItem()
+        item['title'] = meta['title']
+        item['date'] = meta['date']
+        item['id'] = meta['id']
+        item['duration'] = meta['duration']
+        item['image'] = meta['image']
+        item['image_blob'] = meta['image_blob']
+        item['description'] = meta['description']
+        item['trailer'] = meta['trailer']
+        item['performers'] = ['Ibicella']
+        taglist = []
+        if "categories" in scene:
+            for category in scene['categories']:
+                taglist.append(category['title_en'])
+        if "tags" in scene:
+            for tag in scene['tags']:
+                taglist.append(tag['title_en'])
+        taglist = list(map(lambda x: GoogleTranslator(source='fr', target='en').translate(x), taglist))
+        taglist = list(map(lambda x: string.capwords(x.strip(",").strip().lower()), taglist))
+        item['tags'] = [i for n, i in enumerate(taglist) if i not in taglist[:n]]
+        item['site'] = 'Ibicella'
+        item['network'] = 'Ibicella'
+        item['url'] = f"https://ibicella.com/shop/{scene['perma_name']}"
+
+        yield self.check_item(item, self.days)
diff --git a/scenes/siteInkaporn.py b/scenes/siteInkaporn.py
new file mode 100644
index 00000000..d13bc8ed
--- /dev/null
+++ b/scenes/siteInkaporn.py
@@ -0,0 +1,39 @@
+import re
+import scrapy
+from tpdb.BaseSceneScraper import BaseSceneScraper
+
+
+class siteInkaPornSpider(BaseSceneScraper):
+    name = 'InkaPorn'
+    network = 'InkaPorn'
+
+    start_urls = [
+        'https://www.inkaporn.com',
+        'https://www.inkasex.com',
+        'https://www.xekeko.com',
+    ]
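+    # NOTE: the three domains appear to serve the same templates, so one
+    # spider crawls them all; get_id() below lower-cases the URL slug so
+    # ids stay consistent across the sites.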
+
+    selector_map = {
+        'title': '//h1/text()',
+        'description': '//p[contains(@itemprop, "description")]/text()',
+        'date': '//script[contains(text(), "uploadDate")]/text()',
+        're_date': r'(\d{4}-\d{2}-\d{2})',
+        'image': '//meta[@property="og:image"]/@content',
+        'performers': '',
+        'tags': '',
+        'duration': '',
+        'trailer': '',
+        'external_id': r'.*/(.*?)\.htm',
+        'pagination': '/videos/latest?page_id=%s',
+        'type': 'Scene',
+    }
+
+    def get_scenes(self, response):
+        meta = response.meta
+        scenes = response.xpath('//div[@class="video-title"]/a[1]/@href').getall()
+        for scene in scenes:
+            if re.search(self.get_selector_map('external_id'), scene):
+                yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta)
+
+    def get_id(self, response):
+        return super().get_id(response).lower()
diff --git a/scenes/siteJennyJizz.py b/scenes/siteJennyJizz.py
new file mode 100644
index 00000000..66b1d1d7
--- /dev/null
+++ b/scenes/siteJennyJizz.py
@@ -0,0 +1,41 @@
+import re
+import scrapy
+from tpdb.BaseSceneScraper import BaseSceneScraper
+
+
+class SiteJennyJizzSpider(BaseSceneScraper):
+    name = 'JennyJizz'
+    site = 'Jenny Jizz'
+    parent = 'Jenny Jizz'
+    network = 'Jenny Jizz'
+
+    start_urls = [
+        'https://www.jennyjizz.com',
+    ]
+
+    selector_map = {
+        'title': '//div[contains(@class, "pagetitle")]//h1/text()',
+        'description': '//div[@class="videocontent"]/p/text()',
+        'date': '//div[@class="videodetails"]/p[@class="date"]/text()',
+        're_date': r'(\d{1,2}/\d{1,2}/\d{4})',
+        'date_formats': ['%m/%d/%Y'],
+        'image': '//div[@class="videoplayer"]/img/@src0_3x|//div[@class="videoplayer"]/img/@src0_2x|//div[@class="videoplayer"]/img/@src0_1x',
+        'performers': '//span[contains(@class,"tour_update_models")]/a/text()',
+        'tags': '',
+        'duration': '//div[@class="videodetails"]/p[@class="date"]/text()',
+        're_duration': r'((?:\d{1,2}\:)?\d{2}\:\d{2})',
+        'trailer': '',
+        'external_id': r'.*/(.*?)\.htm',
+        'pagination': '/tour/categories/Movies_%s_d.html',
+        'type': 'Scene',
+    }
+
+    def get_scenes(self, response):
+        meta = response.meta
+        scenes = response.xpath('//h3/a[contains(@href, "/trailers/")]/@href').getall()
+        for scene in scenes:
+            if re.search(self.get_selector_map('external_id'), scene):
+                yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta)
+
+    def get_id(self, response):
+        return super().get_id(response).lower()
diff --git a/scenes/siteKendraSunderland.py b/scenes/siteKendraSunderland.py
new file mode 100644
index 00000000..48592807
--- /dev/null
+++ b/scenes/siteKendraSunderland.py
@@ -0,0 +1,63 @@
+import re
+import html
+import json
+import requests
+import unidecode
+from tpdb.BaseSceneScraper import BaseSceneScraper
+from tpdb.items import SceneItem
+
+
+class SiteKendraSunderlandVIPSpider(BaseSceneScraper):
+    name = 'KendraSunderlandVIP'
+
+    start_urls = [
+        'https://www.kendrasunderlandvip.com',
+    ]
+
+    selector_map = {
+        'external_id': r'',
+        'pagination': '/wp-json/wp/v2/video_posts?per_page=10&page=%s',
+    }
+
+    def get_scenes(self, response):
+        meta = response.meta
+        jsondata = json.loads(response.text)
+        for scene in jsondata:
+            item = SceneItem()
+
+            item['id'] = scene['id']
+            item['title'] = unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', scene['title']['rendered'])).replace("\n", " ").strip())
+            item['description'] = unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', scene['content']['rendered'])).replace("\n", " ").strip())
+            item['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', scene['date']).group(1)
+            item['url'] = scene['link']
+            item['performers'] = ['Kendra Sunderland']
+            item['tags'] = []
+
+            item['trailer'] = ''
+            item['site'] = "Kendra Sunderland"
+            item['parent'] = "Kendra Sunderland"
+            item['network'] = "Kendra Sunderland"
+            item['type'] = 'Scene'
+            meta['item'] = item
+
+            if "wp:featuredmedia" in scene['_links'] and scene['_links']['wp:featuredmedia'][0]['href']:
+                image_url = scene['_links']['wp:featuredmedia'][0]['href']
+            else:
+                image_url = None
+
+            item['image'] = None
+            item['image_blob'] = None
+            if image_url:
+                req = requests.get(image_url)
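+                # the wp:featuredmedia endpoint is fetched with plain
+                # requests here, outside Scrapy's scheduler, so the media
+                # JSON lookup happens synchronously per scene.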
+                if req and len(req.text) > 5:
+                    imagerow = json.loads(req.text)
+                else:
+                    imagerow = None
+
+                if imagerow and 'guid' in imagerow:
+                    if 'rendered' in imagerow['guid'] and imagerow['guid']['rendered']:
+                        item['image'] = imagerow['guid']['rendered']
+                        item['image_blob'] = self.get_image_blob_from_link(item['image'])
+
+            if "VIP" in item['title']:
+                yield self.check_item(item, self.days)
diff --git a/scenes/siteKingNoireXXX.py b/scenes/siteKingNoireXXX.py
new file mode 100644
index 00000000..b3afeb07
--- /dev/null
+++ b/scenes/siteKingNoireXXX.py
@@ -0,0 +1,71 @@
+import re
+import json
+import scrapy
+from tpdb.BaseSceneScraper import BaseSceneScraper
+from tpdb.items import SceneItem
+
+
+class SiteKingNoireXXXSpider(BaseSceneScraper):
+    name = 'KingNoireXXX'
+    site = 'KingNoireXXX'
+    parent = 'KingNoireXXX'
+    network = 'KingNoireXXX'
+
+    start_urls = [
+        'https://kingnoirexxx.mymember.site',
+    ]
+
+    selector_map = {
+        'external_id': r'',
+        'pagination': '/api/videos?count=20&page=%s',
+    }
+
+    def get_scenes(self, response):
+        meta = response.meta
+        jsondata = json.loads(response.text)
+        jsondata = jsondata['data']
+
+        for scene in jsondata:
+            link = f"https://kingnoirexxx.mymember.site/api/videos/{str(scene['id'])}"
+            yield scrapy.Request(link, callback=self.parse_scene, meta=meta)
+
+    def parse_scene(self, response):
+        scene = json.loads(response.text)
+        item = SceneItem()
+
+        item['id'] = scene['id']
+        item['duration'] = scene['duration']
+        item['title'] = scene['title']
+        if "{" in item['title'] and "}" in item['title']:
+            item['title'] = re.sub(r'{.*?}', "", item['title']).strip()
+
+        if "description" in scene and scene['description']:
+            item['description'] = scene['description']
+        else:
+            item['description'] = ""
+
+        item['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', scene['publish_date']).group(1)
+
+        item['performers'] = []
+        for performer in scene['casts']:
+            item['performers'].append(performer['screen_name'])
+
+        item['tags'] = []
+        for tag in scene['tags']:
+            item['tags'].append(tag['name'])
+
+        item['image'] = scene['poster_src']
+        if item['image']:
+            item['image_blob'] = self.get_image_blob_from_link(item['image'])
+        else:
+            # keep the field populated when poster_src comes back empty
+            item['image_blob'] = ''
+
+        item['trailer'] = ""
+
+        item['type'] = "Scene"
+        item['site'] = "KingNoireXXX"
+        item['parent'] = "KingNoireXXX"
+        item['network'] = "KingNoireXXX"
+
+        item['url'] = f"https://kingnoirexxx.com/videos/{item['id']}"
+
+        if item['id'] and item['title']:
+            yield self.check_item(item, self.days)
diff --git a/scenes/siteKinkPlaywright.py b/scenes/siteKinkPlaywright.py
index 42dce5df..ee3a8f7b 100644
--- a/scenes/siteKinkPlaywright.py
+++ b/scenes/siteKinkPlaywright.py
@@ -12,7 +12,7 @@ class NetworkKinkSpider(BaseSceneScraper):
     url = 'https://www.kink.com'
 
     paginations = [
-        '/search?type=shoots&thirdParty=false&sort=published&page=%s',
+        '/shoots?thirdParty=false&sort=published&page=%s',
         # ~ '/search?type=shoots&sort=published&featuredIds=%s',
         # ~ '/search?type=shoots&sort=published&thirdParty=true&page=%s',
         # ~ '/search?type=shoots&sort=published&channelIds=wasteland&sort=published&page=%s',
@@ -49,7 +49,7 @@ class NetworkKinkSpider(BaseSceneScraper):
     custom_scraper_settings = {
         'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
         'AUTOTHROTTLE_ENABLED': True,
-        'USE_PROXY': True,
+        'USE_PROXY': False,
         'AUTOTHROTTLE_START_DELAY': 1,
         'AUTOTHROTTLE_MAX_DELAY': 60,
         'CONCURRENT_REQUESTS': 1,
@@ -74,7 +74,8 @@ def start_requests(self):
 
     def start_requests2(self, response):
         for pagination in self.paginations:
-            yield 
scrapy.Request(url=self.get_next_page_url(self.url, self.page, pagination), callback=self.parse, meta={'page': self.page, 'pagination': pagination, "playwright": True}, headers=self.headers, cookies=self.cookies) + link = self.get_next_page_url(self.url, self.page, pagination) + yield scrapy.Request(link, callback=self.parse, meta={'page': self.page, 'pagination': pagination, "playwright": True}, headers=self.headers, cookies=self.cookies) def parse(self, response, **kwargs): if response.status == 200: @@ -93,7 +94,7 @@ def parse(self, response, **kwargs): def get_scenes(self, response): meta = response.meta - scenes = response.xpath("//a[@class='shoot-link']/@href").getall() + scenes = response.xpath('//div[contains(@class, "d-block")]/a/@href').getall() for scene in scenes: if re.search(self.get_selector_map('external_id'), scene): yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) diff --git a/scenes/siteLadyFyre.py b/scenes/siteLadyFyre.py new file mode 100644 index 00000000..5f463653 --- /dev/null +++ b/scenes/siteLadyFyre.py @@ -0,0 +1,51 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteLadyFyreSpider(BaseSceneScraper): + name = 'LadyFyre' + site = 'Lady Fyre' + parent = 'Lady Fyre' + network = 'Lady Fyre' + + start_urls = [ + 'https://ladyfyre.com', + ] + + selector_map = { + 'title': '//div[@class="update_block"]//span[contains(@class, "update_title")]/text()', + 'description': '//div[@class="update_block"]//span[contains(@class, "update_description")]//text()', + 'date': '//div[@class="update_block"]//span[contains(@class, "availdate")]/text()[1]', + 'date_formats': ['%m/%d/%Y'], + 'image': '//div[@class="update_block"]//div[contains(@class, "update_image")]//img[contains(@class, "large_update_thumb")]/@src', + 'performers': '//div[@class="update_block"]//span[contains(@class, "update_models")]/a/text()[1]', + 'tags': '//div[@class="update_block"]//span[contains(@class, "update_tags")]/a/text()[1]', + 'duration': '', + 'trailer': '', + 'external_id': r'.*/(.*?)\.htm', + 'pagination': '/tour/categories/movies_%s_d.html', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//h4/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_duration(self, response): + duration = response.xpath('//div[contains(@class, "update_counts")]/text()') + if duration: + duration = duration.get() + duration = duration.replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", " ").replace("\xa0", " ").replace(" ", "").lower() + duration = re.search(r'(\d+)min', duration) + if duration: + duration = duration.group(1) + return str(int(duration) * 60) + return None + + def get_id(self, response): + sceneid = super().get_id(response) + return sceneid.lower() diff --git a/scenes/siteLadyboyGold.py b/scenes/siteLadyboyGold.py new file mode 100644 index 00000000..8b3e49ad --- /dev/null +++ b/scenes/siteLadyboyGold.py @@ -0,0 +1,80 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteLadyboyGoldSpider(BaseSceneScraper): + name = 'LadyboyGold' + site = 'Ladyboy Gold' + parent = 'Ladyboy Gold' + network = 'Ladyboy Gold' + + selector_map = { + 'title': './/p[@class="setTitle"]/text()', + 'description': './/p[contains(@class, 
"setTRT") and contains(@class, "hidden-xs")]/text()', + 'date': '', + 'image': './/img/@src', + 'performers': './/p[@class="setModel"]/a/text()', + 'tags': './/p[contains(@class,"setTags")]/a/text()', + 'trailer': '', + 'external_id': r'', + 'pagination': '', + 'type': 'Scene', + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + + link = 'https://www.ladyboygold.com/index.php?section=1810' + yield scrapy.Request(link, callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@class, "videoUpdate") and contains(@class, "col-xxxld")]') + for scene in scenes: + item = SceneItem() + item['title'] = self.get_title(scene) + item['description'] = self.get_description(scene) + scenedate = scene.xpath('./comment()[contains(., "modelNames")]').getall() + scenedate = "".join(scenedate) + scenedate = scenedate.replace('\n', '').replace('\r', '').replace('\t', '') + item['date'] = '' + if scenedate: + scenedate = re.search(r'(\w+\s+\d{1,2},\s+\d{4})', scenedate) + if scenedate: + scenedate = scenedate.group(1) + item['date'] = self.parse_date(scenedate, date_formats=['%b %d, %Y']).strftime('%Y-%m-%d') + + item['image'] = scene.xpath('.//img/@src').get() + if "&width" in item['image']: + item['image'] = re.search(r'(.*)\&width', item['image']).group(1) + if "&height" in item['image']: + item['image'] = re.search(r'(.*)\&height', item['image']).group(1) + if "&crop" in item['image']: + item['image'] = re.search(r'(.*)\&crop', item['image']).group(1) + if item['image']: + item['image'] = "https://www.ladyboygold.com/" + item['image'] + if item['image']: + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image_blob'] = '' + + item['id'] = re.search(r'gal=(\d+)', item['image']).group(1) + item['trailer'] = self.get_trailer(scene) + duration = scene.xpath('.//p[@class="setTRT" and contains(text(), "Minutes")]/text()') + if duration: + duration = re.search(r'(\d+)', duration.get()) + if duration: + item['duration'] = str(int(duration.group(1)) * 60) + + item['url'] = f"https://www.ladyboygold.com/video/{item['id']}" + item['network'] = self.network + item['site'] = self.site + item['parent'] = self.parent + + item['performers'] = self.get_performers(scene) + item['tags'] = self.get_tags(scene) + + yield self.check_item(item, self.days) diff --git a/scenes/siteLadyboyGoldByPerformer.py b/scenes/siteLadyboyGoldByPerformer.py new file mode 100644 index 00000000..f8135118 --- /dev/null +++ b/scenes/siteLadyboyGoldByPerformer.py @@ -0,0 +1,87 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteLadyboyGoldByPerformerSpider(BaseSceneScraper): + name = 'LadyboyGoldByPerformer' + site = 'Ladyboy Gold' + parent = 'Ladyboy Gold' + network = 'Ladyboy Gold' + + selector_map = { + 'title': './/p[@class="setTitle"]/text()', + 'description': './/p[contains(@class, "setTRT") and contains(@class, "hidden-xs")]/text()', + 'date': '', + 'image': './/img/@src', + 'performers': './/p[@class="setModel"]/a/text()', + 'tags': './/p[contains(@class,"setTags")]/a/text()', + 'trailer': '', + 'external_id': r'', + 'pagination': '', + 'type': 'Scene', + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + + link = 'https://www.ladyboygold.com/index.php?section=1813' + yield scrapy.Request(link, callback=self.get_performers, meta=meta, headers=self.headers, 
cookies=self.cookies) + + + def get_performers(self, response): + performers = response.xpath('//p[@class="setModel"]/a/@href').getall() + for performer in performers: + performer = re.search(r'(.*?)\&nats', performer).group(1) + yield scrapy.Request(url=self.format_link(response, performer), callback=self.get_scenes, cookies=self.cookies, headers=self.headers) + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@class, "videoUpdate") and contains(@class, "col-xxxld")]') + for scene in scenes: + item = SceneItem() + item['title'] = self.get_title(scene) + item['description'] = self.get_description(scene) + scenedate = scene.xpath('./comment()[contains(., "modelNames")]').getall() + scenedate = "".join(scenedate) + scenedate = scenedate.replace('\n', '').replace('\r', '').replace('\t', '') + item['date'] = '' + if scenedate: + scenedate = re.search(r'(\w+\s+\d{1,2},\s+\d{4})', scenedate) + if scenedate: + scenedate = scenedate.group(1) + item['date'] = self.parse_date(scenedate, date_formats=['%b %d, %Y']).strftime('%Y-%m-%d') + + item['image'] = scene.xpath('.//img/@src').get() + if "&width" in item['image']: + item['image'] = re.search(r'(.*)\&width', item['image']).group(1) + if "&height" in item['image']: + item['image'] = re.search(r'(.*)\&height', item['image']).group(1) + if "&crop" in item['image']: + item['image'] = re.search(r'(.*)\&crop', item['image']).group(1) + if item['image']: + item['image'] = "https://www.ladyboygold.com/" + item['image'] + if item['image']: + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image_blob'] = '' + + item['id'] = re.search(r'gal=(\d+)', item['image']).group(1) + item['trailer'] = self.get_trailer(scene) + duration = scene.xpath('.//p[@class="setTRT" and contains(text(), "Minutes")]/text()') + if duration: + duration = re.search(r'(\d+)', duration.get()) + if duration: + item['duration'] = str(int(duration.group(1)) * 60) + + item['url'] = f"https://www.ladyboygold.com/video/{item['id']}" + item['network'] = self.network + item['site'] = self.site + item['parent'] = self.parent + + item['performers'] = response.xpath('//div[@class="container"]/div[1]/div[2]/div[1]/text()').getall() + item['tags'] = self.get_tags(scene) + + yield self.check_item(item, self.days) diff --git a/scenes/siteLasVegasAmateurs.py b/scenes/siteLasVegasAmateurs.py index 9c64c127..36178a92 100644 --- a/scenes/siteLasVegasAmateurs.py +++ b/scenes/siteLasVegasAmateurs.py @@ -1,96 +1,64 @@ import re -from datetime import date, timedelta +import html +import scrapy from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem class SiteLasVegasAmateursSpider(BaseSceneScraper): name = 'LasVegasAmateurs' + site = 'Las Vegas Amateurs' + parent = 'Las Vegas Amateurs' network = 'Las Vegas Amateurs' - max_pages = 100 start_urls = [ - 'http://lasvegasamateurs.com' + 'https://lasvegasamateurs.com', ] selector_map = { - 'title': '//h1[contains(@class, "title")]/text()', - 'description': '//p[contains(@class, "description")]/text()', - 'performers': '//span[contains(@class,"models")]/a/text()', - 'date': '//div[contains(@class, "date")]/text()', - 'image': '//meta[@property="og:image"]/@content', - 'tags': '//div[contains(@class, "video-tags")]/a/text()', - 'trailer': '', - 'external_id': r'trailers/(.*)\.html', - 'pagination': '/tour/categories/updates_%s_d.html' + 'title': '//span[@class="update_title"]/text()', + 'description': '//span[@class="latest_update_description"]//text()', 
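+        # the tour pages print dates as MM/DD/YYYY; date_formats below tells
+        # parse_date() how to read them.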
+ 'date': '//span[@class="availdate"]/text()', + 'date_formats': ['%m/%d/%Y'], + 'image': '//comment()[contains(., "First Thumb Spot")]/following-sibling::a[1]/img/@src0_4x', + 'performers': '//span[@class="tour_update_models"]/a/text()', + 'tags': '//span[@class="update_tags"]/a/text()', + 'duration': '//div[@class="update_counts_preview_table"]/text()[contains(., "min")]', + 'trailer': '//div[@class="update_image"]/a[contains(@onclick, ".mp4")][1]/@onclick', + 're_trailer': r'tload.*?(/.*?)[\'\"]', + 'external_id': r'.*/(.*?)\.htm', + 'pagination': '/tour/categories/updates_%s.html', + 'type': 'Scene', } def get_scenes(self, response): - scenes = response.xpath('//div[@class="updateItem"]') - if response.meta['page'] < self.max_pages: - for scene in scenes: - item = SceneItem() - title = scene.xpath('./div/h5/a/text()').get() - if title: - item['title'] = self.cleanup_title(title) - else: - item['title'] = '' + meta = response.meta + scenes = response.xpath('//div[@class="updateItem"]/div[1]/a[1]/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) - item['description'] = '' + def get_duration(self, response): + duration = response.xpath('//div[@class="update_counts_preview_table"]/text()[contains(., "min")]') + if duration: + duration = duration.get() + duration = re.sub('[^a-z0-9]', '', duration.lower()) - performers = scene.xpath('.//span[@class="tour_update_models"]/a/text()') - if performers: - performers = performers.getall() - item['performers'] = list(map(lambda x: x.strip(), performers)) - else: - item['performers'] = [] + duration = re.search(r'(\d+)min', duration) + if duration: + return str(int(duration.group(1)) * 60) + return None - scenedate = scene.xpath('./div/p/span[contains(text(), "/")]/text()') - item['date'] = self.parse_date('today').isoformat() - if scenedate: - item['date'] = self.parse_date(scenedate.get(), date_formats=['%m/%d/%Y']).isoformat() + def get_image(self, response): + image = super().get_image(response) + image = image.replace(".com/content", ".com/tour/content") + return image - image = scene.xpath('./div/a/img/@src0_3x') - if image: - image = image.get() - item['image'] = "http://lasvegasamateurs.com/tour/" + image.strip().replace(" ", "%20") - else: - item['image'] = None + def get_trailer(self, response): + trailer = super().get_trailer(response) + trailer = trailer.replace(".com/content", ".com/tour/content") + return trailer - item['image_blob'] = self.get_image_blob_from_link(item['image']) - - item['tags'] = [] - trailer = scene.xpath('./div/a/@onclick') - item['trailer'] = '' - if trailer: - trailer = trailer.get() - trailer = re.search(r'\'(/.*.mp4)', trailer) - if trailer: - item['trailer'] = 'https://lasvegasamateurs.com' + trailer.group(1).strip().replace(" ", "%20") - item['site'] = "Las Vegas Amateurs" - item['parent'] = "Las Vegas Amateurs" - item['network'] = "Las Vegas Amateurs" - - extern_id = re.search(r'content/(.*?)/.*?.jpg', item['image']) - if extern_id: - item['id'] = extern_id.group(1).strip().lower() - - item['url'] = response.url - - days = int(self.days) - if days > 27375: - filterdate = "0000-00-00" - else: - filterdate = date.today() - timedelta(days) - filterdate = filterdate.strftime('%Y-%m-%d') - - if self.debug: - if not item['date'] > filterdate: - item['filtered'] = "Scene filtered due to date restraint" - print(item) - else: - if filterdate: - if item['date'] > 
filterdate: - yield item - else: - yield item + def get_id(self, response): + image = self.get_image(response) + sceneid = re.search(r'.*/(.*?)/', image).group(1) + return sceneid.lower() diff --git a/scenes/siteLatinaRaw.py b/scenes/siteLatinaRaw.py index 12db874e..6e759ea7 100644 --- a/scenes/siteLatinaRaw.py +++ b/scenes/siteLatinaRaw.py @@ -40,7 +40,7 @@ def start_requests(self): 'X-Nats-Cms-Area-Id': '2', 'X-Nats-Entity-Decode': '1', } - link = 'https://idsandbox.hostednats.com/tour_api.php/content/sets?cms_set_ids=&data_types=1&content_count=1&count=100&start=0&cms_area_id=2&cms_block_id=100695&orderby=published_desc&content_type=video&status=enabled&data_type_search=%7B%22100001%22:%22163%22%7D' + link = 'https://idsandbox.hostednats.com/tour_api.php/content/sets?cms_set_ids=&data_types=1&content_count=1&count=100&start=0&cms_area_id=2&cms_block_id=100695&orderby=published_desc&content_type=video&status=enabled&text_search=&data_type_search=%7B%22100001%22:%22163%22%7D' yield scrapy.Request(link, callback=self.get_scenes, meta=meta, headers=headers, cookies=self.cookies) def get_scenes(self, response): diff --git a/scenes/siteLetsDoeIt.py b/scenes/siteLetsDoeIt.py index b488aacf..94e3894d 100644 --- a/scenes/siteLetsDoeIt.py +++ b/scenes/siteLetsDoeIt.py @@ -13,7 +13,7 @@ class LetsDoeItSpider(BaseSceneScraper): 'https://www.letsdoeit.com', 'https://amateureuro.com', 'https://mamacitaz.com/', - 'https://dirtycosplay.com/', + # ~ # 'https://dirtycosplay.com/', Paywalled 'https://transbella.com/', 'https://vipsexvault.com', ] @@ -33,12 +33,15 @@ class LetsDoeItSpider(BaseSceneScraper): } def get_scenes(self, response): + meta = response.meta responsetext = response.xpath('//*').getall() responsetext = "".join(responsetext) scenes = re.findall(r'a\ target=\"_self\" class=\"-g-vc-fake\"\ href=\"(.*?.html)\"', responsetext) for scene in scenes: if re.search(self.get_selector_map('external_id'), scene): - yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) + meta['id'] = re.search(r'/watch/(.*)/', scene).group(1) + meta['url'] = self.format_link(response, scene) + yield scrapy.Request(meta['url'], callback=self.parse_scene, meta=meta) def get_site(self, response): site = response.xpath('//div[@class="-mvd-grid-actors"]/span[1]/a[1]/text()').get().strip() @@ -87,12 +90,13 @@ def get_date(self, response): if scenedate: return scenedate - scenedate = response.xpath('//div[@class="-mvd-grid-stats"]/text()') + # ~ scenedate = response.xpath('//div[contains(@class,"video-top-details")]//div[contains(@class,"mvd-grid-stats")]//text()') + scenedate = response.xpath('//div[contains(text(), "Views")]/text()') if scenedate: scenedate = scenedate.get() scenedate = re.search(r'(\w+ \d{2}, \d{4})', scenedate) if scenedate: scenedate = scenedate.group(1) - return self.parse_date(scenedate, date_formats=['%b %d, %Y']).isoformat() + return self.parse_date(scenedate, date_formats=['%b %d, %Y']).strftime('%Y-%m-%d') return None diff --git a/scenes/siteLifeSelector.py b/scenes/siteLifeSelector.py index 50fb830c..8a054130 100644 --- a/scenes/siteLifeSelector.py +++ b/scenes/siteLifeSelector.py @@ -68,38 +68,38 @@ def get_next_page_url(self, base, page): return self.format_url(base, self.get_selector_map('pagination') % (page, timestamp)) def get_scenes(self, response): - scenes = response.xpath('//div[contains(@class, "episodeBlock") and contains(@class, "normal")]') + scenes = response.xpath('//div[contains(@class, "episodeBlock") and contains(@class, "notOrdered")]') for 
scene in scenes: item = SceneItem() - item['title'] = self.cleanup_title(scene.xpath('.//img[contains(@class,"gamePic")]/@title').get()) + item['title'] = self.cleanup_title(scene.xpath('./div/h3[@class="game-title"]/text()').get()) item['description'] = "" - description = scene.xpath('.//div[@class="td story"]//text()[not(ancestor::a) and not(ancestor::em)]') + description = scene.xpath('.//div[@class="story"]//text()') if description: item['description'] = " ".join(list(map(lambda x: x.strip(), description.getall()))).strip().replace("\n", "").replace("\t", "").replace("\r", "") item['date'] = '' item['image'] = "" item['image_blob'] = "" - image = scene.xpath('.//img[contains(@class,"gamePic")]/@src') + image = scene.xpath('./a[1]/img/@src') if image: item['image'] = self.format_link(response, image.get()) if "list/soft/1.jpg" in item['image']: item['image'] = item['image'].replace("list/soft/1", "poster/soft/1_size1200") item['image_blob'] = self.get_image_blob_from_link(item['image']) - performers = scene.xpath('.//div[@class="details"]/div[@class="tr"]/div[@class="th" and contains(./text(), "Starring")]/following-sibling::div[@class="td"]/a/text()') + performers = scene.xpath('.//div[@class="models"]/a/text()') item['performers'] = [] if performers: item['performers'] = performers.getall() - tags = scene.xpath('.//div[@class="details"]/div[@class="tr"]/div[@class="th" and contains(./text(), "Labels")]/following-sibling::div[@class="td"]/a/text()') + tags = scene.xpath('.//div[@class="tags"]/a/text()') item['tags'] = [] if tags: item['tags'] = list(map(lambda x: string.capwords(x.strip()), tags.getall())) item['trailer'] = '' - trailer = scene.xpath('.//a[contains(@class, "view-trailer")]/@data-video-src') + trailer = scene.xpath('.//div[contains(@class,"action")]/button[contains(@class, "trailer")]/@data-video-src') if trailer: item['trailer'] = self.format_link(response, trailer.get()) item['id'] = scene.xpath('./@id').get() item['network'] = "Life Selector" item['parent'] = "Life Selector" item['site'] = "Life Selector" - item['url'] = self.format_link(response, scene.xpath('./div[@class="thumb"]/a/@href').get()) + item['url'] = self.format_link(response, scene.xpath('./a[1]/@href').get()) yield self.check_item(item, self.days) diff --git a/scenes/siteLucasRaunch.py b/scenes/siteLucasRaunch.py new file mode 100644 index 00000000..1633053f --- /dev/null +++ b/scenes/siteLucasRaunch.py @@ -0,0 +1,61 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteLucasRaunchSpider(BaseSceneScraper): + name = 'LucasRaunch' + network = 'Lucas Entertainment' + + start_urls = [ + # ~ 'https://www.lucasraunch.com', + 'https://www.sexinsuits.com', + ] + + selector_map = { + 'title': '//div[@class="slidercontainer"]//h2/text()', + 'description': '//div[@class="container"]/div[@class="row-fluid"][1]/div[@class="span12"]/p[1]/text()', + 'date': '', + 'image': '//div[contains(@class, "scene-limit-reached")]/img/@src', + 'performers': '//h4/a/text()', + 'tags': '', + 'duration': '', + 'trailer': '', + 'external_id': r'.*/(.*?)$', + 'pagination': '/scenes/page/%s/', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//a[@class="scene-thumbnail"]/..') + for scene in scenes: + scenedate = scene.xpath('.//h6//text()') + if scenedate: + scenedate = scenedate.getall() + scenedate = "".join(scenedate) + scenedate = re.search(r'(\d{2}\.\d{2}\.\d{2})', scenedate).group(1) + meta['date'] = self.parse_date(scenedate, 
date_formats=['%m.%d.%y']).strftime('%Y-%m-%d') + scene = scene.xpath('./a/@href').get() + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_image(self, response): + image = super().get_image(response) + if not image or "content" not in image: + image = response.xpath('//script[contains(text(), "jwplayer") and contains(text(), "image:")]/text()').get() + image = image.replace("\r", "").replace("\n", "").replace("\t", "") + image = re.search(r'jwplayer.*?image:.*?[\'\"](.*?)[\'\"]', image).group(1) + return self.format_link(response, image) + + def get_site(self, response): + if "lucasraunch" in response.url: + return "Lucas Raunch" + if "sexinsuits" in response.url: + return "Sex in Suits" + + def get_parent(self, response): + if "lucasraunch" in response.url: + return "Lucas Raunch" + if "sexinsuits" in response.url: + return "Sex in Suits" diff --git a/scenes/siteLukeHardy.py b/scenes/siteLukeHardy.py new file mode 100644 index 00000000..c258027d --- /dev/null +++ b/scenes/siteLukeHardy.py @@ -0,0 +1,84 @@ +import re +import json +import scrapy +import requests +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteLukeHardySpider(BaseSceneScraper): + name = 'LukeHardy' + network = 'Luke Hardy' + parent = 'Luke Hardy' + site = 'Luke Hardy' + + start_urls = [ + 'https://www.lukehardyxxx.com', + ] + + selector_map = { + 'external_id': r'', + 'pagination': '/army/videos.php?&p=%s', + 'type': 'Scene', + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + with open('datafiles/LukeHardyPerformers.json') as perf_file: + meta['performerlist'] = json.load(perf_file) + for link in self.start_urls: + yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_scenes(self, response): + meta = response.meta + perf_list = meta['performerlist']['scenes'] + scenes = response.xpath('//div[@class="videoThumbBlock"]') + for scene in scenes: + item = SceneItem() + + item['title'] = self.cleanup_title(scene.xpath('./p/a/text()').get()) + + sceneid = scene.xpath('.//img[contains(@src, "content")]/@alt').get() + item['id'] = re.search(r'(\d+)', sceneid).group(1) + + item['description'] = '' + + + + + scenethumb = scene.xpath('.//img[contains(@src, "content")]/@src') + if scenethumb: + scenethumb = scenethumb.get() + item['image'] = f"https://www.lukehardyxxx.com/army/{scenethumb}" + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = '' + item['image_blob'] = '' + + item['trailer'] = "" + + item['date'] = '' + scenedate = scene.xpath('.//div[contains(@class, "videoDate")]/text()') + if scenedate: + scenedate = scenedate.get() + scenedate = re.search(r'(\w+ \d{1,2}, \d{4})', scenedate) + if scenedate: + scenedate = scenedate.group(1) + item['date'] = self.parse_date(scenedate, date_formats=['%b %d, %Y']).strftime('%Y-%m-%d') + + item['url'] = f"https://www.lukehardyxxx.com/army/video-{item['id']}.php" + + item['tags'] = [] + item['duration'] = None + item['site'] = 'Luke Hardy' + item['parent'] = 'Luke Hardy' + item['network'] = 'Luke Hardy' + + item['performers'] = [] + title = item['title'].lower() + for perf in perf_list: + if perf['name'].lower() in title: + item['performers'].append(perf['name']) + + yield self.check_item(item, self.days) diff --git a/scenes/siteLustery.py 
b/scenes/siteLustery.py index e5502233..d12bf257 100644 --- a/scenes/siteLustery.py +++ b/scenes/siteLustery.py @@ -63,7 +63,7 @@ def parse_scene(self, response): item['title'] = video['title'] item['duration'] = video['duration'] item['tags'] = video['tags'] - item['tags'] = item['tags'] = list(map(lambda x: string.capwords(x.replace("-", " ").strip()), item['tags'])) + item['tags'] = list(map(lambda x: string.capwords(x.replace("-", " ").strip()), item['tags'])) item['image'] = f"https://lustery.com/{video['posterFullPath']}" item['image_blob'] = self.get_image_blob_from_link(item['image']) performers = video['coupleName'] diff --git a/scenes/siteMatureFetish.py b/scenes/siteMatureFetish.py new file mode 100644 index 00000000..4e26879b --- /dev/null +++ b/scenes/siteMatureFetish.py @@ -0,0 +1,36 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteMatureFetishSpider(BaseSceneScraper): + name = 'MatureFetish' + network = 'Mature NL' + parent = 'Mature Fetish' + site = 'Mature Fetish' + + start_urls = [ + 'https://maturefetish.com', + ] + + selector_map = { + 'title': '//h1/text()', + 'description': '//h3[contains(text(), "Synopsis")]/following-sibling::text()', + 'date': '//h1/following-sibling::div[1]/div[@class="stats-list"]/div[2]/text()', + 'date_formats': ['%d-%m-%Y'], + 'image': '//video/@poster', + 'performers': '//div[@class="grid-tile-model"]//a[contains(@href, "/model/")]/text()', + 'tags': '//div[contains(@class, "tag-list")]/a/text()', + 'duration': '//div[contains(@style, "max-width")]/following-sibling::div[contains(@class, "stats-list")]/div[1]/text()', + 'trailer': '', + 'external_id': r'.*/(.*?)$', + 'pagination': '/en/content/%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="grid-tile-content"]/div[1]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) diff --git a/scenes/siteMaverickMen.py b/scenes/siteMaverickMen.py new file mode 100644 index 00000000..32450db7 --- /dev/null +++ b/scenes/siteMaverickMen.py @@ -0,0 +1,32 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteMaverickMenSpider(BaseSceneScraper): + name = 'MaverickMen' + network = 'Maverick Men' + parent = 'Maverick Men' + site = 'Maverick Men' + + start_urls = [ + 'https://vod.maverickmen.com', + ] + + selector_map = { + 'title': '//h1[@id="view_title"]/text()', + 'description': '//span[@id="view_description"]//text()', + 'date': '//strong[contains(text(), "Released")]/following-sibling::text()', + 'date_formats': ['%m/%d/%Y'], + 'image': '//div[@class="main_vid"]//img/@src', + 'external_id': r'.*=(.*?)$', + 'pagination': '/?page=videos&p=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="vid-list-thumb"]/a[1]/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) diff --git a/scenes/siteMaverickMenDirects.py b/scenes/siteMaverickMenDirects.py new file mode 100644 index 00000000..5d3d8a14 --- /dev/null +++ b/scenes/siteMaverickMenDirects.py @@ -0,0 +1,39 @@ +import re +import string +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class 
SiteMaverickMenDirectsSpider(BaseSceneScraper): + name = 'MaverickMenDirects' + network = 'Maverick Men' + parent = 'Maverick Men Directs' + site = 'Maverick Men Directs' + + start_urls = [ + 'https://vod.maverickmen.com', + ] + + selector_map = { + 'title': '//div[@class="custom-container"]/div/div/h2/text()', + 'description': '//h5/following-sibling::p//text()', + 'date': '//i[contains(@class, "fa-clock")]/following-sibling::small/text()', + 'date_formats': ['%m/%d/%Y'], + 'image': '//comment()[contains(., "img-responsive")]', + 're_image': r'(http.*?)[\'\"]', + 'external_id': r'.*/(.*?)$', + 'pagination': '/m/r/site/Maverick_Directs/ms/trailers?p=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="videobox2"]/figure/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_title(self, response): + title = super().get_title(response) + title = title.lower().replace("teaser", "").strip() + return string.capwords(title) diff --git a/scenes/siteMilfCandy.py b/scenes/siteMilfCandy.py new file mode 100644 index 00000000..e33e1bed --- /dev/null +++ b/scenes/siteMilfCandy.py @@ -0,0 +1,53 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteMilfCandySpider(BaseSceneScraper): + name = 'MilfCandy' + network = 'Milf Candy' + parent = 'Milf Candy' + site = 'Milf Candy' + + start_urls = [ + 'https://tour.milfcandy.com', + ] + + selector_map = { + 'title': '//div[@class="bodyInnerArea"]/div[1]/div[contains(@class, "title clear")]/h2/text()', + 'description': '//div[@class="description"]/p/text()', + 'date': '//div[@class="info"]/p/text()[contains(., "Added")]', + 're_date': r'(\w+ \d{1,2}, \d{4})', + 'date_formats': ['%B %d, %Y'], + 'image': '//script[contains(text(), "video_content")]/text()', + 're_image': r'poster.*?[\'\"](.*?)[\'\"]', + 'performers': '//div[@class="info"]/p[1]//a/text()', + 'tags': '//ul[@class="tags"]/li/a/text()', + 'duration': '//div[@class="info"]/p/text()[contains(., "Runtime")]', + 're_duration': r'((?:\d{1,2}\:)?\d{2}\:\d{2})', + 'trailer': '//script[contains(text(), "video_content")]/text()', + 're_trailer': r'video src.*?[\'\"](.*?)[\'\"]', + 'external_id': r'', + 'pagination': '/categories/movies_%s_d.html', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="item item-video"]') + for scene in scenes: + sceneid = scene.xpath('.//img/@id').get() + meta['id'] = re.search(r'-(\d+)', sceneid).group(1) + + scene = scene.xpath('./div[1]/a/@href').get() + if meta['id']: + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_image(self, response): + image = super().get_image(response) + if "content" not in image: + image = response.xpath('//div[@class="player-thumb"]//img/@src0_3x|//div[@class="player-thumb"]//img/@src0_2x|//div[@class="player-thumb"]//img/@src0_1x') + if image: + image = image.get() + image = self.format_link(response, image) + return image diff --git a/scenes/siteMilkyPeru.py b/scenes/siteMilkyPeru.py new file mode 100644 index 00000000..03526500 --- /dev/null +++ b/scenes/siteMilkyPeru.py @@ -0,0 +1,104 @@ +import re +import string +import html +import json +import requests +import unidecode +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper 
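+# MilkyPeru publishes through stock WordPress, so the spider below reads the JSON REST API rather than scraping rendered HTML: +# /wp-json/wp/v2/posts lists the scenes, and /wp-json/wp/v2/tags resolves the numeric tag ids embedded in each post to names.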
+from tpdb.items import SceneItem + + +class SiteMilkyPeruSpider(BaseSceneScraper): + name = 'MilkyPeru' + + custom_settings = {'CONCURRENT_REQUESTS': '1', + 'AUTOTHROTTLE_ENABLED': 'True', + 'AUTOTHROTTLE_DEBUG': 'False', + 'DOWNLOAD_DELAY': '2', + 'CONCURRENT_REQUESTS_PER_DOMAIN': '1', + } + + start_urls = [ + 'https://milkyperu.com', + ] + + selector_map = { + 'title': '', + 'description': '', + 'date': '', + 'image': '', + 'performers': '', + 'tags': '', + 'trailer': '', + 'external_id': r'', + 'pagination': '/index.php/wp-json/wp/v2/posts?page=%s&per_page=20' + } + + def start_requests(self): + tagdata = [] + for i in range(1, 10): + req = requests.get(f'https://milkyperu.com/index.php/wp-json/wp/v2/tags?per_page=100&page={str(i)}') + if req and len(req.text) > 5: + tagtemp = [] + tagtemp = json.loads(req.text) + tagdata = tagdata + tagtemp + else: + break + + for link in self.start_urls: + yield scrapy.Request(url=self.get_next_page_url(link, self.page), + callback=self.parse, + meta={'page': self.page, 'tagdata': tagdata}, + headers=self.headers, + cookies=self.cookies) + + def get_scenes(self, response): + meta = response.meta + jsondata = json.loads(response.text) + for scene in jsondata: + item = SceneItem() + + desc_block = scene['excerpt']['rendered'] + desc_block = html.unescape(desc_block) + desc_block = desc_block.replace("\\", "") + + image = scene["yoast_head_json"]['og_image'][0]['url'] + if image: + item['image'] = image + item['image_blob'] = self.get_image_blob_from_link(item['image']) + else: + item['image'] = "" + item['image_blob'] = "" + + item['description'] = unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', scene['excerpt']['rendered'])).strip()) + + item['trailer'] = "" + if re.search(r'(http.*?\.mp4) ', item['description']): + item['trailer'] = re.search(r'(http.*?\.mp4) ', item['description']).group(1) + item['description'] = re.search(r'http.*?\.mp4 (.*)', item['description']).group(1) + + item['id'] = str(scene['id']) + item['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', scene['date']).group(1) + item['title'] = unidecode.unidecode(html.unescape(re.sub('<[^<]+?>', '', scene['title']['rendered'])).strip()) + item['performers'] = [] + item['tags'] = [] + for model_id in scene['tags']: + for tag in meta['tagdata']: + if tag['id'] == model_id: + item['tags'].append(tag['name']) + + matches = ['Porn Movies', 'Baandidas', 'Best Porn', 'Flirt With', 'Hd Peru Videos', 'Hot Peruvian', 'Hottest Latinas', 'Milky', 'Peru', 'Porn'] + # rebuild the tag list in one pass; calling remove() while iterating the same list skips entries + item['tags'] = [tag for tag in item['tags'] if not any(match.lower() in tag.lower() for match in matches)] + + item['tags'] = list(map(lambda x: string.capwords(x.strip()), item['tags'])) + + item['site'] = 'Milky Peru' + item['parent'] = 'Milky Peru' + item['network'] = 'Milky Peru' + item['url'] = scene['link'] + + yield self.check_item(item, self.days) diff --git a/scenes/siteMongerInAsia.py b/scenes/siteMongerInAsia.py index 355bae6f..ef33f52c 100644 --- a/scenes/siteMongerInAsia.py +++ b/scenes/siteMongerInAsia.py @@ -1,6 +1,6 @@ import re import scrapy - +from tpdb.items import SceneItem from tpdb.BaseSceneScraper import BaseSceneScraper @@ -10,33 +10,74 @@ class SiteMongerInAsiaSpider(BaseSceneScraper): parent = 'Monger In Asia' site = 'Monger In Asia' - start_urls = [ - 'https://mongerinasia.com', - ] + start_url = 'https://mongerinasia.com' selector_map = { - 'title': '//div[@class="scene-title-wrap"]/h1/text()', - 'description': '//div[contains(@class,"description_content")]/text()', - 'date': '', - 'image': 
'//video/@poster', - 'performers': '//div[@class="div-model-info-in-desc"]//h2/text()', - 'tags': '', 'external_id': r'trailers/(.*)', - 'trailer': '//video/source/@src', - 'pagination': '/categories/monger-in-asia_%s_d' + 'pagination': '/_next/data/<buildID>/videos.json?page=%s&order_by=publish_date&sort_by=desc', } - def get_scenes(self, response): + + def start_requests(self): meta = {} - scenes = response.xpath('//div[contains(@class,"videoBlock")]') + meta['page'] = self.page + yield scrapy.Request('https://mongerinasia.com/', callback=self.start_requests_2, meta=meta, headers=self.headers, cookies=self.cookies) + + def start_requests_2(self, response): + meta = response.meta + buildId = re.search(r'\"buildId\":\"(.*?)\"', response.text) + if buildId: + meta['buildID'] = buildId.group(1) + link = self.get_next_page_url(self.start_url, self.page, meta['buildID']) + yield scrapy.Request(link, callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def parse(self, response, **kwargs): + scenes = self.get_scenes(response) + count = 0 for scene in scenes: - date = scene.xpath('./div[@class="scene-icons"]//img[contains(@class,"calendar")]/following-sibling::span/text()') - if date: - meta['date'] = self.parse_date(date.get()).isoformat() - tag = scene.xpath('.//a[@class="site_link"]/span/text()') - if tag: - meta['tags'] = [tag.get().strip()] - - scene = scene.xpath('./div/a/@href').get() - if re.search(self.get_selector_map('external_id'), scene): - yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + count += 1 + yield scene + + if count: + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta = response.meta + meta['page'] = meta['page'] + 1 + print('NEXT PAGE: ' + str(meta['page'])) + yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['buildID']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) + + def get_next_page_url(self, base, page, buildID): + pagination = self.get_selector_map('pagination') + # drop the scraped Next.js build id into the <buildID> slot of the /_next/data/ path + pagination = pagination.replace("<buildID>", buildID) + return self.format_url(base, pagination % page) + + def get_scenes(self, response): + jsondata = response.json() + jsondata = jsondata['pageProps']['contents']['data'] + for scene in jsondata: + item = SceneItem() + item['title'] = self.cleanup_title(scene['title']) + item['id'] = scene['slug'] + item['description'] = self.cleanup_description(re.sub('<[^<]+?>', '', scene['description'])) + item['image'] = self.format_link(response, scene['poster_url']).replace(" ", "%20") + item['image_blob'] = self.get_image_blob_from_link(item['image']) + if scene['trailer_url']: + item['trailer'] = self.format_link(response, scene['trailer_url']).replace(" ", "%20") + else: + item['trailer'] = "" + scene_date = self.parse_date(scene['publish_date'], date_formats=['%Y/%m/%d %H:%M:%S']).strftime('%Y-%m-%d') + item['date'] = "" + if scene_date: + item['date'] = scene_date + item['url'] = f"https://mongerinasia.com/trailers/{scene['slug']}" + item['tags'] = [] + if "tags" in scene: + item['tags'] = ["Asian"] + item['duration'] = self.duration_to_seconds(scene['videos_duration']) + item['site'] = 'Monger In Asia' + item['parent'] = 'Monger In Asia' + item['network'] = 'Monger In Asia' + item['performers'] = [] + for model in scene['models_slugs']: + item['performers'].append(model['name']) + + yield self.check_item(item, self.days) diff --git a/scenes/siteMy18Teens.py b/scenes/siteMy18Teens.py index 3adb23aa..294e11e6 
100644 --- a/scenes/siteMy18Teens.py +++ b/scenes/siteMy18Teens.py @@ -27,6 +27,7 @@ class SiteMy18TeensSpider(BaseSceneScraper): 'external_id': r'.*/(.*?)$', 'trailer': '', 'pagination': '/new?page=%s' + # ~ 'pagination': '/all?page=%s' } def get_scenes(self, response): @@ -35,12 +36,15 @@ def get_scenes(self, response): item = SceneItem() titledate = scene.xpath('./div[contains(@class, "video-preview__data")]/p[contains(@class, "title")]/text()') + item['date'] = "" if titledate: titledate = titledate.get() - if re.search(r'(\d{2}\.\d{2}\.\d{4})', titledate): - item['date'] = self.parse_date(re.search(r'(\d{2}\.\d{2}\.\d{4})', titledate).group(1), date_formats=['%d.%m.%Y']).isoformat() - else: - item['date'] = self.parse_date('today').isoformat() + scenedate = re.search(r'(\d{2}\.\d{2}\.\d{4})', titledate) + if scenedate: + scenedate = scenedate.group(1) + scenedate = self.parse_date(scenedate, date_formats=['%d.%m.%Y']) + if scenedate: + item['date'] = scenedate.strftime('%Y-%m-%d') item['title'] = string.capwords(titledate) else: item['title'] = '' diff --git a/scenes/siteMyPOVFam.py b/scenes/siteMyPOVFam.py new file mode 100644 index 00000000..3e1ec77f --- /dev/null +++ b/scenes/siteMyPOVFam.py @@ -0,0 +1,51 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteMyPOVFamSpider(BaseSceneScraper): + name = 'MyPOVFam' + network = 'My POV Fam' + parent = 'My POV Fam' + site = 'My POV Fam' + + start_urls = [ + 'https://www.mypovfam.com', + ] + + selector_map = { + 'title': '//div[@class="video-details"]/div[1]/h1/text()', + 'description': '//div[@class="video-details"]//p/text()', + 'image': '//video/@poster', + 'performers': '//div[@class="video-details"]//span[@class="meta"]//a/text()', + 'external_id': r'', + 'trailer': '//video//source/@src', + 'pagination': '/videos/page/%s/', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//article[contains(@class, "type-video")]') + for scene in scenes: + sceneid = scene.xpath('.//@class').get() + if "post-" in sceneid: + meta['id'] = re.search(r'post-(\d+)', sceneid).group(1) + + duration = scene.xpath('.//strong[contains(text(), "Length:")]/following-sibling::text()[1]') + if duration: + duration = duration.get() + meta['duration'] = self.duration_to_seconds(duration) + + images = scene.xpath('.//img/@srcset') + if images: + images = images.get() + images = images.split(",") + image = images[-1] + meta['image'] = re.search(r'(.*) ', image).group(1).strip() + meta['image_blob'] = self.get_image_blob_from_link(meta['image']) + + scene = scene.xpath('./div[1]/a[1]/@href').get() + + if meta['id']: + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) diff --git a/scenes/siteOnlyBBC.py b/scenes/siteOnlyBBC.py new file mode 100644 index 00000000..7024dd35 --- /dev/null +++ b/scenes/siteOnlyBBC.py @@ -0,0 +1,34 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteOnlyBBCSpider(BaseSceneScraper): + name = 'OnlyBBC' + site = 'Only BBC' + parent = 'Only BBC' + network = 'Only BBC' + + start_urls = [ + 'https://www.onlybbc.com', + ] + + selector_map = { + 'title': '//div[@class="update_block_info"]/span[contains(@class, "update_title")]/text()', + 'description': '//div[@class="update_block_info"]/span[contains(@class, "update_description")]/text()', + 'date': '//div[@class="update_block_info"]/span[contains(@class, "availdate")]/text()', + 'date_formats': ['%m/%d/%Y'], 
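+ # prefer the og:image meta tag, falling back to twitter:image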
+ 'image': '//meta[@property="og:image"]/@content|//meta[@name="twitter:image"]/@content', + 'performers': '//div[@class="update_block_info"]/span[contains(@class, "update_models")]/a/text()', + 'tags': '//div[@class="update_block_info"]/span[contains(@class, "update_tags")]/a/text()', + 'external_id': r'.*/(.*?)\.htm', + 'pagination': '/tour/categories/movies_%s_d.html', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[@class="updateItem"]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) diff --git a/scenes/sitePervect.py b/scenes/sitePervect.py new file mode 100644 index 00000000..08649a1c --- /dev/null +++ b/scenes/sitePervect.py @@ -0,0 +1,48 @@ +import re +import string +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SitePervectSpider(BaseSceneScraper): + name = 'Pervect' + site = 'Pervect' + + start_urls = [ + 'https://pervect.com', + ] + + selector_map = { + 'title': '//div[@class="container"]/h1/text()', + 'description': '//div[@class="container"]//div[contains(@class, "player-text")]//text()', + 'date': '//meta[@property="video:release_date"]/@content', + 're_date': r'(\d{4}-\d{2}-\d{2})', + 'image': '//meta[@property="og:image"]/@content', + 'performers': '//span[contains(text(), "Starring:")]/following-sibling::a/text()', + 'tags': '//ul[contains(@class,"player-tag-list")]/li/a/text()', + 'duration': '//meta[@property="video:duration"]/@content', + 'trailer': '//script[contains(text(), "contentUrl")]/text()', + 're_trailer': r'contentUrl.*?(http.*?\.mp4)', + 'external_id': r'.*/(.*?)/', + 'pagination': '/scenes/?mode=async&function=get_block&block_id=list_videos_latest_videos_list&sort_by=post_date&from=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@class,"card-item")]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_tags(self, response): + tags = super().get_tags(response) + tags = list(map(lambda x: x.lower(), tags)) + changetags = [['ass2mouth', 'ATM'], ['bigbooty', 'Big Butt'], ['bigdildo', 'Dildo'], ['bigtits', 'Big Boobs'], ['sextoys', 'Toys']] + for tag in changetags: + if tag[0] in tags: + tags.remove(tag[0]) + tags.append(tag[1]) + tags = list(map(lambda x: string.capwords(x), tags)) + return tags + diff --git a/scenes/sitePlantsVsCunts.py b/scenes/sitePlantsVsCunts.py new file mode 100644 index 00000000..ed720639 --- /dev/null +++ b/scenes/sitePlantsVsCunts.py @@ -0,0 +1,45 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SitePlantsVsCuntsSpider(BaseSceneScraper): + name = 'PlantsVsCunts' + network = 'Hentaied' + parent = 'Plants vs Cunts' + site = 'Plants vs Cunts' + + start_urls = [ + 'https://plantsvscunts.com', + ] + + selector_map = { + 'title': '//h1/text()', + 'description': '//div[@id="fullstory"]/p/span/text()[not(contains(., "Read Less"))]', + 'date': '//meta[@property="article:published_time"]/@content', + 'image': '//meta[@property="og:image"]/@content', + 'performers': '//img[contains(@alt, "model icon")]/following-sibling::div[@class="taglist"]/a/text()', + 'director': '//img[contains(@alt, "director 
icon")]/following-sibling::span/a/text()', + 'tags': '//ul[contains(@class,"post-categories")]/li/a/text()', + 'duration': '//div[@class="duration"]/text()', + 'trailer': '//video[@id="singlepreview"]/@src', + 'external_id': r'com/(.*)/', + 'pagination': '/all-videos/%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + scenes = response.xpath('//center[@class="vidcont"]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) + + def get_duration(self, response): + duration = response.xpath('//div[@class="duration"]/text()') + if duration: + duration = duration.getall() + duration = "".join(duration) + duration = duration.strip() + if ":" in duration: + return self.duration_to_seconds(duration) + return None diff --git a/scenes/sitePornPlus.py b/scenes/sitePornPlus.py index d98628ec..abf6e61b 100644 --- a/scenes/sitePornPlus.py +++ b/scenes/sitePornPlus.py @@ -21,6 +21,7 @@ class NetworkPornPlusSpider(BaseSceneScraper): '/series/glory-hole-4k', '/series/kinky-sluts-4k', '/series/momcum', + '/series/pornstars-in-cars', '/series/property-exploits', '/series/rv-adventures', '/series/school-of-cock', diff --git a/scenes/sitePrivateClassicsMovies.py b/scenes/sitePrivateClassicsMovies.py deleted file mode 100644 index ab1811dc..00000000 --- a/scenes/sitePrivateClassicsMovies.py +++ /dev/null @@ -1,154 +0,0 @@ -import re -import string -import scrapy - -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class PrivateClassicsMoviesSpider(BaseSceneScraper): - name = 'PrivateClassicsMovies' - network = "Private" - - start_urls = [ - 'https://www.privateclassics.com', - ] - - selector_map = { - 'external_id': '\\/(\\d+)$', - 'pagination': '/en/movies/%s/' - } - - def parse(self, response, **kwargs): - meta = response.meta - movies = self.get_movies(response) - count = 0 - for movie in movies: - count += 1 - meta['movie'] = movie - yield movie - if count: - if 'page' in response.meta and response.meta['page'] < self.limit_pages: - meta['page'] = meta['page'] + 1 - print('NEXT PAGE: ' + str(meta['page'])) - yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - - def get_movies(self, response): - meta = response.meta - movies = response.xpath('//article[contains(@class, "video")]') - for movie in movies: - imagealt = movie.xpath('./figure/a/img/@data-src') - if imagealt: - meta['imagealt'] = imagealt.get() - movie = movie.xpath('./figure/a/@href').get() - movieurl = self.format_link(response, movie) - yield scrapy.Request(movieurl, callback=self.parse_movie, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_movie(self, response): - meta = response.meta - scenes = response.xpath('//h3/a[contains(text(), "Scenes")]/@href').get() - if len(scenes) > 1: - item = SceneItem() - item['title'] = self.cleanup_title(response.xpath('//div[@class="container"]/div[contains(@class, "content-text")]//h1/text()').get().strip()) - scenedate = response.xpath('//div[@class="container"]/div[contains(@class, "content-text")]//p[contains(@class, "release")]//text()').getall() - scenedate = "".join(scenedate) - scenedate = re.sub('[^a-zA-Z0-9-/]', '', scenedate) - scenedate = re.search(r'(\d{1,2}/\d{1,2}/\d{4})', scenedate) - if scenedate: - scenedate = scenedate.group(1) - item['date'] = self.parse_date(scenedate, 
date_formats=['%m/%d/%Y']).isoformat() - else: - item['date'] = '' - description = response.xpath('//div[@class="container"]//p[contains(@class, "sinopsys")]/text()') - if description: - item['description'] = description.get().strip() - else: - item['description'] = "" - item['image'] = response.xpath('//div[@class="content-cover"]/img/@src').get() - if not item['image']: - item['image'] = meta['imagealt'] - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = '' - director = response.xpath('//div[@class="container"]//h3[contains(@class, "directe")]/text()') - if director: - director = director.get() - if ":" in director: - director = re.search(r':(.*)', director).group(1) - if director: - item['director'] = director.strip() - else: - item['director'] = '' - item['performers'] = response.xpath('//div[@class="container"]/div[contains(@class, "content-text")]//a[contains(@href, "/pornstar")]/text()').getall() - item['performers'] = list(map(lambda x: x.strip(), item['performers'])) - item['performers'] = list(filter(None, item['performers'])) - duration = response.xpath('//em[contains(text(), "Duration")]/following-sibling::text()') - item['duration'] = None - durations = response.xpath('//span[@class="scene-length-start"]/following-sibling::text()').getall() - if durations: - duration = 0 - for entry in durations: - minutes = re.search(r'(\d+) [Mm]in', entry) - if minutes: - duration = duration + int(minutes.group(1)) - if duration: - item['duration'] = str(duration * 60) - - item['tags'] = [] - item['trailer'] = '' - item['type'] = 'Movie' - item['network'] = 'Private' - item['parent'] = 'Private' - item['site'] = 'Private' - item['url'] = response.url - item['id'] = re.search(r'.*/(\d+)/', response.url).group(1) - scenes = response.xpath('//article[contains(@class, "content video")]') - scenelist = [] - for scene in scenes: - sceneurl = scene.xpath('.//figure/a/@href').get() - sceneduration = scene.xpath('.//span[@class="scene-length-start"]/following-sibling::text()[contains(., "min")]') - if sceneduration: - sceneduration = re.sub('[^a-zA-Z0-9-/]', '', sceneduration.get()) - sceneduration = re.search(r'(\d+)min', sceneduration).group(1) - sceneduration = str(int(sceneduration) * 60) - scenelist.append({'url': sceneurl, 'duration': sceneduration}) - item['scenes'] = [] - for sceneurl in scenelist: - extern_id = re.search(r'.*/(\d+)', sceneurl['url']).group(1) - item['scenes'].append({'site': "Private Classics", 'external_id': extern_id}) - meta['movie'] = item - yield item - for sceneurl in scenelist: - meta['duration'] = sceneurl['duration'] - yield scrapy.Request(self.format_link(response, sceneurl['url']), callback=self.parse_scene, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_scene(self, response): - meta = response.meta - movie = meta['movie'] - item = SceneItem() - item['title'] = self.cleanup_title(response.xpath('//div[contains(@class, "user-tools")]/preceding-sibling::h1/text()').get().strip()) - item['date'] = movie['date'] - description = response.xpath('//div[contains(@class, "user-tools")]/preceding-sibling::p/text()') - if description: - item['description'] = description.get().strip() - else: - item['description'] = "" - - item['image'] = '' - item['image'] = response.xpath('//meta[@itemprop="thumbnailUrl"]/@content|//meta[@property="og:image"]/@content|//div[@id="video_player_finished"]/img/@src').get() - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = '' - 
item['director'] = movie['director'] - item['performers'] = response.xpath('//div[contains(@class, "user-tools")]/..//a[contains(@href, "/pornstar")]/text()').getall() - item['performers'] = list(map(lambda x: string.capwords(x.strip()), item['performers'])) - item['performers'] = list(filter(None, item['performers'])) - item['tags'] = [] - item['trailer'] = '' - item['duration'] = meta['duration'] - item['type'] = 'Scene' - item['movies'] = {'site': movie['site'], 'external_id': movie['id']} - item['network'] = 'Private' - item['parent'] = "Private Classics" - item['site'] = "Private Classics" - item['url'] = response.url - item['id'] = re.search(r'.*/(\d+)', response.url).group(1) - yield item diff --git a/scenes/sitePrivateClassicsScenes.py b/scenes/sitePrivateClassicsScenes.py index 8f609ed2..5b855f65 100644 --- a/scenes/sitePrivateClassicsScenes.py +++ b/scenes/sitePrivateClassicsScenes.py @@ -135,7 +135,7 @@ def parse_scene(self, response): item['trailer'] = '' item['duration'] = meta['duration'] item['type'] = 'Scene' - item['movie'] = {'site': movie['site'], 'external_id': movie['id']} + item['movie'] = [{'site': movie['site'], 'external_id': movie['id']}] item['network'] = 'Private' item['parent'] = "Private Classics" item['site'] = "Private Classics" diff --git a/scenes/sitePrivateMovies.py b/scenes/sitePrivateMovies.py deleted file mode 100644 index d8c214fe..00000000 --- a/scenes/sitePrivateMovies.py +++ /dev/null @@ -1,144 +0,0 @@ -import re -import string -import scrapy - -from tpdb.BaseSceneScraper import BaseSceneScraper -from tpdb.items import SceneItem - - -class PrivateMoviesSpider(BaseSceneScraper): - name = 'PrivateMovies' - network = "Private" - - start_urls = [ - 'https://www.private.com', - ] - - selector_map = { - 'external_id': '\\/(\\d+)$', - 'pagination': '/movies/%s/' - } - - def parse(self, response, **kwargs): - meta = response.meta - movies = self.get_movies(response) - count = 0 - for movie in movies: - count += 1 - meta['movie'] = movie - yield movie - if count: - if 'page' in response.meta and response.meta['page'] < self.limit_pages: - meta['page'] = meta['page'] + 1 - print('NEXT PAGE: ' + str(meta['page'])) - yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) - - def get_movies(self, response): - meta = response.meta - movies = response.xpath('//div[@class="film"]/a/@href').getall() - for movie in movies: - movieurl = self.format_link(response, movie) - yield scrapy.Request(movieurl, callback=self.parse_movie, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_movie(self, response): - meta = response.meta - scenes = response.xpath('//div[@class="scene"]//h3/a/text()').get() - if len(scenes) > 1: - item = SceneItem() - item['title'] = self.cleanup_title(response.xpath('//meta[@itemprop="name"]/@content|//h1[@itemprop="name"]/text()').get().strip()) - scenedate = response.xpath('//meta[@itemprop="uploadDate"]/@content|//em[contains(text(), "Release date")]/following-sibling::span[1]/text()').get() - item['date'] = self.parse_date(scenedate, date_formats=['%m/%d/%Y']).isoformat() - description = response.xpath('//meta[@itemprop="description"]/@content|//meta[@property="og:description"]/@content') - if description: - item['description'] = self.cleanup_description(description.get()) - else: - item['description'] = '' - item['image'] = response.xpath('//div[contains(@class, "dvds-photo")]/a/picture/source[1]/@srcset').get() - if 
re.search(r'( \d+w)', item['image']): - item['image'] = re.sub(r'( \d+w)', '', item['image']) - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = '' - director = response.xpath('//p[@class="director"]/span/text()') - if director: - director = director.get() - if director: - item['director'] = director - else: - item['director'] = '' - item['performers'] = response.xpath('//p[@class="dvd-performers"]/span/a/span/text()').getall() - item['performers'] = list(map(lambda x: x.strip(), item['performers'])) - duration = response.xpath('//em[contains(text(), "Duration")]/following-sibling::text()') - item['duration'] = None - if duration: - duration = duration.get().lower() - duration = re.search(r'(\d+) [Mm]in', duration) - if duration: - duration = duration.group(1) - item['duration'] = str(int(duration) * 60) - item['tags'] = [] - item['trailer'] = '' - item['type'] = 'Movie' - item['network'] = 'Private' - item['parent'] = 'Private' - item['site'] = 'Private' - item['url'] = response.url - item['id'] = re.search(r'movie/(\d+).*', response.url).group(1) - sceneurls = response.xpath('//div[@class="scene"]//h3/a/@href').getall() - item['scenes'] = [] - for sceneurl in sceneurls: - item['scenes'].append({'site': item['site'], 'external_id': re.search(r'.*/(\d+)', sceneurl).group(1)}) - meta['movie'] = item - yield item - for sceneurl in sceneurls: - yield scrapy.Request(self.format_link(response, sceneurl), callback=self.parse_scene, meta=meta, headers=self.headers, cookies=self.cookies) - - def parse_scene(self, response): - meta = response.meta - movie = meta['movie'] - item = SceneItem() - item['title'] = self.cleanup_title(response.xpath('//meta[@itemprop="name"]/@content').get().strip()) - scenedate = response.xpath('//meta[@itemprop="uploadDate"]/@content') - item['date'] = "" - if scenedate: - scenedate = scenedate.get() - item['date'] = self.parse_date(scenedate, date_formats=['%m/%d/%Y']).isoformat() - item['description'] = response.xpath('//meta[@itemprop="description"]/@content').get().strip() - item['image'] = '' - item['image'] = response.xpath('//meta[@itemprop="thumbnailUrl"]/@content').get() - item['image_blob'] = self.get_image_blob_from_link(item['image']) - # ~ item['image_blob'] = '' - item['director'] = movie['director'] - item['performers'] = response.xpath('//ul[@class="scene-models-list"]/li/a[@data-track="PORNSTAR_NAME"]/text()|//ul[@class="scene-models-list-tags-sites"]/li[contains(@class, "tag-models")]/a/text()').getall() - item['performers'] = list(map(lambda x: string.capwords(x.strip()), item['performers'])) - item['tags'] = response.xpath('//ul[contains(@class,"scene-tags")]/li/a/text()').getall() - item['tags'] = list(map(lambda x: string.capwords(x.strip()), item['tags'])) - item['trailer'] = '' - item['type'] = 'Scene' - item['movies'] = {'site': movie['site'], 'external_id': movie['id']} - item['network'] = 'Private' - item['parent'] = self.get_site(response) - item['site'] = self.get_site(response) - item['url'] = response.url - item['id'] = re.search(r'.*/(\d+)', response.url).group(1) - yield item - - def get_image(self, image): - trash = '_' + image.split('_', 3)[-1].rsplit('.', 1)[0] - image = image.replace(trash, '', 1) - return image - - def get_site(self, response): - site = response.xpath('//span[@class="title-site"]/text()').get() - if site: - return site.strip() - elif "privateblack" in response.url: - return "Private Black" - return "Private" - - def get_parent(self, response): - site = 
response.xpath('//span[@class="title-site"]/text()').get() - if site: - return site.strip() - elif "privateblack" in response.url: - return "Private Black" - return "Private" diff --git a/scenes/sitePutaLocura.py b/scenes/sitePutaLocura.py index a9d9f4ec..d472b52b 100644 --- a/scenes/sitePutaLocura.py +++ b/scenes/sitePutaLocura.py @@ -34,17 +34,28 @@ def get_scenes(self, response): if re.search(self.get_selector_map('external_id'), scene): yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) - def get_title(self, response): - title = self.process_xpath( - response, self.get_selector_map('title')).get() - if "|" in title: - title = re.search(r'(.*)\|', title).group(1) - if title: - return self.cleanup_title(title) - return '' - def get_performers(self, response): return [] def get_tags(self, response): return ["Spanish"] + + def get_title(self, response): + title = response.xpath('//title/text()').get() + if title and "|" in title: + title = re.search(r'(.*?)\|', title).group(1) + if not title: + title = self.process_xpath(response, self.get_selector_map('title')).get() + if title and "|" in title: + title = re.search(r'(.*)\|', title).group(1) + if not title: + return '' + + title = title.strip() + # drop a single leading punctuation mark, including the inverted Spanish forms + if title and title[0] in ("!", "?", "¡", "¿"): + title = title[1:] + + if title: + return self.cleanup_title(title) + return '' diff --git a/scenes/siteQueensnake.py b/scenes/siteQueensnake.py index 8d6cd966..10d4a39a 100644 --- a/scenes/siteQueensnake.py +++ b/scenes/siteQueensnake.py @@ -11,10 +11,10 @@ class SiteQueensnakeSpider(BaseSceneScraper): site = 'Queensnake' start_urls = [ + 'https://queensect.com', 'https://queensnake.com', ] - cookies = { 'cLegalAge': 'true', 'cCookieConsent': 'true', @@ -57,11 +57,20 @@ def get_scenes(self, response): item['tags'] = self.get_tags(scene) item['duration'] = self.get_duration(scene) item['trailer'] = "" - item['site'] = "Queensnake" - item['parent'] = "Queensnake" - item['network'] = "Queensnake" + if "queensnake" in response.url: + item['site'] = "Queensnake" + item['parent'] = "Queensnake" + item['network'] = "Queensnake" + if "queensect" in response.url: + item['site'] = "Queensect" + item['parent'] = "Queensect" + item['network'] = "Queensnake" yield self.check_item(item, self.days) + def get_next_page_url(self, base, page): + page = str(int(page) - 1) + return self.format_url(base, self.get_selector_map('pagination') % page) + def get_performers(self, scene): performers = scene.xpath('.//div[@class="contentPreviewTags"]/a/text()').getall() performers2 = [] diff --git a/scenes/siteRealityLovers.py b/scenes/siteRealityLovers.py new file mode 100644 index 00000000..a6881a63 --- /dev/null +++ b/scenes/siteRealityLovers.py @@ -0,0 +1,135 @@ +import re +import json +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteRealityLoversSpider(BaseSceneScraper): + name = 'RealityLovers' + network = 'Reality Lovers' + parent = 'Reality Lovers' + site = 'Reality Lovers' + + start_urls = [ + 'https://engine.realitylovers.com', + ] + + selector_map = { + 'external_id': r'', + 'pagination': '/content/videos?max=12&page=%s&pornstar=&category=&perspective=&sort=NEWEST', + 'type': 'Scene', + } + + custom_scraper_settings = { + 'RETRY_ENABLED': True, + 'RETRY_TIMES': 3, + 'RETRY_HTTP_CODES': [307, 404], + 'HANDLE_HTTPSTATUS_LIST': [307, 404], + "HTTPCACHE_ENABLED": False, + 'DOWNLOADER_MIDDLEWARES': { + 'tpdb.middlewares.TpdbSceneDownloaderMiddleware': 
543, + 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, + 'scrapy.downloadermiddlewares.retry.RetryMiddleware': None, + 'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400, + 'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401, + }, + } + + def start_requests(self): + meta = {} + meta['page'] = self.page + meta['reg_pagination'] = "https://realitylovers.com/videos/page%s" + meta['json_pagination'] = f"https://engine.realitylovers.com/content/videos?max=12&page=%s&pornstar=&category=&perspective=&sort=NEWEST" + + link = "https://realitylovers.com/" + yield scrapy.Request(link, callback=self.start_requests_primed, meta=meta) + + def start_requests_primed(self, response): + meta = response.meta + for link in self.start_urls: + meta['link'] = link + yield scrapy.Request(url=self.get_next_page_url(link, self.page, meta['reg_pagination']), callback=self.start_requests_2, meta=meta) + + def start_requests_2(self, response): + meta = response.meta + yield scrapy.Request(url=self.get_next_page_url(meta['link'], self.page, meta['json_pagination']), callback=self.parse, meta=meta) + + + def parse(self, response, **kwargs): + scenes = self.get_scenes(response) + count = 0 + for scene in scenes: + count += 1 + yield scene + + if count: + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta = response.meta + meta['page'] = meta['page'] + 1 + url = self.get_next_page_url(response.url, meta['page'], meta['reg_pagination']) + print('NEXT PAGE: ' + str(meta['page']) + f" Url: {url}") + yield scrapy.Request(url, callback=self.mid_index, meta=meta) + + def mid_index(self, response): + meta = response.meta + yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['json_pagination']), callback=self.parse, meta=meta) + + def get_next_page_url(self, base, page, pagination): + return self.format_url(base, pagination % page) + + def get_scenes(self, response): + meta = response.meta + scenes = json.loads(response.text) + for scene in scenes['contents']: + sceneid = scene['id'] + if sceneid: + meta['link'] = f"https://engine.realitylovers.com/content/videoDetail?contentId={sceneid}" + link = f"https://realitylovers.com/{scene['videoUri']}" + yield scrapy.Request(link, callback=self.mid_scene, meta=meta) + + def mid_scene(self, response): + meta = response.meta + link = meta['link'] + yield scrapy.Request(link, callback=self.parse_scene, meta=meta) + + def parse_scene(self, response): + scene = json.loads(response.text) + item = SceneItem() + + item['title'] = self.cleanup_title(scene['title']) + item['description'] = re.sub(r'<[^<]+?>', '', self.cleanup_description(scene['description'])) + item['date'] = scene['releaseDate'] + item['performers'] = [] + if "starring" in scene: + for performer in scene['starring']: + item['performers'].append(self.cleanup_title(performer['name'])) + + item['tags'] = [] + if "categories" in scene: + for tag in scene['categories']: + item['tags'].append(self.cleanup_title(tag['name'])) + + item['url'] = f"https://realitylovers.com/{scene['canonicalUri']}" + item['id'] = scene['contentId'] + image = '' + if 'mainImages' in scene: + if len(scene['mainImages']): + image = scene['mainImages'][0]['imgSrcSet'] + image = re.search(r'(.*?) 
', image).group(1) + + if image: + item['image'] = image + item['image_blob'] = self.get_image_blob_from_link(image) + else: + item['image'] = '' + item['image_blob'] = '' + + item['type'] = 'Scene' + if 'trailerUrl' in scene: + item['trailer'] = scene['trailerUrl'] + item['site'] = "Reality Lovers" + item['parent'] = "Reality Lovers" + item['network'] = "Reality Lovers" + + yield self.check_item(item, self.days) diff --git a/scenes/siteReflectiveDesire.py b/scenes/siteReflectiveDesire.py index 51282779..1546ba15 100644 --- a/scenes/siteReflectiveDesire.py +++ b/scenes/siteReflectiveDesire.py @@ -11,43 +11,40 @@ class ReflectiveDesireSpider(BaseSceneScraper): site = 'Reflective Desire' start_urls = [ - 'https://reflectivedesire.com/.com/', - ] - - scene_urls = [ - 'https://reflectivedesire.com/videos/categories/scenes/?sort=chrono', - 'https://reflectivedesire.com/videos/categories/shorts/?sort=chrono' + 'https://reflectivedesire.com/videos/pain/?sort=chrono', + 'https://reflectivedesire.com/videos/pleasure/?sort=chrono', + 'https://reflectivedesire.com/videos/solos/?sort=chrono', + 'https://reflectivedesire.com/videos/devices/?sort=chrono', + 'https://reflectivedesire.com/videos/extras/?sort=chrono', ] def start_requests(self): - for link in self.scene_urls: - yield scrapy.Request(link, - callback=self.get_scenes, - meta={'page': self.page}, - headers=self.headers, - cookies=self.cookies) + for link in self.start_urls: + yield scrapy.Request(link, callback=self.get_scenes, meta={'page': self.page}, headers=self.headers, cookies=self.cookies) selector_map = { - 'title': '//meta[@property="og:title"]/@content', + 'title': '//h1/text()', 'description': '//meta[@name="description"]/@content', 'date': '//meta[@name="description"]/@content', 're_date': r'Posted ([a-zA-Z]*? 
\d{4})', 'image': '//meta[@property="og:image"]/@content', - 'performers': '//h2[@class="subhead" and contains(text(), "Follow")]/text()', - 're_performers': r'Follow (.*)', - 'tags': '', + 'performers': '//span[contains(text(), "Performers")]/a/text()', + 'tags': '//span[contains(text(), "Categories")]/a/text()', 'external_id': r'.*\/(.*?)\/', - 'trailer': '//a[contains(@href,"https://hd.reflectivedesire.com")]/@href', + 'trailer': '', } def get_scenes(self, response): - scenes = response.xpath('//article/a/@href').getall() + scenes = response.xpath('//main/section[1]//article/a/@href').getall() for scene in scenes: if re.search(self.get_selector_map('external_id'), scene): yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene) def get_tags(self, response): tags = ['Bondage', 'Fetish', 'Latex / Rubber / Vinyl'] + tags2 = super().get_tags(response) + for tag in tags2: + tags.append(tag) return tags def get_date(self, response): diff --git a/scenes/siteSexMex.py b/scenes/siteSexMex.py index 839733f9..c65488ef 100644 --- a/scenes/siteSexMex.py +++ b/scenes/siteSexMex.py @@ -1,4 +1,5 @@ import re +from datetime import date, timedelta from tpdb.BaseSceneScraper import BaseSceneScraper from tpdb.items import SceneItem @@ -26,7 +27,7 @@ class SexMexSpider(BaseSceneScraper): } def get_scenes(self, response): - scenes = response.xpath('//div[contains(@class,"col-lg-4 col-md-4 col-xs-16 thumb")]') + scenes = response.xpath('//div[@class="videothumbnail"]') for scene in scenes: item = SceneItem() @@ -47,9 +48,9 @@ def get_scenes(self, response): image = re.search(r'url=(.*)', image).group(1) performers = scene.xpath('.//a[contains(@class, "modelnamesut") and contains(@href, "/models/")]/text()').getall() - sceneid = scene.xpath('./@data-setid').get() + sceneid = scene.xpath('./../@data-setid').get() - scene = scene.xpath('./div/a/@href').get() + scene = scene.xpath('./a[1]/@href').get() item['title'] = title item['date'] = date diff --git a/scenes/siteSexMexAlt.py b/scenes/siteSexMexAlt.py new file mode 100644 index 00000000..7b879ddc --- /dev/null +++ b/scenes/siteSexMexAlt.py @@ -0,0 +1,83 @@ +import re +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteSexMexAltSpider(BaseSceneScraper): + name = 'SexMexAlt' + network = 'SexMex' + parent = 'SexMex' + site = 'SexMex' + + start_urls = [ + 'https://exposedlatinas.com', + 'https://sexmexamateurs.com', + 'https://transqueens.com', + ] + + selector_map = { + 'title': '', + 'description': '', + 'date': '', + 'image': '', + 'performers': '', + 'tags': "", + 'external_id': r'', + 'trailer': '//video/source/@src', + 'pagination': '/tour/categories/movies_%s_d.html' + } + + def get_scenes(self, response): + scenes = response.xpath('//div[contains(@class,"col-lg-4 col-md-4 col-xs-16 thumb")]') + for scene in scenes: + item = SceneItem() + + date = scene.xpath('.//p[@class="scene-date"]/text()') + if date: + date = date.get() + date = self.parse_date(date.strip()).strftime('%Y-%m-%d') + else: + date = None + title = scene.xpath('.//h5/a/text()').get() + title = title.title() + if " . " in title: + title = re.search(r'^(.*) \. 
', title).group(1).strip() + description = scene.xpath('.//p[contains(@class,"scene-descr")]/text()').get() + image = scene.xpath('.//img/@src').get() + image = image.replace(" ", "%20") + if "transform.php" in image or "url=" in image: + image = re.search(r'url=(.*)', image).group(1) + performers = scene.xpath('.//a[contains(@class, "modelnamesut") and contains(@href, "/models/")]/text()').getall() + + sceneid = scene.xpath('./@data-setid').get() + + scene = scene.xpath('./div/a/@href').get() + + item['title'] = title + item['date'] = date + item['description'] = description + item['image'] = image + item['image_blob'] = self.get_image_blob_from_link(item['image']) + # only strip the query string when the image URL actually has one + if "?" in item['image']: + item['image'] = re.search(r'(.*)\?', item['image']).group(1) + item['performers'] = performers + item['tags'] = ['Latina', 'South American'] + item['id'] = sceneid + item['type'] = 'Scene' + item['trailer'] = '' + if "exposedlatinas" in response.url: + item['site'] = 'Exposed Latinas' + item['parent'] = 'SexMex' + item['network'] = 'SexMex' + if "sexmexamateurs" in response.url: + item['site'] = 'Sexmex Amateurs' + item['parent'] = 'SexMex' + item['network'] = 'SexMex' + item['tags'].append('Amateur') + if "transqueens" in response.url: + item['site'] = 'Trans Queens' + item['parent'] = 'SexMex' + item['network'] = 'SexMex' + item['tags'].append('Trans') + item['url'] = scene + + yield self.check_item(item, self.days) diff --git a/scenes/siteSexSelector.py b/scenes/siteSexSelector.py index 595d63ad..e647d79f 100644 --- a/scenes/siteSexSelector.py +++ b/scenes/siteSexSelector.py @@ -12,7 +12,7 @@ class SiteSexSelectorSpider(BaseSceneScraper): site = 'Sex Selector' start_urls = [ - 'https://www.sexselector.com', + # 'https://www.sexselector.com', Moved into Project1Service scraper ] selector_map = { diff --git a/scenes/siteSheFuckedHer.py b/scenes/siteSheFuckedHer.py new file mode 100644 index 00000000..dfd80fe4 --- /dev/null +++ b/scenes/siteSheFuckedHer.py @@ -0,0 +1,59 @@ +import re +import string +import slugify +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteSheFuckedHerSpider(BaseSceneScraper): + name = 'SheFuckedHer' + network = 'Apollo Cash' + + start_urls = [ + 'https://shefuckedher.com', + ] + + selector_map = { + 'external_id': r'', + 'pagination': '/index.php?updates=&page=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//table[@bgcolor="black"]') + for scene in scenes: + item = SceneItem() + item['title'] = self.cleanup_title(scene.xpath('.//td/h1/text()').get()) + item['description'] = "" + description = scene.xpath('.//font[contains(@face, "SunSans-Regular") and @size="4"]/text()') + if description: + item['description'] = " ".join(list(map(lambda x: x.strip(), description.getall()))).strip().replace("\n", "").replace("\t", "").replace("\r", "") + item['date'] = '' + item['image'] = "" + item['image_blob'] = "" + image = scene.xpath('.//th[@align="left"]/a/img/@src') + if image: + item['image'] = self.format_link(response, image.get()) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + performers = scene.xpath('.//font[contains(@face, "SunSans-Regular") and @size="6"]/b/text()') + item['performers'] = [] + if performers: + performers = performers.getall() + for performer in performers: + if "(" in performer: + item['performers'].append(re.search(r'(.*?) 
\(', performer).group(1)) + tags = scene.xpath('.//b[contains(text(), "Tags")]/following-sibling::text()[1]') + item['tags'] = [] + if tags: + tags = tags.get() + tags = tags.split(",") + item['tags'] = list(map(lambda x: string.capwords(x.strip()), tags)) + item['trailer'] = '' + item['id'] = slugify.slugify(item['title'].lower()) + item['network'] = "Apollo Cash" + item['parent'] = "She Fucked Her" + item['site'] = "She Fucked Her" + item['url'] = f"https://shefuckedher.com/{item['id']}" + yield self.check_item(item, self.days) diff --git a/scenes/siteSheSeducedMe.py b/scenes/siteSheSeducedMe.py new file mode 100644 index 00000000..f7d39259 --- /dev/null +++ b/scenes/siteSheSeducedMe.py @@ -0,0 +1,55 @@ +import re +import scrapy +import html +import unidecode +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteSheSeducedMeSpider(BaseSceneScraper): + name = 'SheSeducedMe' + site = 'She Seduced Me' + parent = 'She Seduced Me' + network = 'She Seduced Me' + + start_urls = [ + 'https://sheseducedme.com', + ] + + selector_map = { + 'title': '//div[@class="title_bar"]/span/text()', + 'description': '//span[contains(@class,"update_description")]/text()', + 'date': '//div[contains(@class, "gallery_info")]/div[1]//div[contains(@class,"update_date")]/text()', + 'date_formats': ['%m/%d/%Y'], + 'image': '//meta[@property="og:image"]/@content', + 'performers': '//div[@class="page_body"]/div[@class="gallery_info"]/span[@class="update_models"]/a/text()', + 'tags': '//div[@class="page_body"]/div[@class="gallery_info"]/span[@class="update_tags"]/a/text()', + 'trailer': '//script[contains(text(), "df_movie")]/text()', + 're_trailer': r'df_movie.*?path.*?[\'\"](.*?)[\'\"]', + 'external_id': r'.*/(.*?)\.htm', + 'pagination': '/vod/categories/movies_%s_d.html', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@id, "packageinfo")]') + for scene in scenes: + duration = scene.xpath('./following-sibling::div[contains(@class, "update_counts")]') + if duration: + duration = duration.get() + duration = unidecode.unidecode(html.unescape(duration.lower().replace(" ", " ").replace("\xa0", " "))) + duration = re.sub('[^a-zA-Z0-9-/]', '', duration) + duration = re.search(r'(\d+)min', duration) + if duration: + meta['duration'] = str(int(duration.group(1)) * 60) + + scene = scene.xpath('./following-sibling::a[1]/@href').get() + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def get_id(self, response): + sceneid = super().get_id(response) + sceneid = sceneid.lower() + if "_vids" in sceneid: + sceneid = re.search(r'(.*?)_vids', sceneid).group(1) + return sceneid diff --git a/scenes/siteSlimeWave.py b/scenes/siteSlimeWave.py new file mode 100644 index 00000000..cb459f1b --- /dev/null +++ b/scenes/siteSlimeWave.py @@ -0,0 +1,81 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteSlimeWaveSpider(BaseSceneScraper): + name = 'SlimeWave' + network = 'SlimeWave' + parent = 'SlimeWave' + site = 'SlimeWave' + + selector_map = { + 'title': '//h1[@class="title--3"]/text()', + 'description': '//div[contains(@class,"accordion__content")]/h5/following-sibling::p//text()', + 'date': '//div[contains(@class,"accordion__content")]//td[contains(text(), "Date added")]/following-sibling::td/text()', + 'date_formats': ['%d %B %Y'], + 'image': '//div[contains(@class,"show---link")]/img/@src', + 
'performers': '//figcaption[contains(@class,"girls-item--content")]/h4/text()',
+        'trailer': '',
+        'external_id': r'movie/(\d+)/',
+        'pagination': '',
+        'type': 'Scene',
+    }
+
+    def get_next_page_url(self, base, page, pagination):
+        return self.format_url(base, pagination % page)
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        # Deprecated in favor of Tainster scraper
+        # ~ link = "https://www.sinx.com/channel/Slime-Wave/all"
+        # ~ yield scrapy.Request(link, callback=self.start_requests_2, meta=meta, headers=self.headers, cookies=self.cookies)
+        # Return an empty iterable so Scrapy still gets something it can
+        # iterate over while the spider is disabled; with no yield left in
+        # the method it would otherwise return None and crash the crawl
+        return []
+
+    def start_requests_2(self, response):
+        meta = response.meta
+        links = response.xpath('//a[@class="item--link"]/@href').getall()
+        for link in links:
+            meta['pagination'] = link + "?page=%s"
+            yield scrapy.Request(url=self.get_next_page_url("https://www.sinx.com/", self.page, meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def parse(self, response, **kwargs):
+        scenes = self.get_scenes(response)
+        count = 0
+        for scene in scenes:
+            count += 1
+            yield scene
+
+        if count:
+            if 'page' in response.meta and response.meta['page'] < self.limit_pages:
+                meta = response.meta
+                meta['page'] = meta['page'] + 1
+                print('NEXT PAGE: ' + str(meta['page']))
+                yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page'], meta['pagination']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def get_scenes(self, response):
+        meta = response.meta
+        scenes = response.xpath('//div[@class="video_item--player"]/a[1]/@href').getall()
+        for scene in scenes:
+            if re.search(self.get_selector_map('external_id'), scene):
+                yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta)
+
+    def get_duration(self, response):
+        duration = response.xpath('//div[contains(@class,"accordion__content")]//td[contains(text(), "Runtime")]/following-sibling::td/text()')
+        if duration:
+            duration = duration.get()
+            duration = re.search(r'(\d+)', duration)
+            if duration:
+                duration = str(int(duration.group(1)) * 60)
+                return duration
+        return None
+
+    def get_tags(self, response):
+        taglist = response.xpath('//div[contains(@class,"video-page--tag")]//a/span/text()').getall()
+        tags = []
+        for tag in taglist:
+            tag = tag.replace("#", "")
+            # Split CamelCase hashtags ("#AnalSex" -> "Anal Sex") and trim the
+            # leading space the substitution leaves on the first word
+            tag = re.sub(r"([A-Z])", r" \1", tag).strip()
+            tags.append(tag)
+        return tags
diff --git a/scenes/siteTSRaw.py b/scenes/siteTSRaw.py
new file mode 100644
index 00000000..104a301b
--- /dev/null
+++ b/scenes/siteTSRaw.py
@@ -0,0 +1,81 @@
+import re
+import string
+import scrapy
+from tpdb.BaseSceneScraper import BaseSceneScraper
+from tpdb.items import SceneItem
+
+
+class SiteTSRawSpider(BaseSceneScraper):
+    name = 'TSRaw'
+    site = 'TSRaw'
+    parent = 'TSRaw'
+    network = 'TSRaw'
+
+    start_urls = [
+        'https://www.tsraw.com',
+    ]
+
+    selector_map = {
+        'title': './/span[contains(@class, "video-title")]/text()',
+        'description': './/p[contains(@class, "setTRT")]/text()',
+        'date': './/span[contains(@class, "videoDate")]/text()',
+        'image': '',
+        'performers': '',
+        'tags': './/span[contains(@class, "tags-style")]/following-sibling::a/text()',
+        'duration': '',
+        'trailer': '',
+        'external_id': r'',
+        'pagination': '',
+        'type': 'Scene',
+    }
+
+    def start_requests(self):
+        link = 'https://www.tsraw.com/index.php?section=1647'
+        yield scrapy.Request(link, callback=self.get_scenes, headers=self.headers, cookies=self.cookies)
+
+    def get_scenes(self, response):
+        scenes = response.xpath('//div[@class="videoThumb"]/..')
+        for scene in scenes:
+            item = SceneItem()
+            item['title'] = self.get_title(scene)
+            item['description'] = self.get_description(scene)
+            item['date'] = self.get_date(scene)
+            # Check for the thumbnail before formatting it; format_link() on a
+            # missing @src would fail
+            image = scene.xpath('.//img/@src').get()
+            if image:
+                image = self.format_link(response, image)
+                if "&width" in image:
+                    image = re.search(r'(.*?)\&width', image).group(1)
+            if image:
+                item['image'] = image
+                item['image_blob'] = self.get_image_blob_from_link(item['image'])
+            else:
+                item['image'] = ''
+                item['image_blob'] = ''
+
+            item['performers'] = []
+            performers = scene.xpath('.//span[contains(@class, "ts-video-desc")]/text()')
+            if performers:
+                performers = performers.get()
+                performers = performers.split(",")
+                if performers:
+                    item['performers'] = list(map(lambda x: string.capwords(x.strip()), performers))
+
+            item['tags'] = self.get_tags(scene)
+            # The numeric set id only occurs in the thumbnail URL ("gal=123"),
+            # so skip entries with no usable thumbnail instead of crashing
+            sceneid = re.search(r'gal=(\d+)', item['image'])
+            if not sceneid:
+                continue
+            item['id'] = sceneid.group(1)
+
+            item['trailer'] = ""
+            item['duration'] = ''
+            duration = scene.xpath('.//span[contains(@class, "videoTRT")]/text()')
+            if duration:
+                duration = duration.getall()
+                duration = "".join(duration)
+                duration = re.sub('[^a-zA-Z0-9]', '', duration.lower())
+                duration = re.search(r'(\d+)min', duration)
+                if duration:
+                    item['duration'] = str(int(duration.group(1)) * 60)
+
+            item['url'] = f"https://www.tsraw.com/index.php?vid={item['id']}"
+            item['network'] = self.network
+            item['parent'] = self.parent
+            item['site'] = self.site
+            item['type'] = 'Scene'
+
+            yield self.check_item(item, self.days)
diff --git a/scenes/siteThatFetishGirl.py b/scenes/siteThatFetishGirl.py
new file mode 100644
index 00000000..f91f0762
--- /dev/null
+++ b/scenes/siteThatFetishGirl.py
@@ -0,0 +1,51 @@
+import re
+import scrapy
+from tpdb.BaseSceneScraper import BaseSceneScraper
+
+
+class SiteThatFetishGirlSpider(BaseSceneScraper):
+    name = 'ThatFetishGirl'
+    network = 'That Fetish Girl'
+    site = 'That Fetish Girl'
+    parent = 'That Fetish Girl'
+
+    start_urls = [
+        'https://thatfetishgirl.com',
+    ]
+
+    selector_map = {
+        'title': '//div[@class="update_block"]//span[@class="update_title"]/text()',
+        'description': '//div[@class="update_block"]//span[contains(@class,"update_description")]/text()',
+        'date': '//div[@class="update_block"]//span[contains(@class,"availdate")]/text()[1]',
+        'date_formats': ['%m/%d/%Y'],
+        'image': '//div[@class="update_image"]//img[contains(@class, "large_update")]/@src',
+        'performers': '//div[@class="update_block"]//span[contains(@class,"update_models")]/a/text()',
+        'tags': '//div[@class="update_block"]//span[contains(@class,"update_tags")]/a/text()',
+        'trailer': '//div[@class="update_image"]/a[1]/@onclick',
+        're_trailer': r'\([\'\"](.*?)[\'\"]',
+        'external_id': r'.*/(.*?)\.htm',
+        'pagination': '/categories/movies_%s_d.html',
+        'type': 'Scene',
+    }
+
+    def get_scenes(self, response):
+        meta = response.meta
+        scenes = response.xpath('//div[@class="updateItem"]/a/@href').getall()
+        for scene in scenes:
+            if re.search(self.get_selector_map('external_id'), scene):
+                yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta)
+
+    def get_duration(self, response):
+        duration = response.xpath('//div[contains(@class, "update_counts")]/text()')
+        if duration:
+            duration = duration.get()
+            duration = duration.replace("\r", "").replace("\n", "").replace("\t", "").replace("\xa0", "").replace(" ", "").lower()
+            duration = re.search(r'(\d+)min', duration)
+            if duration:
+                duration = duration.group(1)
+                return str(int(duration) * 60)
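+        # Hedged note: the site only lists whole minutes (e.g. "32 min"), so
+        # the value above is an approximation in seconds; strings without a
+        # "min" figure fall through to the None below.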
+ return None + + def get_id(self, response): + sceneid = super().get_id(response) + return sceneid.lower() diff --git a/scenes/siteTranzVR.py b/scenes/siteTranzVR.py new file mode 100644 index 00000000..630b69f6 --- /dev/null +++ b/scenes/siteTranzVR.py @@ -0,0 +1,92 @@ +import re +import scrapy +import json +from tpdb.BaseSceneScraper import BaseSceneScraper +from tpdb.items import SceneItem + + +class SiteTranzVRSpider(BaseSceneScraper): + name = 'TranzVR' + network = 'TranzVR' + parent = 'TranzVR' + site = 'TranzVR' + + start_urls = [ + 'https://www.tranzvr.com', + ] + + selector_map = { + 'title': '', + 'description': '', + 'date': '', + 'image': '', + 'performers': '', + 'tags': '//div[@class="tag-list__body"]//a/text()', + 'duration': '', + 'trailer': '', + 'external_id': r'.*-(\d+)$', + 'pagination': '/?o=d&p=%s', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//li[contains(@class,"cards-list__item")]/div/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) + + def parse_scene(self, response): + scene = response.xpath('//script[contains(@type, "ld+json")]/text()').get() + scene = json.loads(scene) + item = SceneItem() + + item['title'] = self.cleanup_title(scene['name']) + item['id'] = re.search(r'.*-(\d+)$', response.url).group(1) + + item['description'] = self.cleanup_description(scene['description']) + + images = response.xpath('//div[contains(@class, "detail__video")]//picture//img/@srcset') + item['image'] = '' + item['image_blob'] = '' + + if images: + images = images.get() + images = images.split(",") + images = images[-1] + if images and " " in images: + item['image'] = re.search(r'(.*) ', images).group(1) + item['image_blob'] = self.get_image_blob_from_link(item['image']) + + if not item['image']: + if 'thumbnailUrl' in scene and scene['thumbnailUrl']: + item['image'] = scene['thumbnailUrl'] + item['image_blob'] = self.get_image_blob_from_link(item['image']) + + if scene['contentUrl']: + item['trailer'] = self.format_link(response, scene['contentUrl']).replace(" ", "%20") + else: + item['trailer'] = "" + + scene_date = self.parse_date(scene['uploadDate'], date_formats=['%Y-%m-%d']).strftime('%Y-%m-%d') + item['date'] = "" + if scene_date: + item['date'] = scene_date + + item['url'] = scene['embedUrl'] + if "?" 
in item['url']: + item['url'] = re.search(r'(.*)\?', item['url']).group(1) + + item['tags'] = self.get_tags(response) + + item['duration'] = self.duration_to_seconds(scene['duration']) + + item['site'] = 'TranzVR' + item['parent'] = 'TranzVR' + item['network'] = 'TranzVR' + + item['performers'] = [] + for model in scene['actor']: + item['performers'].append(model['name']) + + yield self.check_item(item, self.days) diff --git a/scenes/siteVirtualTaboo.py b/scenes/siteVirtualTaboo.py index 5ce0701c..75ad055a 100644 --- a/scenes/siteVirtualTaboo.py +++ b/scenes/siteVirtualTaboo.py @@ -13,12 +13,9 @@ class SiteVirtualTabooSpider(BaseSceneScraper): ] selector_map = { - 'title': '', - 'description': '', - 'date': '', - 'image': '', - 'performers': '', - 'tags': '', + 'title': './/div[@class="videoTitle"]/text()', + 'date': '//div[@class="row video-detail"]//div[contains(@class, "info mt-5")]//span[@class="bullet"]/following-sibling::text()[1]', + 'date_formats': ['%b %d, %Y'], 'external_id': r'.*/(.*?)', 'trailer': '', 'duration': '//div[contains(@class,"video-detail")]//div[contains(@class,"info")]/text()', @@ -52,7 +49,9 @@ def parse_scene(self, response): item['trailer'] = '' item['url'] = response.url item['id'] = re.search(r'videos/(.*)', item['url']).group(1) - item['date'] = self.parse_date(jsondata['uploadDate'].strip()).isoformat() + item['date'] = self.get_date(response) + if not item['date']: + item['date'] = self.parse_date(jsondata['uploadDate'].strip()).isoformat() item['site'] = "Virtual Taboo" item['parent'] = "Virtual Taboo" item['network'] = "Virtual Taboo" diff --git a/scenes/siteVurigVlaanderen.py b/scenes/siteVurigVlaanderen.py new file mode 100644 index 00000000..f1bfdd1a --- /dev/null +++ b/scenes/siteVurigVlaanderen.py @@ -0,0 +1,204 @@ +import re +from datetime import date, timedelta +import codecs +import json +import scrapy +from tpdb.items import SceneItem + +from tpdb.BaseSceneScraper import BaseSceneScraper + + +def match_tag(argument): + match = { + 'debutanten': "First Time", + 'anaal': "Anal", + 'dikke tieten': "Big Boobs", + 'amateur sex': "Amateur", + 'volle vrouw': "BBW", + 'duo': "FM", + 'gangbang': "Gangbang", + 'trio': "Threesome", + 'jonge meid': "18+ Teens", + 'squirten': "Squirting", + 'pov': "POV", + 'lesbisch': "Lesbian", + 'pijpen': "Blowjob", + 'buitensex': "Outdoors", + 'bdsm': "BDSM", + 'rollenspel': "Roleplay", + 'internationaal': "International", + 'klassiekers': "Classics", + 'milf': "MILF", + } + return match.get(argument, '') + + +class SiteVurigVlaanderenSpider(BaseSceneScraper): + name = 'VurigVlaanderen' + network = 'Vurig Vlaanderen' + parent = 'Vurig Vlaanderen' + site = 'Vurig Vlaanderen' + + base_url = 'https://vurigvlaanderen.be' + + cookies = {"name": "agecookies", "value": "true"} + + headers_json = { + 'origin': 'https://vurigvlaanderen.be', + 'referer': 'https://vurigvlaanderen.be/', + 'Credentials': 'Syserauth 3-585d92b35321e910bc1c25b734531c9adf52e2679c0d42aefad09e2556cde47f-65be7945', + } + + selector_map = { + 'title': '//script[contains(text(),"NUXT")]/text()', + 're_title': r'video:\{title:\"(.*?)\"', + 'description': '//script[contains(text(),"NUXT")]/text()', + 're_description': r'description:\"(.*?)\"', + 'date': '//script[contains(text(),"NUXT")]/text()', + 're_date': r'pivot_data:\{active_from:\"(\d{4}-\d{2}-\d{2})', + 'image': '//meta[@name="og:image"]/@content', + 'performers': '//script[contains(text(),"NUXT")]/text()', + 're_performers': r'models:\[(.*?)\]', + 'tags': '//script[contains(text(),"NUXT")]/text()', + 
'external_id': r'sexfilms\/(.*)', + 'trailer': '', + 'pagination': '/categories/movies_%s_d.html#' + } + + def get_next_page_url(self, base, page): + url = 'https://api.sysero.nl/videos?page={}&count=20&type=video&include=images:types(thumb|thumb_mobile),products,categories,clips&filter[status]=published&filter[products]=1%2C2&filter[recurring]=1&sort[recommended_at]=DESC&frontend=3' + return self.format_url(base, url.format(page)) + + def start_requests(self): + meta = {} + meta['page'] = self.page + + link = "https://vurigvlaanderen.be/sexfilms" + yield scrapy.Request(link, callback=self.start_requests_2, meta=meta, cookies=self.cookies) + + def start_requests_2(self, response): + meta = response.meta + link = self.get_next_page_url(self.base_url, meta['page']) + yield scrapy.Request(link, callback=self.parse, meta=meta, headers=self.headers_json) + + def parse(self, response, **kwargs): + scenes = self.get_scenes(response) + count = 0 + for scene in scenes: + count += 1 + yield scene + if count: + if 'page' in response.meta and response.meta['page'] < self.limit_pages: + meta = response.meta + meta['page'] = meta['page'] + 1 + print('NEXT PAGE: ' + str(meta['page'])) + yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta, headers=self.headers_json) + + def get_scenes(self, response): + jsondata = json.loads(response.text) + data = jsondata['data'] + for jsonentry in data: + if jsonentry['attributes']['slug']: + scene_url = "https://vurigvlaanderen.be/sexfilms/" + jsonentry['attributes']['slug'] + yield scrapy.Request(url=self.format_link(response, scene_url), callback=self.parse_scene) + + def get_performers(self, response): + performers = self.process_xpath(response, self.get_selector_map('performers')) + if performers: + performers = performers.get() + performers = re.search( + r',models:\[(.*id:\".*?)\],preroll', + performers) + if performers: + performers = performers.group(1) + performers = re.findall('title:\"(.*?)\"', performers) + return list(map(lambda x: x.strip(), performers)) + return [] + + def get_tags(self, response): + tags = self.process_xpath(response, self.get_selector_map('tags')) + if tags: + tags = tags.get() + tags = re.search(r',categories:\[(.*?)\],products', tags) + if tags: + tags = tags.group(1) + tags = re.findall(r'name:\"(.*?)\"', tags) + tags2 = ['European'] + for tag in tags: + found_tag = match_tag(tag.lower()) + if found_tag: + tags2.append(found_tag) + return list(map(lambda x: x.strip().title(), tags2)) + return [] + + def get_description(self, response): + if 'description' not in self.get_selector_map(): + return '' + + description = self.process_xpath(response, self.get_selector_map('description')) + if description: + description = self.get_from_regex(description.get(), 're_description') + + if description: + try: + description = codecs.decode(description, 'unicode-escape') + except Exception: + description = re.sub(r'\\u00\d[a-fA-F]', '', description) + description = re.sub(r'<[^<]+?>', '', description).strip() + description = re.sub( + r'[^a-zA-Z0-9\-_ \.\?\!]', '', description) + return self.cleanup_description(description) + return '' + + def get_date(self, response): + datestring = self.process_xpath(response, self.get_selector_map('date')) + if datestring: + datestring = datestring.get().replace(r"\u002F", "/") + date = re.search(self.get_selector_map('re_date'), datestring) + if not date: + date = re.search(r'active_from=\"(\d{4}-\d{2}-\d{2})', datestring) + if not date: + date = 
re.search(r'active_from:\"(\d{1,2}/\d{1,2}/\d{2})', datestring)
+            if date:
+                date = date.group(1)
+                # The last fallback regex captures two-digit years, so accept
+                # %m/%d/%y alongside the full formats
+                return self.parse_date(date, date_formats=['%Y-%m-%d', '%m/%d/%Y', '%m/%d/%y']).strftime('%Y-%m-%d')
+            return self.parse_date('today').strftime('%Y-%m-%d')
+        return None
+
+    def get_duration(self, response):
+        duration = response.xpath('//span[@class="time"]/text()')
+        if duration:
+            duration = duration.get()
+            duration = re.search(r'(\d+)', duration)
+            if duration:
+                return str(int(duration.group(1)) * 60)
+        return None
+
+    def parse_scene(self, response):
+        item = SceneItem()
+
+        item['title'] = self.get_title(response)
+        item['description'] = self.get_description(response)
+        item['date'] = self.get_date(response)
+        item['image'] = self.get_image(response)
+
+        if not item['image']:
+            item['image'] = ''
+            item['image_blob'] = ''
+        else:
+            item['image_blob'] = self.get_image_blob_from_link(item['image'])
+
+        item['performers'] = self.get_performers(response)
+        item['tags'] = self.get_tags(response)
+        item['id'] = self.get_id(response)
+        item['trailer'] = self.get_trailer(response)
+
+        item['url'] = self.get_url(response)
+
+        item['duration'] = self.get_duration(response)
+
+        item['site'] = self.site
+        item['network'] = self.network
+        item['parent'] = self.parent
+
+        if item['title'] and item['id']:
+            yield self.check_item(item, self.days)
diff --git a/scenes/siteWankzVR.py b/scenes/siteWankzVR.py
index afcef654..17ac0e0f 100644
--- a/scenes/siteWankzVR.py
+++ b/scenes/siteWankzVR.py
@@ -41,3 +41,11 @@ def get_tags(self, response):
         tags = super().get_tags(response)
         tags.append("Virtual Reality")
         return tags
+
+    def get_duration(self, response):
+        duration = super().get_duration(response)
+        # Guard against a missing duration before probing for the ISO-8601
+        # "PT..M" form, then convert whole minutes to seconds
+        if duration and ("PT" in duration or "M" in duration):
+            duration = re.search(r'(\d+)', duration)
+            if duration:
+                duration = str(int(duration.group(1)) * 60)
+        return duration
diff --git a/scenes/siteWatch4Beauty.py b/scenes/siteWatch4Beauty.py
index f3143b1c..87ae54f7 100644
--- a/scenes/siteWatch4Beauty.py
+++ b/scenes/siteWatch4Beauty.py
@@ -57,6 +57,10 @@ def parse_scene(self, response):
         if len(data):
             data = data[0]
             item['title'] = data['issue_title']
+            if len(item['title']) < 3:
+                item['title'] = item['title'] + "."
+            if len(item['title']) < 3:
+                item['title'] = item['title'] + "."
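+            # The two checks above pad one- and two-character titles with
+            # trailing dots ("A" -> "A.."), presumably to satisfy a minimum
+            # title length downstream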
item['date'] = data['issue_datetime'] if "Z" in item['date']: item['date'] = item['date'][:-1] @@ -90,23 +94,7 @@ def parse_models(self, response): item['performers'] = performers - days = int(self.days) - if days > 27375: - filterdate = "0000-00-00" - else: - filterdate = date.today() - timedelta(days) - filterdate = filterdate.strftime('%Y-%m-%d') - - if self.debug: - if not item['date'] > filterdate: - item['filtered'] = "Scene filtered due to date restraint" - print(item) - else: - if filterdate: - if item['date'] > filterdate: - yield item - else: - yield item + yield self.check_item(item, self.days) def get_next_page_url(self, base, page, response=""): if response: diff --git a/scenes/siteWifeysWorld.py b/scenes/siteWifeysWorld.py index 8f3ce081..d5b64e9d 100644 --- a/scenes/siteWifeysWorld.py +++ b/scenes/siteWifeysWorld.py @@ -37,7 +37,7 @@ def get_scenes(self, response): else: item['image'] = None item['image_blob'] = None - item['performers'] = None + item['performers'] = ['Sandra Otterson'] item['tags'] = None item['markers'] = None item['id'] = scene.xpath('./@data-setid').get() diff --git a/scenes/siteXX-Cel.py b/scenes/siteXX-Cel.py new file mode 100644 index 00000000..9c43ed66 --- /dev/null +++ b/scenes/siteXX-Cel.py @@ -0,0 +1,36 @@ +import re +import scrapy +from tpdb.BaseSceneScraper import BaseSceneScraper + + +class SiteXXCelSpider(BaseSceneScraper): + name = 'XX-Cel' + network = 'XX-Cel' + parent = 'XX-Cel' + site = 'XX-Cel' + + start_urls = [ + 'https://xx-cel.com', + ] + + selector_map = { + 'title': '//div[contains(@class, "vid-details")]//h2/text()', + 'description': '', + 'date': '//div[contains(@class, "vid-details")]//span[contains(text(), "eleased")]/strong/text()', + 'date_formats': ['%b %d, %Y'], + 'image': '//div[@id="videoPlayer"]//video/@poster|//div[@id="videoPlayer"]/a/img/@src', + 'performers': '//div[contains(@class, "vid-details")]//span[contains(text(), "tarring")]/a/text()', + 'tags': '', + 'duration': '//div[contains(@class, "vid-details")]//span[contains(text(), "uration")]/strong/text()', + 'trailer': '//div[@id="videoPlayer"]//video/source/@src', + 'external_id': r'.*/(.*?)$', + 'pagination': '/movies/page-%s/?tag=&q=&model=&sort=recent', + 'type': 'Scene', + } + + def get_scenes(self, response): + meta = response.meta + scenes = response.xpath('//div[contains(@class, "star col-xxl-3")]/a/@href').getall() + for scene in scenes: + if re.search(self.get_selector_map('external_id'), scene): + yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta) diff --git a/scenes/siteXXXJobInterviews.py b/scenes/siteXXXJobInterviews.py index 33eb2e22..e93271b4 100644 --- a/scenes/siteXXXJobInterviews.py +++ b/scenes/siteXXXJobInterviews.py @@ -32,22 +32,22 @@ def get_scenes(self, response): meta = response.meta scenes = response.xpath('//div[@class="gallery-item"]') for scene in scenes: - duration = scene.xpath('.//div[contains(@class, "info-container")]/div/span[1]/text()') + duration = scene.xpath('.//div[@class="item"]/div[1]/span/text()') if duration: duration = duration.get() meta['duration'] = self.duration_to_seconds(duration.strip()) - scenedate = scene.xpath('.//div[contains(@class, "info-container")]/div/span[2]/text()') + scenedate = scene.xpath('.//div[@class="item"]/div[2]/span/text()') if scenedate: - scenedate = scenedate.get() - meta['date'] = scenedate.strip() - scene = scene.xpath('./a/@href').get() - if re.search(self.get_selector_map('external_id'), scene): + meta['date'] = 
self.parse_date(scenedate.get(), date_formats=['%B %d, %Y']).strftime('%Y-%m-%d')
-            scene = scene.xpath('./a/@href').get()
-            if re.search(self.get_selector_map('external_id'), scene):
+            scene = scene.xpath('.//div[@class="description"]/a/@href').get()
+            if scene and re.search(self.get_selector_map('external_id'), scene):
                 yield scrapy.Request(url=self.format_link(response, scene), callback=self.parse_scene, meta=meta)
 
     def get_tags(self, response):
-        tags = response.xpath('//div[@class="tags"]/ul/li/a/text()').getall()
-        tags2 = []
-        for tag in tags:
-            if re.sub(r'[A-Z]', '', tag) == tag:
-                tags2.append(string.capwords(tag))
-        return tags2
+        tags = super().get_tags(response)
+        performers = self.get_performers(response)
+        # Filter into a new list: removing from "tags" while iterating over it
+        # skips elements, and remove(performer) raises ValueError whenever the
+        # tag merely contains the performer's name rather than equalling it
+        return [tag for tag in tags if not any(performer.lower() in tag.lower() for performer in performers)]
diff --git a/scenes/siteppptv.py b/scenes/siteppptv.py
new file mode 100644
index 00000000..61891f73
--- /dev/null
+++ b/scenes/siteppptv.py
@@ -0,0 +1,145 @@
+import re
+import scrapy
+import os
+import json
+import datetime
+from dateutil.relativedelta import relativedelta
+from tpdb.BaseSceneScraper import BaseSceneScraper
+from tpdb.items import SceneItem
+
+
+class SitePPPTVSpider(BaseSceneScraper):
+    name = 'PPPTV'
+    site = 'P-P-P TV'
+    parent = 'P-P-P TV'
+    network = 'P-P-P TV'
+
+    start_urls = [
+        'https://p-p-p.tv',
+    ]
+
+    selector_map = {
+        'title': './/strong/text()[1]',
+        'description': '',
+        'date': './/i[contains(@class, "calendar")]/following-sibling::text()',
+        'image': './/img/@src',
+        'performers': '',
+        'tags': '',
+        'duration': './/i[contains(@class, "fa-clock")]/following-sibling::text()',
+        'trailer': '',
+        'external_id': r'',
+        'pagination': '/en/videos/list?page=%s',
+        'type': 'Scene',
+    }
+
+    def start_requests(self):
+        meta = {}
+        meta['page'] = self.page
+
+        # The data file maps known performer names to scenes; default to an
+        # empty mapping so get_scenes() doesn't fail on ['scenes'] when the
+        # file is absent
+        performer_list = './datafiles/PPPTV_Performers.json'
+        if os.path.isfile(performer_list):
+            with open(performer_list) as f:
+                meta['performer_list'] = json.load(f)
+        else:
+            meta['performer_list'] = {'scenes': []}
+
+        for link in self.start_urls:
+            yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies)
+
+    def get_scenes(self, response):
+        meta = response.meta
+        performer_list = meta['performer_list']['scenes']
+        scenes = response.xpath('//turbo-frame//div[contains(@class, "col-md-4")]')
+        for scene in scenes:
+            datetest = scene.xpath('.//i[contains(@class, "calendar")]/following-sibling::text()')
+            if datetest:
+                datetest = datetest.get()
+                # Dates such as "in 3 days" mark scheduled, unreleased scenes;
+                # a bare "in" substring check would also match "minutes", so
+                # only skip strings that actually start with it
+                if not datetest.strip().lower().startswith("in "):
+                    item = SceneItem()
+
+                    item['title'] = self.get_title(scene)
+                    item['duration'] = self.get_duration(scene)
+                    item['description'] = ''
+                    item['tags'] = ['European']
+                    item['performers'] = []
+                    for model in performer_list:
+                        if model['name'] in item['title']:
+                            # "Cat" is a substring of "Cathaleya", so only
+                            # credit Cat when Cathaleya isn't in the title
+                            if model['name'] == "Cat":
+                                if "Cathaleya" not in item['title']:
+                                    item['performers'].append(model['name'])
+                            else:
+                                item['performers'].append(model['name'])
+                    for model in performer_list:
+                        if " " in model['name']:
+                            if model['name'].replace(" ", "").lower() in item['title'].replace(" ", "").lower():
+                                item['performers'].append(model['name'])
+
+                    item['performers'] = list(set(item['performers']))
+
+                    item['date'] = self.get_date(scene)
+
+                    item['image'] = self.get_image(scene, response)
+                    item['image_blob'] = self.get_image_blob_from_link(item['image'])
+
+                    item['url'] = self.format_link(response, scene.xpath('./a/@href').get())
+                    item['id'] = re.search(r'.*/(.*?)$', item['url']).group(1)
+
+                    item['trailer'] = ''
+                    item['type'] = 'Scene'
+                    item['site'] = "P-P-P TV"
+                    item['parent'] = "P-P-P TV"
+                    item['network'] = "P-P-P TV"
+
+                    if item['id'] and item['title']:
+                        yield self.check_item(item, self.days)
+
+    def get_image(self, scene, response, path=None):
+        force_update = self.settings.get('force_update')
+        if force_update:
+            force_update = True
+        force_fields = self.settings.get('force_fields')
+        if force_fields:
+            force_fields = force_fields.split(",")
+
+        if not force_update or (force_update and "image" in force_fields):
+            if 'image' in self.get_selector_map():
+                image = self.get_element(scene, 'image', 're_image')
+                if isinstance(image, list):
+                    image = image[0]
+                image = image.replace(" ", "%20")
+                if path:
+                    return self.format_url(path, image)
+                else:
+                    return self.format_link(response, image)
+            return ''
+
+        # Keep the return type consistent with the branches above, which all
+        # produce a string (this previously returned a list)
+        return ''
+
+    def get_date(self, response):
+        today = datetime.datetime.now()
+        datestring = self.process_xpath(response, self.get_selector_map('date')).get()
+        datestring = datestring.lower()
+        # "today" and "yesterday" carry no digits, so fall back to a zero
+        # interval instead of crashing on a failed regex match
+        intervalcount = re.search(r'(\d+)', datestring)
+        if intervalcount:
+            intervalcount = int(intervalcount.group(1))
+        else:
+            intervalcount = 0
+        # Default to today so an unrecognised string can't leave date unbound
+        date = today
+        if "minute" in datestring:
+            date = today - relativedelta(minutes=intervalcount)
+        if "hour" in datestring:
+            date = today - relativedelta(hours=intervalcount)
+        if "day" in datestring:
+            date = today - relativedelta(days=intervalcount)
+        if "today" in datestring:
+            date = today
+        if "yesterday" in datestring:
+            date = today - relativedelta(days=1)
+        if "week" in datestring:
+            date = today - relativedelta(weeks=intervalcount)
+        if "month" in datestring:
+            date = today - relativedelta(months=intervalcount)
+        if "year" in datestring:
+            date = today - relativedelta(years=intervalcount)
+
+        return date.isoformat()
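+
+    # Rough examples of the relative-date mapping above, assuming the English
+    # strings the /en/ listing serves: "3 hours ago" -> now - 3 hours,
+    # "yesterday" -> now - 1 day, while "in 3 days" never reaches here because
+    # get_scenes() filters unreleased entries.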