Skip to content

Commit

Permalink
Scrapers update
Browse files: browse the repository at this point in the history
  • Loading branch information
Darklyter committed Mar 22, 2024
1 parent 012d40b commit e46191b
Show file tree
Hide file tree
Showing 169 changed files with 7,586 additions and 3,712 deletions.
49 changes: 49 additions & 0 deletions performers/MatureFetishPerformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from tpdb.BasePerformerScraper import BasePerformerScraper
from tpdb.items import PerformerItem


class SiteMatureFetishPerformerSpider(BasePerformerScraper):
    """Performer spider for maturefetish.com.

    Items are built directly from the model tiles found in a response;
    only the name, image and URL are scraped, every other profile field
    is emitted as an empty string.
    """

    selector_map = {
        'pagination': '/en/models/%s?sort=&q=&sex=female',
        'external_id': r'model/(.*)/'
    }

    name = 'MatureFetishPerformer'

    start_urls = [
        'https://maturefetish.com',
    ]

    def get_performers(self, response):
        """Yield a ``PerformerItem`` for each ``grid-tile-model`` div in *response*."""
        tiles = response.xpath('//div[@class="grid-tile-model"]')
        for tile in tiles:
            item = PerformerItem()

            raw_name = tile.xpath('./div/a/text()').get()
            item['name'] = self.cleanup_title(raw_name)

            # The image URL is read from the ``data-src`` attribute of the
            # tile's <img>; tiles without one get empty image fields.
            image_src = tile.xpath('.//img/@data-src')
            if not image_src:
                item['image'] = ""
                item['image_blob'] = ""
            else:
                item['image'] = self.format_link(response, image_src.get())
                item['image_blob'] = self.get_image_blob_from_link(item['image'])

            item['bio'] = ''
            item['gender'] = 'Female'

            # These fields are not scraped here, so they are emitted blank.
            # The tuple order mirrors the order the keys were originally set in.
            for blank_field in (
                'astrology',
                'birthday',
                'birthplace',
                'cupsize',
                'ethnicity',
                'eyecolor',
                'fakeboobs',
                'haircolor',
                'height',
                'measurements',
                'nationality',
                'piercings',
                'tattoos',
                'weight',
            ):
                item[blank_field] = ''

            item['network'] = 'Mature NL'

            profile_href = tile.xpath('./a/@href').get()
            item['url'] = self.format_link(response, profile_href)

            yield item
86 changes: 86 additions & 0 deletions performers/javJAVCTPerformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import re
import string
import scrapy

from tpdb.BasePerformerScraper import BasePerformerScraper


class JavJAVCTPerformerSpider(BasePerformerScraper):
    """Performer spider for javct.net.

    Listing pages are walked for performer links; each performer page is
    handed to ``parse_performer`` (not defined in this class). Name, height,
    measurements and cup size get site-specific post-processing below.
    """

    selector_map = {
        'name': '//li[contains(@class, "breadcrumb__item--active")]/text()',
        'image': '//div[@class="stats"]/img/@data-src',
        'image_blob': True,
        'birthday': '//div[@class="stats"]/ul/li[contains(./text(), "Born:")]/text()',
        're_birthday': r': (.*)',
        'height': '//div[@class="stats"]/ul/li[contains(./text(), "Height:")]/text()',
        're_height': r': (.*)',
        'measurements': 'InCode',

        'pagination': '/models/pg-%s',
        'external_id': r'model/(.*)/'
    }

    name = 'JAVJAVCTPerformer'
    network = 'R18'

    start_urls = [
        'https://javct.net',
    ]

    def get_gender(self, response):
        """Return the fixed gender used for every performer of this spider."""
        return 'Female'

    def get_performers(self, response):
        """Yield a request for every performer link found in *response*."""
        hrefs = response.xpath('//h3[contains(@class,"card__title")]/a/@href').getall()
        for href in hrefs:
            yield scrapy.Request(
                url=self.format_link(response, href),
                callback=self.parse_performer,
                cookies=self.cookies,
                headers=self.headers,
            )

    def _javct_stat_inches(self, response, label):
        """Return the first number of the stats entry containing *label*, divided by 2.54.

        The division is presumably a centimetre-to-inch conversion (confirm
        against the site). Returns ``None`` when the entry is missing or holds
        no digits; the result is rounded to an int and may be ``0``.
        """
        nodes = response.xpath(
            f'//div[@class="stats"]/ul/li[contains(./text(), "{label}")]/text()'
        )
        if not nodes:
            return None
        digits = re.search(r'(\d+)', nodes.get())
        if not digits:
            return None
        return round(int(digits.group(1)) / 2.54)

    def get_measurements(self, response):
        """Return ``"bust-waist-hips"`` from the stats list, or ``''``.

        An empty string is returned unless all three values are present and
        non-zero after conversion.
        """
        values = [
            self._javct_stat_inches(response, label)
            for label in ("Breast:", "Waist:", "Hips:")
        ]
        # all() rejects both missing values (None) and values that rounded to 0.
        if all(values):
            return "-".join(str(value) for value in values)
        return ''

    def get_cupsize(self, response):
        """Return the converted "Breast:" value as a string, or ``''``.

        NOTE(review): this is the bust number only, with no cup letter -
        confirm that is what the ``cupsize`` field is expected to hold.
        """
        bust = self._javct_stat_inches(response, "Breast:")
        return str(bust) if bust else ''

    def get_name(self, response):
        """Return the base-class name without any parenthesised suffix, capitalised per word."""
        name = super().get_name(response)
        if "(" in name:
            # Keep only the text in front of the first opening parenthesis.
            name = re.search(r'(.*?)\(', name).group(1)
        return string.capwords(name.strip())

    def get_height(self, response):
        """Return the base-class height with all spaces removed."""
        height = super().get_height(response)
        return height.replace(" ", "")
129 changes: 129 additions & 0 deletions performers/javSEXTBPerformerPlaywright.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import re
import string
import scrapy

from tpdb.BasePerformerScraper import BasePerformerScraper


class JavSEXTBPerformerSpider(BasePerformerScraper):
    """Performer spider for sextb.net whose listing requests carry the ``playwright`` meta flag.

    Listing pages are walked for actress links; each performer page is handed
    to ``parse_performer`` (not defined in this class). Name, height,
    measurements and cup size get site-specific post-processing below.
    """

    selector_map = {
        'name': '//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-user")]/following-sibling::span/a/text()',
        'image': '//section[@class="tray all"]/div/img/@data-src',
        'image_blob': True,
        'birthday': '//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-birthday")]/following-sibling::span/text()',
        'height': '//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-arrows-alt") and contains(following-sibling::text(), "Height")]/following-sibling::span/text()',
        'measurements': 'InCode',

        'pagination': '/list-actress/pg-%s',
        'external_id': r'model/(.*)/'
    }

    custom_scraper_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62',
        'TWISTED_REACTOR': 'twisted.internet.asyncioreactor.AsyncioSelectorReactor',
        'AUTOTHROTTLE_ENABLED': True,
        'USE_PROXY': False,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 60,
        'CONCURRENT_REQUESTS': 1,
        'DOWNLOAD_DELAY': 2,
        'DOWNLOADER_MIDDLEWARES': {
            # 'tpdb.helpers.scrapy_flare.FlareMiddleware': 542,
            'tpdb.middlewares.TpdbSceneDownloaderMiddleware': 543,
            'tpdb.custommiddlewares.CustomProxyMiddleware': 350,
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
            'scrapy_fake_useragent.middleware.RetryUserAgentMiddleware': 401,
        },
        'DOWNLOAD_HANDLERS': {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        }
    }

    name = 'JAVSEXTBPerformerPlaywright'
    network = 'R18'

    start_urls = [
        'https://sextb.net',
    ]

    def start_requests(self):
        """Yield the first listing request for each start URL, flagged for Playwright."""
        meta = {
            'page': self.page,
            'playwright': True,
        }

        for link in self.start_urls:
            yield scrapy.Request(
                url=self.get_next_page_url(link, self.page),
                callback=self.parse,
                meta=meta,
                headers=self.headers,
                cookies=self.cookies,
            )

    def parse(self, response, **kwargs):
        """Yield the performer requests of a listing page, then the next listing page.

        NOTE(review): the next page is requested whenever the page number is
        below ``self.limit_pages``, even if this page produced no performers -
        confirm that an empty page should not stop pagination.
        """
        yield from self.get_performers(response)

        if 'page' in response.meta and response.meta['page'] < self.limit_pages:
            # The response's meta dict is reused (and mutated) so every key
            # already present in it is carried over to the next request.
            meta = response.meta
            meta['page'] = meta['page'] + 1
            print('NEXT PAGE: ' + str(meta['page']))
            yield scrapy.Request(
                url=self.get_next_page_url(response.url, meta['page']),
                callback=self.parse,
                meta=meta,
            )

    def get_gender(self, response):
        """Return the fixed gender used for every performer of this spider."""
        return 'Female'

    def get_performers(self, response):
        """Yield a request for every actress link found in *response*.

        NOTE(review): no meta is passed here, so these requests do not carry
        the 'playwright' flag set in ``start_requests`` - confirm that is intended.
        """
        hrefs = response.xpath('//div[@class="tray-item-actress"]/a/@href').getall()
        for href in hrefs:
            yield scrapy.Request(url=self.format_link(response, href), callback=self.parse_performer)

    def _sextb_stat_inches(self, response, label):
        """Return the first number of the actor-info entry labelled *label*, divided by 2.54.

        The division is presumably a centimetre-to-inch conversion (confirm
        against the site). Returns ``None`` when the entry is missing or holds
        no digits; the result is rounded to an int and may be ``0``.
        """
        nodes = response.xpath(
            '//div[contains(@class,"actor-info")]/ul/li/i[contains(@class, "fa-arrows-alt") '
            f'and contains(following-sibling::text(), "{label}")]/following-sibling::span/text()'
        )
        if not nodes:
            return None
        digits = re.search(r'(\d+)', nodes.get())
        if not digits:
            return None
        return round(int(digits.group(1)) / 2.54)

    def get_measurements(self, response):
        """Return ``"bust-waist-hips"`` from the actor-info list, or ``''``.

        An empty string is returned unless all three values are present and
        non-zero after conversion.
        """
        values = [
            self._sextb_stat_inches(response, label)
            for label in ("Breast", "Waist", "Hips")
        ]
        # all() rejects both missing values (None) and values that rounded to 0.
        if all(values):
            return "-".join(str(value) for value in values)
        return ''

    def get_cupsize(self, response):
        """Return the converted "Breast" value as a string, or ``''``.

        NOTE(review): this is the bust number only, with no cup letter -
        confirm that is what the ``cupsize`` field is expected to hold.
        """
        bust = self._sextb_stat_inches(response, "Breast")
        return str(bust) if bust else ''

    def get_name(self, response):
        """Return the base-class name without any parenthesised suffix, capitalised per word."""
        name = super().get_name(response)
        if "(" in name:
            # Keep only the text in front of the first opening parenthesis.
            name = re.search(r'(.*?)\(', name).group(1)
        return string.capwords(name.strip())

    def get_height(self, response):
        """Return the base-class height with all spaces removed."""
        height = super().get_height(response)
        return height.replace(" ", "")
2 changes: 1 addition & 1 deletion performers/networkAdultEmpireCashPerformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def match_path(argument):
'www.filthykings.com': "/filthy-kings-porn-stars.html?page=%s&hybridview=member",
'thirdworldxxx.com': "/third-world-media-porn-stars.html?page=%s&hybridview=member",
'latinoguysporn.com': "/latino-guys-porn-stars.html?page=%s&hybridview=member",
'www.lethalhardcore.com': "/lethal-hardcore-porn-stars.html?page=%s&hybridview=member",
'www.lethalhardcore.com': "/porn-stars.html?sort=ag_added&page=%s&hybridview=member",
'www.wcpclub.com': "/west-coast-productions-porn-stars.html?page=%s&hybridview=member",
}
return match.get(argument, "")
Expand Down
Loading

0 comments on commit e46191b

Please sign in to comment.