From 7868ab06668484ca2302c9f699b3f3a16139a060 Mon Sep 17 00:00:00 2001
From: Ryoichiro Kamiya
Date: Mon, 25 Jun 2018 00:47:33 +0900
Subject: [PATCH 1/2] Resolves #106 - updated Scrapy requirement after
 successful test run.

---
 requirements.txt | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 995d1d7f..be4c9b96 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 Django>=1.8,<1.12
-Scrapy>=1.4,<1.5
+Scrapy>=1.5,<1.6
 scrapy-djangoitem>=1.1.1,<1.2
 scrapy-splash>=0.7,<0.8
 scrapyd>=1.2,<1.3
@@ -9,4 +9,3 @@ Celery==3.1.25
 django-celery==3.2.1
 future>=0.15,<0.16
 pillow>=3.0,<4.0
-

From f2cbc8874111eb925bf2779a513e73c3331f7e8f Mon Sep 17 00:00:00 2001
From: Ryoichiro Kamiya
Date: Thu, 12 Jul 2018 16:45:28 +0900
Subject: [PATCH 2/2] Updated to support Django's timezone-aware now().

---
 dynamic_scraper/models.py | 70 +++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/dynamic_scraper/models.py b/dynamic_scraper/models.py
index 21790da6..d7cadc77 100644
--- a/dynamic_scraper/models.py
+++ b/dynamic_scraper/models.py
@@ -7,7 +7,7 @@
 import datetime
 from django.db import models
 from django.db.models import Q
-
+from django.utils import timezone
 
 @python_2_unicode_compatible
 class ScrapedObjClass(models.Model):
@@ -25,7 +25,7 @@ class ScrapedObjClass(models.Model):
 "ZERO_ACTIONS_FACTOR_CHANGE": 5,\n\
 "FACTOR_CHANGE_FACTOR": 1.3,\n')
     comments = models.TextField(blank=True)
-    
+
     def __str__(self):
         return self.name
 
@@ -50,10 +50,10 @@ class ScrapedObjAttr(models.Model):
     attr_type = models.CharField(max_length=1, choices=ATTR_TYPE_CHOICES)
     id_field = models.BooleanField(default=False)
     save_to_db = models.BooleanField(default=True)
-    
+
     def __str__(self):
         return self.name + " (" + str(self.obj_class) + ")"
-    
+
     class Meta(object):
         ordering = ['order',]
 
@@ -112,7 +112,7 @@ class Scraper(models.Model):
     pagination_type = models.CharField(max_length=1, choices=PAGINATION_TYPE, default='N')
     pagination_on_start = models.BooleanField(default=False)
     pagination_append_str = models.CharField(max_length=200, blank=True, help_text="Syntax: /somepartofurl/{page}/moreurlstuff.html")
-    pagination_page_replace = models.TextField(blank=True, 
+    pagination_page_replace = models.TextField(blank=True,
         help_text="RANGE_FUNCT: uses Python range funct., syntax: [start], stop[, step], FREE_LIST: 'Replace text 1', 'Some other text 2', 'Maybe a number 3', ...")
     help_text = "Optional, follow links from a single non-paginated or all statically paginated (RANGE_FUNCT, FREE_LIST) main pages"
     follow_pages_url_xpath = models.TextField(blank=True, help_text=help_text)
@@ -120,12 +120,12 @@
     follow_pages_page_xpath = models.TextField(blank=True, help_text=help_text)
     help_text = "Optionally limit number of pages to follow (default: follow until XPath fails)"
     num_pages_follow = models.IntegerField(blank=True, null=True, help_text=help_text)
-    last_scraper_save_alert_period = models.CharField(max_length=5, blank=True, 
+    last_scraper_save_alert_period = models.CharField(max_length=5, blank=True,
         help_text="Optional, used for scraper monitoring with 'check_last_scraper_saves' management cmd, \
         syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')")
     next_last_scraper_save_alert = models.DateTimeField(default=datetime.datetime.now,
         help_text="Next time the last scraper save will be alerted, normally set on management cmd run.",)
-    last_checker_delete_alert_period = models.CharField(max_length=5, blank=True, 
+    last_checker_delete_alert_period = models.CharField(max_length=5, blank=True,
         help_text="Optional, used for scraper monitoring with 'check_last_checker_deletes' management cmd, \
         syntax: [HOURS]h or [DAYS]d or [WEEKS]w (e.g. '6h', '5d', '2w')")
     next_last_checker_delete_alert = models.DateTimeField(default=datetime.datetime.now,
@@ -133,7 +133,7 @@
     comments = models.TextField(blank=True)
     last_scraper_save = models.DateTimeField(null=True, blank=True)
     last_checker_delete = models.DateTimeField(null=True, blank=True)
-    
+
     def get_alert_period_timedelta(self, attribute_str):
         if getattr(self, attribute_str) and len(getattr(self, attribute_str)) >= 2:
             period_str = getattr(self, attribute_str)[-1]
@@ -153,16 +153,16 @@ def get_alert_period_timedelta(self, attribute_str):
                 return None
         else:
             return None
-    
+
     def get_last_scraper_save_alert_period_timedelta(self):
         return self.get_alert_period_timedelta('last_scraper_save_alert_period')
-    
+
     def get_last_checker_delete_alert_period_timedelta(self):
         return self.get_alert_period_timedelta('last_checker_delete_alert_period')
-    
+
     def get_main_page_rpt(self):
         return self.requestpagetype_set.get(page_type='MP')
-    
+
     def get_follow_page_rpts(self):
         return self.requestpagetype_set.filter(page_type='FP')
 
@@ -177,16 +177,16 @@ def get_rpt_for_scraped_obj_attr(self, soa):
 
     def get_base_elems(self):
         return self.scraperelem_set.filter(scraped_obj_attr__attr_type='B')
-    
+
     def get_base_elem(self):
         return self.scraperelem_set.get(scraped_obj_attr__attr_type='B')
-    
+
     def get_detail_page_url_elems(self):
         return self.scraperelem_set.filter(scraped_obj_attr__attr_type='U')
 
     def get_detail_page_url_id_elems(self):
         return self.scraperelem_set.filter(scraped_obj_attr__attr_type='U', scraped_obj_attr__id_field=True)
-    
+
     def get_standard_elems(self):
         q1 = Q(scraped_obj_attr__attr_type='S')
         q2 = Q(scraped_obj_attr__attr_type='T')
@@ -204,33 +204,33 @@ def get_standard_update_elems(self):
 
     def get_standard_update_elems_from_detail_pages(self):
         return self.scraperelem_set.filter(scraped_obj_attr__attr_type='T').filter(~Q(request_page_type='MP'))
-    
+
     def get_image_elems(self):
         return self.scraperelem_set.filter(scraped_obj_attr__attr_type='I')
-    
+
     def get_image_elem(self):
         return self.scraperelem_set.get(scraped_obj_attr__attr_type='I')
-    
+
     def get_scrape_elems(self):
         q1 = Q(scraped_obj_attr__attr_type='S')
         q2 = Q(scraped_obj_attr__attr_type='T')
         q3 = Q(scraped_obj_attr__attr_type='U')
         q4 = Q(scraped_obj_attr__attr_type='I')
         return self.scraperelem_set.filter(q1 | q2 | q3 | q4)
-    
+
     def get_mandatory_scrape_elems(self):
         q1 = Q(scraped_obj_attr__attr_type='S')
         q2 = Q(scraped_obj_attr__attr_type='T')
         q3 = Q(scraped_obj_attr__attr_type='U')
         q4 = Q(scraped_obj_attr__attr_type='I')
         return self.scraperelem_set.filter(q1 | q2 | q3 | q4).filter(mandatory=True)
-    
+
    def get_from_detail_pages_scrape_elems(self):
        return self.scraperelem_set.filter(~Q(request_page_type='MP'))
-    
+
     def __str__(self):
         return self.name + " (" + self.scraped_obj_class.name + ")"
-    
+
     class Meta(object):
         ordering = ['name', 'scraped_obj_class',]
 
@@ -287,17 +287,17 @@ class Checker(models.Model):
     checker_x_path_result = models.TextField(blank=True)
     checker_ref_url = models.URLField(max_length=500, blank=True)
     comments = models.TextField(blank=True)
-    
+
     def __str__(self):
         return str(self.scraped_obj_attr) + ' > ' + self.get_checker_type_display()
-    
+
 
 @python_2_unicode_compatible
 class ScraperElem(models.Model):
     REQUEST_PAGE_TYPE_CHOICES = tuple([("MP", "Main Page")] + [("DP{n}".format(n=str(n)), "Detail Page {n}".format(n=str(n))) for n in list(range(1, 26))])
     help_text = "The different attributes to be scraped, exactly one attribute of type BASE necessary."
     scraped_obj_attr = models.ForeignKey(ScrapedObjAttr, help_text=help_text)
-    scraper = models.ForeignKey(Scraper) 
+    scraper = models.ForeignKey(Scraper)
     x_path = models.TextField(blank=True, help_text='XPath or JSONPath expression, leave blank on "static" processor use.')
     reg_exp = models.TextField(blank=True, help_text="Optional filtering by regular expression (e.g. 'Scrape only (.*) the text in between').")
     help_text = "Corresponding Request Page Types created for this scraper."
@@ -310,16 +310,16 @@ class ScraperElem(models.Model):
     proc_ctxt = models.TextField(blank=True, help_text=help_text)
     help_text = "Drop item if attribute could not be scraped."
     mandatory = models.BooleanField(default=True, help_text=help_text)
-    
+
     def __str__(self):
         return '{s} > {soa} Attribute ({rpt})'.format(
             s=str(self.scraper),
             soa=self.scraped_obj_attr.name,
             rpt=self.get_request_page_type_display())
-    
+
     class Meta(object):
         ordering = ['scraped_obj_attr__order',]
-    
+
 
 @python_2_unicode_compatible
@@ -329,13 +329,13 @@ class SchedulerRuntime(models.Model):
         ('C', 'CHECKER'),
     )
     runtime_type = models.CharField(max_length=1, choices=TYPE, default='P')
-    next_action_time = models.DateTimeField(default=datetime.datetime.now)
+    next_action_time = models.DateTimeField(default=timezone.now)
     next_action_factor = models.FloatField(blank=True, null=True)
     num_zero_actions = models.IntegerField(default=0)
-    
+
     def __str__(self):
         return str(self.id)
-    
+
     class Meta(object):
         ordering = ['next_action_time',]
 
@@ -347,7 +347,7 @@ class LogMarker(models.Model):
         ('IM', 'Important'),
         ('IG', 'Ignore'),
         ('MI', 'Miscellaneous'),
-        ('CU', 'Custom'), 
+        ('CU', 'Custom'),
     )
     message_contains = models.CharField(max_length=255)
     help_text = "Use the string format from the log messages"
@@ -374,14 +374,14 @@ class Log(models.Model):
     spider_name = models.CharField(max_length=200)
     scraper = models.ForeignKey(Scraper, blank=True, null=True)
     date = models.DateTimeField(default=datetime.datetime.now)
-    
+
     @staticmethod
     def numeric_level(level):
         numeric_level = 0
         for choice in Log.LEVEL_CHOICES:
             if choice[1] == level:
                 numeric_level = choice[0]
-        return numeric_level 
-    
+        return numeric_level
+
     class Meta(object):
         ordering = ['-date']
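
Note on PATCH 2/2: the value handed to default= must be the callable
timezone.now, not the call timezone.now(). A call is evaluated only once,
when models.py is imported, so every SchedulerRuntime row created afterwards
would reuse that stale timestamp, and makemigrations would bake the literal
datetime into the migration file. A minimal sketch of the difference
(illustrative only; the model name Example is hypothetical and assumes a
Django project with USE_TZ = True):

    from django.db import models
    from django.utils import timezone

    class Example(models.Model):
        # Wrong: timezone.now() would run once at import time, so every
        # row created later would share that single timestamp.
        # created = models.DateTimeField(default=timezone.now())

        # Right: Django stores the callable and invokes it for each new
        # object, yielding a fresh timezone-aware (UTC) datetime.
        created = models.DateTimeField(default=timezone.now)

The naive datetime.datetime.now defaults this patch leaves in place
(Log.date, next_last_scraper_save_alert, next_last_checker_delete_alert) are
also callables and thus evaluated per row, but with USE_TZ = True they
produce naive datetimes, and Django emits a "received a naive datetime while
time zone support is active" RuntimeWarning on save; switching them to
timezone.now would be the consistent follow-up.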