scrape: clean up, add tests
abh3po committed Nov 29, 2016
1 parent eb68b13 commit add76e6
Showing 7 changed files with 137 additions and 48 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ dist/
build/
UIP.egg-info/
README
.cache/
3 changes: 2 additions & 1 deletion .travis.yml
@@ -6,7 +6,8 @@ python:
sudo: true

before_script:
- pip install coala
- pip install -r test-requirements.txt

script:
- coala-ci
- pytest
24 changes: 16 additions & 8 deletions README.md
@@ -83,21 +83,29 @@ commit body
Fixes <issue number>
```

UIP uses coala as a linter, to install use:
where short log is the area/filename where you make the change
commit message is the very brief description of the change made by you and any
other additional details go into the commit body.

TESTING
=======

While developing, to run the tests you should first install the test requirements by running:

```
sudo pip install coala
pip install -r test-requirements.txt
```

before you push remember to run
then test your work with the command:
```
pytest
```
If you want to lint your files, you can run
```
coala
```
and commit all the suggested changes.
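
A single test file can also be run on its own, for example the scrape tests
added in this commit:
```
pytest tests/test_scrape.py
```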

where short log is the area/filename where you make the change
commit message is the very brief description of the change made by you and any
other additional details go into the commit body.

Do remember to keep your master branch updated at all times
and always work on a different branch.

3 changes: 3 additions & 0 deletions setup.py
@@ -18,6 +18,9 @@ def get_contents(filename):
if not os.path.exists(HOME_DIR):
os.makedirs(HOME_DIR)

if not os.path.exists(DEFAULT_PICS_FOLDER):
os.makedirs(DEFAULT_PICS_FOLDER)

if not os.path.isfile(settings_file_path):
file_data = {'timeout' : 30*60,
'no-of-images': NUMBER_OF_IMAGES_TO_PARSE,
2 changes: 2 additions & 0 deletions test-requirements.txt
@@ -0,0 +1,2 @@
pytest==3.0.4
coala==0.9.0
38 changes: 38 additions & 0 deletions tests/test_scrape.py
@@ -0,0 +1,38 @@
import unittest
import tempfile
import os
from uiplib import scrape

class ScrapeTest(unittest.TestCase):

def test_get_image_links(self):
old_reddit = scrape.get_reddit_image_links
old_unsplash = scrape.get_unsplash_image_links
scrape.get_unsplash_image_links = lambda x,y : [(1,1), (2,2)]
scrape.get_reddit_image_links = lambda x,y : [(7,7), (8,8)]
self.assertEqual(scrape.get_image_links(
'www.reddit.com/r/CoolSite', 2),
[(7, 7), (8, 8)])
self.assertEqual(scrape.get_image_links(
'www.unsplash.com/new', 2),
[(1, 1), (2, 2)])
scrape.get_reddit_image_links = old_reddit
scrape.get_unsplash_image_links = old_unsplash

def test_get_images(self):
with tempfile.TemporaryDirectory() as directory:
scrape.get_image_links = lambda x,y : [('filename.png',
'https://placeholdit.imgix.net/'
'~text?txtsize=15&txt=image1&w=120&h=120')]
scrape.get_images('url', directory, 1)
self.assertEqual(os.listdir(directory), ['filename.png'])

def test_reddit_image_links(self):
scrape.make_json = lambda x : {
'data' : {'children' : [{'data' : {'preview' : {
'images' : [{'source' : {'url' :
'url.com/some_url.png?21'}
}]}}}]}}
self.assertEqual(scrape.get_reddit_image_links('url', 1),
[('some_url.png', 'url.com/some_url.png?21')])
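
These tests swap in stubs by assigning to attributes of the scrape module and restoring them by hand; in test_get_images and test_reddit_image_links the originals are never restored, so later tests still see the stubs. A minimal sketch of the same stubbing written with unittest.mock.patch.object, which undoes each patch automatically; only the function names and expected tuples come from the diff above, the class and test names are illustrative:

```
import unittest
from unittest import mock

from uiplib import scrape


class ScrapeMockTest(unittest.TestCase):

    def test_get_image_links_dispatch(self):
        # patch.object swaps the attribute for the duration of the
        # with-block and restores the original on exit, even on failure.
        with mock.patch.object(scrape, 'get_reddit_image_links',
                               return_value=[(7, 7), (8, 8)]), \
             mock.patch.object(scrape, 'get_unsplash_image_links',
                               return_value=[(1, 1), (2, 2)]):
            self.assertEqual(
                scrape.get_image_links('www.reddit.com/r/CoolSite', 2),
                [(7, 7), (8, 8)])
            self.assertEqual(
                scrape.get_image_links('www.unsplash.com/new', 2),
                [(1, 1), (2, 2)])


if __name__ == '__main__':
    unittest.main()
```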

114 changes: 75 additions & 39 deletions uiplib/scrape.py
@@ -31,53 +31,89 @@ def dlProgress(count, blockSize, totalSize):
sys.stdout.write("\r[%s%s]" % ('='*int(percent), ' '*(50-int(percent))))
sys.stdout.flush()

def get_images(url, directory, count):
def get_unsplash_image_links(url, no_of_images):
"""
returns a list of tuples, with the first element being the filename and
the second being the link of the image scraped from unsplash.
"""
soup = make_soup(url)
'''Select the desired bs4 tags; soup.select searches recursively
for classes/tags nested within other classes/tags'''
a_tags = soup.select('.y5w1y .hduMF .tPMQE a')
image_links = []
if not a_tags:
print('No matching image found')
return image_links

for a_tag in a_tags:
image_url = a_tag['href']
filename = image_url.split('/')[-2]+".jpg"
image_links.append((filename, image_url))
if(len(image_links) >= no_of_images):
break

return image_links

def get_reddit_image_links(url, no_of_images):
"""
returns a list of tuples, with the first element being the filename and
the second being the link of the image scraped from reddit.
"""
page = make_json(url)
image_links = []
children = []
try:
# structure of reddit API
children = page['data']['children']
except (IndexError, KeyError) as e:
print("You seem to be having some issues with your internet. "
"Please contact us at our github repo 'NIT-dgp/UIP' "
"if you feel it isn't the case with your internet.", e)
for child in children:
images = []
try:
images = child['data']['preview']['images']
except KeyError:
pass

for image in images:
if(len(image_links)<no_of_images):
image_url = image['source']['url']
filename = image_url.split('/')[-1]
filename = filename[: filename.find('?')]
image_links.append((filename, image_url))

return image_links

def get_image_links(url, count):
'''
scrapes the given url and returns a list of (filename, link) tuples
'''
image_links = []
no_of_images = int(count)

if 'unsplash' in url: #For Unsplash
soup = make_soup(url)
'''Selects desired bs4 tags, soup.select is a recursive function,
it searches for classes/tags within classes/tags'''
a_tags = soup.select('.y5w1y .hduMF .tPMQE a')
image_links.extend(get_unsplash_image_links(url, no_of_images))

if not a_tags:
print ('No matching image found')
return
elif 'reddit' in url: #For Reddit
image_links.extend(get_reddit_image_links(url, no_of_images))

for a_tag in a_tags:
image_links.append(a_tag['href'])
if(len(image_links) >= no_of_images):
break
return image_links

elif 'reddit' in url: #For Reddit
page = make_json(url)
try:
for sub in page['data']['children']: # structure of reddit API
for image in sub['data']['preview']['images']:
if(len(image_links)<no_of_images):
image_links.append(image['source']['url'])
except (IndexError, KeyError) as e:
print("You seem to be having some issues with your internet."
"Please contact us at our github repo 'NIT-dgp/UIP'"
"If you feel it isn't the case with your internet.")

def download_store_images(full_path, image_link):
try:
urlretrieve(image_link,
full_path,
reporthook = dlProgress)
except Exception as e:
print("Image cannot be downloaded: ",str(e))

for image in image_links:
if not os.path.exists(directory):
os.makedirs(directory)
if 'unsplash' in url: #Unsplash
filename = image.split('/')[-2]+".jpg"
elif 'reddit' in url: #Reddit
filename = image.split('/')[-1]
filename = filename[: filename.find('?')]

try:
urlretrieve(image,
os.path.join(directory,filename),
reporthook = dlProgress)
except Exception as e:
print("Image cannot be downloaded: ",str(e))
def get_images(url, directory, count):
'''
scrape images from the given url and store them in the given directory
'''
no_of_images = int(count)
image_links = get_image_links(url, no_of_images)
for image in image_links:
download_store_images(os.path.join(directory, image[0]), image[1])
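
With this refactor, link collection (get_image_links) and downloading (download_store_images) are separate steps that get_images ties together. A small usage sketch; the URLs and the target folder are illustrative rather than taken from the repository, and note that get_images no longer creates the target directory itself (setup.py now creates the default pics folder instead):

```
import os

from uiplib.scrape import get_image_links, get_images

# Collect (filename, url) pairs without downloading anything.
links = get_image_links('https://www.reddit.com/r/wallpapers/.json', 5)
print(links)

# Download the first five images into a folder; the directory must
# already exist, since get_images no longer creates it.
os.makedirs('/tmp/uip-pics', exist_ok=True)
get_images('https://unsplash.com/new', '/tmp/uip-pics', 5)
```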
