scrape: clean up, add tests
abh3po committed Nov 29, 2016
1 parent eb68b13 commit add76e6
Showing 7 changed files with 137 additions and 48 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ dist/
build/
UIP.egg-info/
README
.cache/
3 changes: 2 additions & 1 deletion .travis.yml
@@ -6,7 +6,8 @@ python:
sudo: true

before_script:
- pip install coala
- pip install -r test-requirements.txt

script:
- coala-ci
- pytest
24 changes: 16 additions & 8 deletions README.md
@@ -83,21 +83,29 @@ commit body
Fixes <issue number>
```

UIP uses coala as a linter, to install use:
where short log is the area/filename where you make the change
commit message is the very brief description of the change made by you and any
other additional details go into the commit body.

TESTING
=======

While developing, to run the tests you should first install the test requirements by running:

```
sudo pip install coala
pip install -r test-requirements.txt
```

before you push remember to run
then test your work with the command:
```
pytest
```
If you want to lint your files, you can run
```
coala
```
and commit all the suggested changes.
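
A single test file can also be run on its own, for example the scrape tests
added in this commit:
```
pytest tests/test_scrape.py
```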

where short log is the area/filename where you make the change
commit message is the very brief description of the change made by you and any
other additional details go into the commit body.

Do remember to keep your master branch updated at all times
and always work on a different branch.

3 changes: 3 additions & 0 deletions setup.py
@@ -18,6 +18,9 @@ def get_contents(filename):
if not os.path.exists(HOME_DIR):
os.makedirs(HOME_DIR)

if not os.path.exists(DEFAULT_PICS_FOLDER):
os.makedirs(DEFAULT_PICS_FOLDER)

if not os.path.isfile(settings_file_path):
file_data = {'timeout' : 30*60,
'no-of-images': NUMBER_OF_IMAGES_TO_PARSE,
2 changes: 2 additions & 0 deletions test-requirements.txt
@@ -0,0 +1,2 @@
pytest==3.0.4
coala==0.9.0
38 changes: 38 additions & 0 deletions tests/test_scrape.py
@@ -0,0 +1,38 @@
import unittest
import tempfile
import os
from uiplib import scrape

class ScrapeTest(unittest.TestCase):

def test_get_image_links(self):
old_reddit = scrape.get_reddit_image_links
old_unsplash = scrape.get_unsplash_image_links
scrape.get_unsplash_image_links = lambda x,y : [(1,1), (2,2)]
scrape.get_reddit_image_links = lambda x,y : [(7,7), (8,8)]
self.assertEqual(scrape.get_image_links(
'www.reddit.com/r/CoolSite', 2),
[(7, 7), (8, 8)])
self.assertEqual(scrape.get_image_links(
'www.unsplash.com/new', 2),
[(1, 1), (2, 2)])
scrape.get_reddit_image_links = old_reddit
scrape.get_unsplash_image_links = old_unsplash

def test_get_images(self):
with tempfile.TemporaryDirectory() as directory:
scrape.get_image_links = lambda x,y : [('filename.png',
'https://placeholdit.imgix.net/'
'~text?txtsize=15&txt=image1&w=120&h=120')]
scrape.get_images('url', directory, 1)
self.assertEqual(os.listdir(directory), ['filename.png'])

def test_reddit_image_links(self):
scrape.make_json = lambda x : {
'data' : {'children' : [{'data' : {'preview' : {
'images' : [{'source' : {'url' :
'url.com/some_url.png?21'}
}]}}}]}}
self.assertEqual(scrape.get_reddit_image_links('url', 1),
[('some_url.png', 'url.com/some_url.png?21')])
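
These tests swap in stubs by assigning to attributes of the scrape module and restoring them by hand; in test_get_images and test_reddit_image_links the originals are never restored, so later tests still see the stubs. A minimal sketch of the same stubbing written with unittest.mock.patch.object, which undoes each patch automatically; only the function names and expected tuples come from the diff above, the class and test names are illustrative:

```
import unittest
from unittest import mock

from uiplib import scrape


class ScrapeMockTest(unittest.TestCase):

    def test_get_image_links_dispatch(self):
        # patch.object swaps the attribute for the duration of the
        # with-block and restores the original on exit, even on failure.
        with mock.patch.object(scrape, 'get_reddit_image_links',
                               return_value=[(7, 7), (8, 8)]), \
             mock.patch.object(scrape, 'get_unsplash_image_links',
                               return_value=[(1, 1), (2, 2)]):
            self.assertEqual(
                scrape.get_image_links('www.reddit.com/r/CoolSite', 2),
                [(7, 7), (8, 8)])
            self.assertEqual(
                scrape.get_image_links('www.unsplash.com/new', 2),
                [(1, 1), (2, 2)])


if __name__ == '__main__':
    unittest.main()
```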

114 changes: 75 additions & 39 deletions uiplib/scrape.py
@@ -31,53 +31,89 @@ def dlProgress(count, blockSize, totalSize):
sys.stdout.write("\r[%s%s]" % ('='*int(percent), ' '*(50-int(percent))))
sys.stdout.flush()

def get_images(url, directory, count):
def get_unsplash_image_links(url, no_of_images):
"""
returns a list of tuples, with the first element being the filename and
the second being the link of the image scraped from unsplash.
"""
soup = make_soup(url)
'''Select the desired bs4 tags; soup.select searches recursively
for classes/tags nested within other classes/tags'''
a_tags = soup.select('.y5w1y .hduMF .tPMQE a')
image_links = []
if not a_tags:
print('No matching image found')
return image_links

for a_tag in a_tags:
image_url = a_tag['href']
filename = image_url.split('/')[-2]+".jpg"
image_links.append((filename, image_url))
if(len(image_links) >= no_of_images):
break

return image_links

def get_reddit_image_links(url, no_of_images):
"""
returns a list of tuples, with the first element being the filename and
the second being the link of the image scraped from reddit.
"""
page = make_json(url)
image_links = []
children = []
try:
# structure of reddit API
children = page['data']['children']
except (IndexError, KeyError) as e:
print("You seem to be having some issues with your internet. "
"Please contact us at our github repo 'NIT-dgp/UIP' "
"if you feel it isn't the case with your internet.", e)
for child in children:
images = []
try:
images = child['data']['preview']['images']
except KeyError:
pass

for image in images:
if(len(image_links)<no_of_images):
image_url = image['source']['url']
filename = image_url.split('/')[-1]
filename = filename[: filename.find('?')]
image_links.append((filename, image_url))

return image_links

def get_image_links(url, count):
'''
scrapes the given url and returns a list of (filename, link) tuples
'''
image_links = []
no_of_images = int(count)

if 'unsplash' in url: #For Unsplash
soup = make_soup(url)
'''Selects desired bs4 tags, soup.select is a recursive function,
it searches for classes/tags within classes/tags'''
a_tags = soup.select('.y5w1y .hduMF .tPMQE a')
image_links.extend(get_unsplash_image_links(url, no_of_images))

if not a_tags:
print ('No matching image found')
return
elif 'reddit' in url: #For Reddit
image_links.extend(get_reddit_image_links(url, no_of_images))

for a_tag in a_tags:
image_links.append(a_tag['href'])
if(len(image_links) >= no_of_images):
break
return image_links

elif 'reddit' in url: #For Reddit
page = make_json(url)
try:
for sub in page['data']['children']: # structure of reddit API
for image in sub['data']['preview']['images']:
if(len(image_links)<no_of_images):
image_links.append(image['source']['url'])
except (IndexError, KeyError) as e:
print("You seem to be having some issues with your internet."
"Please contact us at our github repo 'NIT-dgp/UIP'"
"If you feel it isn't the case with your internet.")

def download_store_images(full_path, image_link):
try:
urlretrieve(image_link,
full_path,
reporthook = dlProgress)
except Exception as e:
print("Image cannot be downloaded: ",str(e))

for image in image_links:
if not os.path.exists(directory):
os.makedirs(directory)
if 'unsplash' in url: #Unsplash
filename = image.split('/')[-2]+".jpg"
elif 'reddit' in url: #Reddit
filename = image.split('/')[-1]
filename = filename[: filename.find('?')]

try:
urlretrieve(image,
os.path.join(directory,filename),
reporthook = dlProgress)
except Exception as e:
print("Image cannot be downloaded: ",str(e))
def get_images(url, directory, count):
'''
scrape images from the given url and store them in the given directory
'''
no_of_images = int(count)
image_links = get_image_links(url, no_of_images)
for image in image_links:
download_store_images(os.path.join(directory, image[0]), image[1])
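
With this refactor, link collection (get_image_links) and downloading (download_store_images) are separate steps that get_images ties together. A small usage sketch; the URLs and the target folder are illustrative rather than taken from the repository, and note that get_images no longer creates the target directory itself (setup.py now creates the default pics folder instead):

```
import os

from uiplib.scrape import get_image_links, get_images

# Collect (filename, url) pairs without downloading anything.
links = get_image_links('https://www.reddit.com/r/wallpapers/.json', 5)
print(links)

# Download the first five images into a folder; the directory must
# already exist, since get_images no longer creates it.
os.makedirs('/tmp/uip-pics', exist_ok=True)
get_images('https://unsplash.com/new', '/tmp/uip-pics', 5)
```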
