From f8bec907dde9ff0cbf3bc7584b964c61cf18194b Mon Sep 17 00:00:00 2001 From: nettoyeur Date: Mon, 3 Jun 2013 21:13:34 +0400 Subject: [PATCH 1/7] Add argument parsing and getting credentials from .netrc if not specified any other way --- edx_dl.py | 112 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 103 insertions(+), 9 deletions(-) diff --git a/edx_dl.py b/edx_dl.py index 7732f0e..ab7a7d8 100644 --- a/edx_dl.py +++ b/edx_dl.py @@ -7,6 +7,11 @@ import re import sys import os.path +import getpass +import netrc +import platform +import os +import argparse from bs4 import BeautifulSoup from math import floor from random import random @@ -52,6 +57,44 @@ def csrfCookie(csrftoken): rest={'HttpOnly': None}, rfc2109=False) +def get_netrc_creds(authenticator): + """ + Read username/password from the users' netrc file. Returns None if no + coursera credentials can be found. + """ + # inspired by https://github.com/jplehmann/coursera + # taken from https://github.com/dgorissen/coursera-dl + + if platform.system() == 'Windows': + # where could the netrc file be hiding, try a number of places + env_vars = ["HOME","HOMEDRIVE", "HOMEPATH","USERPROFILE","SYSTEMDRIVE"] + env_dirs = [os.environ[e] for e in env_vars if os.environ.get(e,None)] + + # also try the root/cur dirs + env_dirs += ["C:", ""] + + # possible filenames + file_names = [".netrc", "_netrc"] + + # all possible paths + paths = [os.path.join(dir,fn) for dir in env_dirs for fn in file_names] + else: + # on *nix just put None, and the correct default will be used + paths = [None] + + # try the paths one by one and return the first one that works + creds = None + for p in paths: + try: + auths = netrc.netrc(p).authenticators(authenticator) + creds = (auths[0], auths[2]) + print "Credentials found in .netrc file" + break + except (IOError, TypeError, netrc.NetrcParseError) as e: + pass + + return creds + class EdXBrowser(object): def __init__(self, config): self._br = mechanize.Browser() @@ -137,7 +180,7 @@ def download(self): # + sanitize_filename(chapter_name) + '/' \ # + '%02i.%02i.*' % (i,j) #fn = glob.glob(DIRECTORY + nametmpl) - nametmpl = os.path.join(DIRECTORY, + nametmpl = os.path.join(self._config.directory, sanitize_filename(course_name, replace_space_with_underscore), sanitize_filename(chapter_name, replace_space_with_underscore), '%02i.%02i.*' % (i,j)) @@ -166,7 +209,7 @@ def download(self): # + sanitize_filename(chapter_name) + '/' \ # + '%02i.%02i.%02i ' % (i,j,k) \ # + sanitize_filename('%s (%s)' % (par_name, video_type)) + '.%(ext)s' - outtmpl = os.path.join(DIRECTORY, + outtmpl = os.path.join(self._config.directory, sanitize_filename(course_name, replace_space_with_underscore), sanitize_filename(chapter_name, replace_space_with_underscore), '%02i.%02i.%02i ' % (i,j,k) + \ @@ -178,16 +221,67 @@ def download(self): pass if __name__ == '__main__': - config.interactive_mode = ('--interactive' in sys.argv) + parser = argparse.ArgumentParser(description='Make courses from EdX powered courses available offline.', add_help=True) + parser.add_argument("-u", "--username", dest='username', type=str, help='username (if omitted search in profile file, then .netrc used)') + parser.add_argument("-p", "--password", dest='password', type=str, help='user''s password') + parser.add_argument('-c', "--courses", dest="course_names", nargs="+", metavar='', type=str, help='one or more course names (e.g. TODO)') + parser.add_argument('-w', "--weeks", dest="week_numbers", nargs="+", metavar='', type=str, help='one or more weeks; -c must be present and specify only one course') + parser.add_argument('-r', "--profile", dest="profile", type=str, help='download profile ("10gen", "edx" etc...)', choices=['10gen', 'edx']) + + group = parser.add_mutually_exclusive_group() + group.add_argument('-d', "--destdir", dest="destdir", type=str, default=".", help='destination directory for downloaded content') + group.add_argument('dest_dir', nargs="?", metavar='', type=str, help='destination directory; deprecated, use --destdir option)') + + group = parser.add_mutually_exclusive_group() + group.add_argument('-i', "--interactive", dest="interactive_mode", help='run in interactive mode; cannot use with --gui', action="store_true") + group.add_argument('-g', "--gui", dest="gui_mode", help='show GUI menu to choose course(s)/week(s) for download; cannot use with --interactive', action="store_true") + + #parser.add_argument("-n", dest='ignorefiles', type=str, default="", help='comma-separated list of file extensions to skip, e.g., "ppt,srt,pdf"') + #parser.add_argument("-q", dest='parser', type=str, default=CourseraDownloader.DEFAULT_PARSER, + # help="the html parser to use, see http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser") + #parser.add_argument("-x", dest='proxy', type=str, default=None, help="proxy to use, e.g., foo.bar.com:3125") + #parser.add_argument("--reverse-sections", dest='reverse', action="store_true", + # default=False, help="download and save the sections in reverse order") + args = parser.parse_args() + print args + + import config + # search for login credentials in .netrc file if username hasn't been provided in command-line args + username, password = args.username, args.password + netrc_password = None + if not username and hasattr(config, 'EMAIL'): + username = config.EMAIL + if not username: + creds = get_netrc_creds(config.DOMAIN) + if creds: + username, netrc_password = creds + else: + #raise Exception("No username passed and no .netrc credentials found, unable to login") + pass + if not username: + username = raw_input('Enter username for %s: ' % config.DOMAIN) + if not password and hasattr(config, 'PASSWORD'): + password = config.PASSWORD + if not password: + password = netrc_password + # prompt the user for his password if not specified + if not password: + password = getpass.getpass('Enter password for %s at %s: ' % (username, config.DOMAIN)) + + config.EMAIL = username + config.PASSWORD = password - if config.interactive_mode: - sys.argv.remove('--interactive') + config.interactive_mode = args.interactive_mode + config.gui_mode = args.gui_mode - if len(sys.argv) >= 2: - DIRECTORY = sys.argv[-1].strip('"') + config.directory = None + if args.dest_dir: + print "Positional argument for destination directory is deprecated, please use --destdir or -d option" + config.directory = args.dest_dir else: - DIRECTORY = os.path.curdir - print 'Downloading to ''%s'' directory' % DIRECTORY + config.directory = args.destdir + pass + print 'Downloading to ''%s'' directory' % config.directory edxb = EdXBrowser(config) edxb.login() From be8f75d01bc84a3843a6c4c2b4f4dcaa2a8914e9 Mon Sep 17 00:00:00 2001 From: nettoyeur Date: Mon, 3 Jun 2013 23:12:10 +0400 Subject: [PATCH 2/7] Now using configuration as a dictionary in the code --- edx_dl.py | 110 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 44 deletions(-) diff --git a/edx_dl.py b/edx_dl.py index ab7a7d8..d5dea69 100644 --- a/edx_dl.py +++ b/edx_dl.py @@ -21,19 +21,6 @@ from youtube_dl.InfoExtractors import YoutubeIE from youtube_dl.utils import sanitize_filename -import config - -replace_space_with_underscore = True -base_url = 'https://'+config.DOMAIN -# Dirty hack for differences in 10gen and edX implementation -if 'edx' in config.DOMAIN.split('.'): - login_url = '/login_ajax' -else: - login_url = '/login' - -dashboard_url = '/dashboard' -youtube_url = 'http://www.youtube.com/watch?v=' - def makeCsrf(): t = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' e = 24 @@ -42,12 +29,12 @@ def makeCsrf(): csrftoken.append(t[int(floor(random()*len(t)))]) return ''.join(csrftoken) -def csrfCookie(csrftoken): +def csrfCookie(csrftoken, domain): return mechanize.Cookie(version=0, name='csrftoken', value=csrftoken, port=None, port_specified=False, - domain=config.DOMAIN, + domain=domain, domain_specified=False, domain_initial_dot=False, path='/', path_specified=True, @@ -97,22 +84,22 @@ def get_netrc_creds(authenticator): class EdXBrowser(object): def __init__(self, config): + self._config = config self._br = mechanize.Browser() self._cj = mechanize.LWPCookieJar() csrftoken = makeCsrf() - self._cj.set_cookie(csrfCookie(csrftoken)) + self._cj.set_cookie(csrfCookie(csrftoken, self._config['DOMAIN'])) self._br.set_handle_robots(False) self._br.set_cookiejar(self._cj) self._br.addheaders.append(('X-CSRFToken',csrftoken)) self._br.addheaders.append(('Referer',base_url)) self._logged_in = False - self._fd = FileDownloader(config.YDL_PARAMS) + self._fd = FileDownloader(self._config.get('YDL_PARAMS')) self._fd.add_info_extractor(YoutubeIE()) - self._config = config def login(self): try: - login_resp = self._br.open(base_url + login_url, urlencode({'email':self._config.EMAIL, 'password':self._config.PASSWORD})) + login_resp = self._br.open(base_url + login_url, urlencode({'email':self._config['EMAIL'], 'password':self._config['PASSWORD']})) login_state = json.loads(login_resp.read()) self._logged_in = login_state.get('success') if not self._logged_in: @@ -132,7 +119,7 @@ def list_courses(self): course_url = my_course.a['href'] course_name = my_course.h3.text - if self._config.interactive_mode: + if self._config['interactive_mode']: launch_download_msg = 'Download the course [%s] from %s? (y/n) ' % (course_name, course_url) launch_download = raw_input(launch_download_msg) if (launch_download.lower() == "n"): @@ -156,7 +143,7 @@ def list_chapters(self, course_i): for chapter in chapters: chapter_name = chapter.find('h3').find('a').text - if self._config.interactive_mode: + if self._config['interactive_mode']: launch_download_msg = 'Download the chapter [%s - %s]? (y/n) ' % (course_name, chapter_name) launch_download = raw_input(launch_download_msg) if (launch_download.lower() == "n"): @@ -180,7 +167,7 @@ def download(self): # + sanitize_filename(chapter_name) + '/' \ # + '%02i.%02i.*' % (i,j) #fn = glob.glob(DIRECTORY + nametmpl) - nametmpl = os.path.join(self._config.directory, + nametmpl = os.path.join(self._config['directory'], sanitize_filename(course_name, replace_space_with_underscore), sanitize_filename(chapter_name, replace_space_with_underscore), '%02i.%02i.*' % (i,j)) @@ -209,7 +196,7 @@ def download(self): # + sanitize_filename(chapter_name) + '/' \ # + '%02i.%02i.%02i ' % (i,j,k) \ # + sanitize_filename('%s (%s)' % (par_name, video_type)) + '.%(ext)s' - outtmpl = os.path.join(self._config.directory, + outtmpl = os.path.join(self._config['directory'], sanitize_filename(course_name, replace_space_with_underscore), sanitize_filename(chapter_name, replace_space_with_underscore), '%02i.%02i.%02i ' % (i,j,k) + \ @@ -220,13 +207,51 @@ def download(self): #print "Error: %s" % e pass +replace_space_with_underscore = True +youtube_url = 'http://www.youtube.com/watch?v=' + +base_url = None +login_url = '/login' +dashboard_url = '/dashboard' + +def setup_urls(config): + global base_url, login_url, dashboard_url + domain = config['DOMAIN'] + base_url = 'https://' + domain + # Dirty hack for differences in 10gen and edX implementation + if 'edx' in domain.split('.'): + login_url = '/login_ajax' + else: + login_url = '/login' + dashboard_url = '/dashboard' + +def read_config(profile): + """if profile: + #exec "import %s" % args.profile + cfg_parser = ConfigParser.ConfigParser() + if profile not in cfg_parser.sections(): + raise Exception("Profile '%s' is not defined" % profile) + else: + """ + import config + cfg_dict = {} + if hasattr(config, 'DOMAIN'): + cfg_dict['DOMAIN'] = config.DOMAIN + if hasattr(config, 'EMAIL'): + cfg_dict['EMAIL'] = config.EMAIL + if hasattr(config, 'PASSWORD'): + cfg_dict['PASSWORD'] = config.PASSWORD + if hasattr(config, 'YDL_PARAMS'): + cfg_dict['YDL_PARAMS'] = config.YDL_PARAMS + return cfg_dict + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Make courses from EdX powered courses available offline.', add_help=True) parser.add_argument("-u", "--username", dest='username', type=str, help='username (if omitted search in profile file, then .netrc used)') parser.add_argument("-p", "--password", dest='password', type=str, help='user''s password') parser.add_argument('-c', "--courses", dest="course_names", nargs="+", metavar='', type=str, help='one or more course names (e.g. TODO)') parser.add_argument('-w', "--weeks", dest="week_numbers", nargs="+", metavar='', type=str, help='one or more weeks; -c must be present and specify only one course') - parser.add_argument('-r', "--profile", dest="profile", type=str, help='download profile ("10gen", "edx" etc...)', choices=['10gen', 'edx']) + parser.add_argument('-r', "--profile", dest="profile", type=str, help='download profile ("10gen", "edx" etc...)') group = parser.add_mutually_exclusive_group() group.add_argument('-d', "--destdir", dest="destdir", type=str, default=".", help='destination directory for downloaded content') @@ -236,53 +261,50 @@ def download(self): group.add_argument('-i', "--interactive", dest="interactive_mode", help='run in interactive mode; cannot use with --gui', action="store_true") group.add_argument('-g', "--gui", dest="gui_mode", help='show GUI menu to choose course(s)/week(s) for download; cannot use with --interactive', action="store_true") - #parser.add_argument("-n", dest='ignorefiles', type=str, default="", help='comma-separated list of file extensions to skip, e.g., "ppt,srt,pdf"') #parser.add_argument("-q", dest='parser', type=str, default=CourseraDownloader.DEFAULT_PARSER, # help="the html parser to use, see http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser") #parser.add_argument("-x", dest='proxy', type=str, default=None, help="proxy to use, e.g., foo.bar.com:3125") - #parser.add_argument("--reverse-sections", dest='reverse', action="store_true", - # default=False, help="download and save the sections in reverse order") args = parser.parse_args() - print args + #print args - import config + config = read_config(args.profile) # search for login credentials in .netrc file if username hasn't been provided in command-line args username, password = args.username, args.password netrc_password = None - if not username and hasattr(config, 'EMAIL'): - username = config.EMAIL if not username: - creds = get_netrc_creds(config.DOMAIN) + username = config.get('EMAIL', None) + if not username: + creds = get_netrc_creds(config['DOMAIN']) if creds: username, netrc_password = creds else: #raise Exception("No username passed and no .netrc credentials found, unable to login") pass if not username: - username = raw_input('Enter username for %s: ' % config.DOMAIN) - if not password and hasattr(config, 'PASSWORD'): - password = config.PASSWORD + username = raw_input('Enter username for %s: ' % config['DOMAIN']) + if not password: + password = config.get('PASSWORD', None) if not password: password = netrc_password # prompt the user for his password if not specified if not password: - password = getpass.getpass('Enter password for %s at %s: ' % (username, config.DOMAIN)) + password = getpass.getpass('Enter password for %s at %s: ' % (username, config['DOMAIN'])) - config.EMAIL = username - config.PASSWORD = password + config['EMAIL'] = username + config['PASSWORD'] = password - config.interactive_mode = args.interactive_mode - config.gui_mode = args.gui_mode + config['interactive_mode'] = args.interactive_mode + config['gui_mode'] = args.gui_mode - config.directory = None if args.dest_dir: print "Positional argument for destination directory is deprecated, please use --destdir or -d option" - config.directory = args.dest_dir + config['directory'] = args.dest_dir else: - config.directory = args.destdir + config['directory'] = args.destdir pass - print 'Downloading to ''%s'' directory' % config.directory + print 'Downloading to ''%s'' directory' % config['directory'] + setup_urls(config) edxb = EdXBrowser(config) edxb.login() print 'Found the following courses:' From 4dcf22dd495c0b7b33c57d4c6e826ea86b48e7f5 Mon Sep 17 00:00:00 2001 From: nettoyeur Date: Mon, 3 Jun 2013 23:31:51 +0400 Subject: [PATCH 3/7] Fix file download --- edx_dl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/edx_dl.py b/edx_dl.py index d5dea69..5b28af6 100644 --- a/edx_dl.py +++ b/edx_dl.py @@ -200,7 +200,7 @@ def download(self): sanitize_filename(course_name, replace_space_with_underscore), sanitize_filename(chapter_name, replace_space_with_underscore), '%02i.%02i.%02i ' % (i,j,k) + \ - sanitize_filename('%s (%s)' % (par_name, video_type)) + '.%(ext)s', replace_space_with_underscore) + sanitize_filename('%s (%s)' % (par_name, video_type), replace_space_with_underscore) + '.%(ext)s') self._fd.params['outtmpl'] = outtmpl self._fd.download([video_url]) except Exception as e: From 350bf404cf7ac34daf1571dc9888aa201a456f5b Mon Sep 17 00:00:00 2001 From: nettoyeur Date: Tue, 4 Jun 2013 16:12:40 +0400 Subject: [PATCH 4/7] Support for '--courses' option. --- edx_dl.py | 174 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 111 insertions(+), 63 deletions(-) diff --git a/edx_dl.py b/edx_dl.py index 5b28af6..f306a38 100644 --- a/edx_dl.py +++ b/edx_dl.py @@ -18,30 +18,32 @@ from urllib import urlencode from youtube_dl.FileDownloader import FileDownloader -from youtube_dl.InfoExtractors import YoutubeIE +from youtube_dl.InfoExtractors import YoutubeIE from youtube_dl.utils import sanitize_filename + def makeCsrf(): t = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' e = 24 csrftoken = list() - for i in range(0,e): - csrftoken.append(t[int(floor(random()*len(t)))]) + for i in range(0, e): + csrftoken.append(t[int(floor(random() * len(t)))]) return ''.join(csrftoken) + def csrfCookie(csrftoken, domain): return mechanize.Cookie(version=0, - name='csrftoken', - value=csrftoken, - port=None, port_specified=False, - domain=domain, - domain_specified=False, - domain_initial_dot=False, - path='/', path_specified=True, - secure=False, expires=None, - discard=True, - comment=None, comment_url=None, - rest={'HttpOnly': None}, rfc2109=False) + name='csrftoken', + value=csrftoken, + port=None, port_specified=False, + domain=domain, + domain_specified=False, + domain_initial_dot=False, + path='/', path_specified=True, + secure=False, expires=None, + discard=True, + comment=None, comment_url=None, + rest={'HttpOnly': None}, rfc2109=False) def get_netrc_creds(authenticator): @@ -54,8 +56,8 @@ def get_netrc_creds(authenticator): if platform.system() == 'Windows': # where could the netrc file be hiding, try a number of places - env_vars = ["HOME","HOMEDRIVE", "HOMEPATH","USERPROFILE","SYSTEMDRIVE"] - env_dirs = [os.environ[e] for e in env_vars if os.environ.get(e,None)] + env_vars = ["HOME", "HOMEDRIVE", "HOMEPATH", "USERPROFILE", "SYSTEMDRIVE"] + env_dirs = [os.environ[e] for e in env_vars if os.environ.get(e, None)] # also try the root/cur dirs env_dirs += ["C:", ""] @@ -64,7 +66,7 @@ def get_netrc_creds(authenticator): file_names = [".netrc", "_netrc"] # all possible paths - paths = [os.path.join(dir,fn) for dir in env_dirs for fn in file_names] + paths = [os.path.join(d, fn) for d in env_dirs for fn in file_names] else: # on *nix just put None, and the correct default will be used paths = [None] @@ -82,6 +84,7 @@ def get_netrc_creds(authenticator): return creds + class EdXBrowser(object): def __init__(self, config): self._config = config @@ -91,15 +94,16 @@ def __init__(self, config): self._cj.set_cookie(csrfCookie(csrftoken, self._config['DOMAIN'])) self._br.set_handle_robots(False) self._br.set_cookiejar(self._cj) - self._br.addheaders.append(('X-CSRFToken',csrftoken)) - self._br.addheaders.append(('Referer',base_url)) + self._br.addheaders.append(('X-CSRFToken', csrftoken)) + self._br.addheaders.append(('Referer', base_url)) self._logged_in = False self._fd = FileDownloader(self._config.get('YDL_PARAMS')) self._fd.add_info_extractor(YoutubeIE()) def login(self): try: - login_resp = self._br.open(base_url + login_url, urlencode({'email':self._config['EMAIL'], 'password':self._config['PASSWORD']})) + login_resp = self._br.open(base_url + login_url, urlencode( + {'email': self._config['EMAIL'], 'password': self._config['PASSWORD']})) login_state = json.loads(login_resp.read()) self._logged_in = login_state.get('success') if not self._logged_in: @@ -114,32 +118,53 @@ def list_courses(self): dashboard = self._br.open(base_url + dashboard_url) dashboard_soup = BeautifulSoup(dashboard.read()) my_courses = dashboard_soup.findAll('article', 'my-course') - i = 0 for my_course in my_courses: course_url = my_course.a['href'] course_name = my_course.h3.text - - if self._config['interactive_mode']: - launch_download_msg = 'Download the course [%s] from %s? (y/n) ' % (course_name, course_url) - launch_download = raw_input(launch_download_msg) - if (launch_download.lower() == "n"): - continue + courseware_url = re.sub(r'\/info$', '/courseware', course_url) + self.courses.append({'name': course_name, 'url': courseware_url}) + + def print_courses(self): + for i in range(len(self.courses)): + print '[%02i] %s' % (i, self.courses[i]['name']) + + def filter_interactive_courses(self): + if self._config['interactive_mode']: + confirmed = [] + for course in self.courses: + course_name = course['name'] + course_url = course['url'] + launch_download_msg = 'Download the course [%s] from %s? (y/n) ' % (course_name, course_url) + launch_download = raw_input(launch_download_msg) + if launch_download.lower() == "n": + continue + confirmed.append(course) + self.courses = confirmed + pass + + def filter_cmd_line_courses(self): + if self._config['course_names']: + selected = self._config['course_names'] - i += 1 - courseware_url = re.sub(r'\/info$','/courseware',course_url) - self.courses.append({'name':course_name, 'url':courseware_url}) - print '[%02i] %s' % (i, course_name) + def fltr(course): + for s in selected: + if course['url'].find('/{}/'.format(s)) >= 0: + return True + return False + + self.courses = filter(fltr, self.courses) + pass def list_chapters(self, course_i): self.paragraphs = [] - if course_i < len(self.courses) and course_i >= 0: + if len(self.courses) > course_i >= 0: print "Getting chapters..." course = self.courses[course_i] course_name = course['name'] - courseware = self._br.open(base_url+course['url']) + courseware = self._br.open(base_url + course['url']) courseware_soup = BeautifulSoup(courseware.read()) - chapters = courseware_soup.findAll('div','chapter') - i = 0 + chapters = courseware_soup.findAll('div', 'chapter') + chapter_index = 0 for chapter in chapters: chapter_name = chapter.find('h3').find('a').text @@ -148,45 +173,42 @@ def list_chapters(self, course_i): launch_download = raw_input(launch_download_msg) if (launch_download.lower() == "n"): continue - - i += 1 - print '\t[%02i] %s' % (i, chapter_name) + + chapter_index += 1 + print '\t[%02i] %s' % (chapter_index, chapter_name) paragraphs = chapter.find('ul').findAll('li') - j = 0 + paragraph_index = 0 for paragraph in paragraphs: - j += 1 + paragraph_index += 1 par_name = paragraph.p.text par_url = paragraph.a['href'] - self.paragraphs.append((course_name, i, j, chapter_name, par_name, par_url)) - print '\t\t[%02i.%02i] %s' % (i, j, par_name) + self.paragraphs.append( + (course_name, chapter_index, paragraph_index, chapter_name, par_name, par_url)) + print '\t\t[%02i.%02i] %s' % (chapter_index, paragraph_index, par_name) def download(self): print "\n-----------------------\nStart downloading\n-----------------------\n" for (course_name, i, j, chapter_name, par_name, url) in self.paragraphs: - #nametmpl = sanitize_filename(course_name) + '/' \ - # + sanitize_filename(chapter_name) + '/' \ - # + '%02i.%02i.*' % (i,j) - #fn = glob.glob(DIRECTORY + nametmpl) nametmpl = os.path.join(self._config['directory'], sanitize_filename(course_name, replace_space_with_underscore), sanitize_filename(chapter_name, replace_space_with_underscore), - '%02i.%02i.*' % (i,j)) + '%02i.%02i.*' % (i, j)) fn = glob.glob(nametmpl) - + if fn: print "Processing of %s skipped" % nametmpl continue print "Processing %s..." % nametmpl par = self._br.open(base_url + url) par_soup = BeautifulSoup(par.read()) - contents = par_soup.findAll('div','seq_contents') + contents = par_soup.findAll('div', 'seq_contents') k = 0 for content in contents: #print "Content: %s" % content content_soup = BeautifulSoup(content.text) try: video_type = content_soup.h2.text.strip() - video_stream = content_soup.find('div','video')['data-streams'] + video_stream = content_soup.find('div', 'video')['data-streams'] video_id = video_stream.split(':')[1] video_url = youtube_url + video_id k += 1 @@ -197,16 +219,18 @@ def download(self): # + '%02i.%02i.%02i ' % (i,j,k) \ # + sanitize_filename('%s (%s)' % (par_name, video_type)) + '.%(ext)s' outtmpl = os.path.join(self._config['directory'], - sanitize_filename(course_name, replace_space_with_underscore), - sanitize_filename(chapter_name, replace_space_with_underscore), - '%02i.%02i.%02i ' % (i,j,k) + \ - sanitize_filename('%s (%s)' % (par_name, video_type), replace_space_with_underscore) + '.%(ext)s') + sanitize_filename(course_name, replace_space_with_underscore), + sanitize_filename(chapter_name, replace_space_with_underscore), + '%02i.%02i.%02i ' % (i, j, k) + \ + sanitize_filename('%s (%s)' % (par_name, video_type), + replace_space_with_underscore) + '.%(ext)s') self._fd.params['outtmpl'] = outtmpl self._fd.download([video_url]) except Exception as e: #print "Error: %s" % e pass + replace_space_with_underscore = True youtube_url = 'http://www.youtube.com/watch?v=' @@ -214,6 +238,7 @@ def download(self): login_url = '/login' dashboard_url = '/dashboard' + def setup_urls(config): global base_url, login_url, dashboard_url domain = config['DOMAIN'] @@ -225,6 +250,7 @@ def setup_urls(config): login_url = '/login' dashboard_url = '/dashboard' + def read_config(profile): """if profile: #exec "import %s" % args.profile @@ -234,6 +260,7 @@ def read_config(profile): else: """ import config + cfg_dict = {} if hasattr(config, 'DOMAIN'): cfg_dict['DOMAIN'] = config.DOMAIN @@ -245,21 +272,31 @@ def read_config(profile): cfg_dict['YDL_PARAMS'] = config.YDL_PARAMS return cfg_dict + if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Make courses from EdX powered courses available offline.', add_help=True) - parser.add_argument("-u", "--username", dest='username', type=str, help='username (if omitted search in profile file, then .netrc used)') + parser = argparse.ArgumentParser(description='Make courses from EdX powered courses available offline.', + add_help=True) + parser.add_argument("-u", "--username", dest='username', type=str, + help='username (if omitted search in profile file, then .netrc used)') parser.add_argument("-p", "--password", dest='password', type=str, help='user''s password') - parser.add_argument('-c', "--courses", dest="course_names", nargs="+", metavar='', type=str, help='one or more course names (e.g. TODO)') - parser.add_argument('-w', "--weeks", dest="week_numbers", nargs="+", metavar='', type=str, help='one or more weeks; -c must be present and specify only one course') + parser.add_argument('-c', "--courses", dest="course_names", nargs="+", metavar='', type=str, + help='one or more course names (better use course id in the url e.g. "M101" for 10gen or "CS188.1x" for EdX )') + parser.add_argument('-w', "--weeks", dest="week_numbers", nargs="+", metavar='', type=str, + help='one or more weeks; -c must be present and specify only one course') parser.add_argument('-r', "--profile", dest="profile", type=str, help='download profile ("10gen", "edx" etc...)') group = parser.add_mutually_exclusive_group() - group.add_argument('-d', "--destdir", dest="destdir", type=str, default=".", help='destination directory for downloaded content') - group.add_argument('dest_dir', nargs="?", metavar='', type=str, help='destination directory; deprecated, use --destdir option)') + group.add_argument('-d', "--destdir", dest="destdir", type=str, default=".", + help='destination directory for downloaded content') + group.add_argument('dest_dir', nargs="?", metavar='', type=str, + help='destination directory; deprecated, use --destdir option)') group = parser.add_mutually_exclusive_group() - group.add_argument('-i', "--interactive", dest="interactive_mode", help='run in interactive mode; cannot use with --gui', action="store_true") - group.add_argument('-g', "--gui", dest="gui_mode", help='show GUI menu to choose course(s)/week(s) for download; cannot use with --interactive', action="store_true") + group.add_argument('-i', "--interactive", dest="interactive_mode", + help='run in interactive mode; cannot use with --gui', action="store_true") + group.add_argument('-g', "--gui", dest="gui_mode", + help='show GUI menu to choose course(s)/week(s) for download; cannot use with --interactive', + action="store_true") #parser.add_argument("-q", dest='parser', type=str, default=CourseraDownloader.DEFAULT_PARSER, # help="the html parser to use, see http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser") @@ -296,6 +333,13 @@ def read_config(profile): config['interactive_mode'] = args.interactive_mode config['gui_mode'] = args.gui_mode + if args.week_numbers and args.course_names and len(args.course_names) > 1: + raise Exception("You must specify only one course if you use -w option") + config['course_names'] = args.course_names + config['week_numbers'] = args.week_numbers + if args.week_numbers: + print "-w or --weeks are not supported yet, ignored" + if args.dest_dir: print "Positional argument for destination directory is deprecated, please use --destdir or -d option" config['directory'] = args.dest_dir @@ -307,8 +351,12 @@ def read_config(profile): setup_urls(config) edxb = EdXBrowser(config) edxb.login() - print 'Found the following courses:' edxb.list_courses() + edxb.filter_cmd_line_courses() + edxb.filter_interactive_courses() + print 'Found the following courses:' + edxb.print_courses() + if edxb.courses: print "Processing..." else: From c3a03b8ce4c440e0ca57511bad54b70a49143b2a Mon Sep 17 00:00:00 2001 From: nettoyeur Date: Tue, 4 Jun 2013 17:43:36 +0400 Subject: [PATCH 5/7] updated readme --- README.md | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index f37572e..4febda6 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,25 @@ Optionally set another options in `config.py` like `writesubtitles` to enable su ### Usage: -+ `python edx_dl.py` -+ `python edx_dl.py c:\Users\MyUser\Lectures\` -+ `python edx_dl.py --interactive c:\Users\MyUser\Lectures\` +``` +usage: edx_dl.py [-h] [-u USERNAME] [-p PASSWORD] + [-c [ ...]] + [-d DESTDIR] [-i] + +Make courses from EdX powered courses available offline. + +optional arguments: + -h, --help show this help message and exit + -u USERNAME, --username USERNAME + username (if omitted search in profile file, then + .netrc used) + -p PASSWORD, --password PASSWORD + users password + -c [ ...], --courses [ ...] + one or more course names (better use course id in the + url e.g. "M101" for 10gen or "CS188.1x" for EdX ) + -d DESTDIR, --destdir DESTDIR + destination directory for downloaded content + -i, --interactive run in interactive mode + +``` From dbb6ffb09d982153b53a62bb174c1c351b4886ff Mon Sep 17 00:00:00 2001 From: nettoyeur Date: Tue, 4 Jun 2013 20:28:02 +0400 Subject: [PATCH 6/7] Fix chapter numbering while -i specified (that forced to download chapter files again with different number in name) --- edx_dl.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/edx_dl.py b/edx_dl.py index f306a38..70c5454 100644 --- a/edx_dl.py +++ b/edx_dl.py @@ -118,11 +118,13 @@ def list_courses(self): dashboard = self._br.open(base_url + dashboard_url) dashboard_soup = BeautifulSoup(dashboard.read()) my_courses = dashboard_soup.findAll('article', 'my-course') + course_index = 0 for my_course in my_courses: + course_index += 1 course_url = my_course.a['href'] course_name = my_course.h3.text courseware_url = re.sub(r'\/info$', '/courseware', course_url) - self.courses.append({'name': course_name, 'url': courseware_url}) + self.courses.append({'name': course_name, 'url': courseware_url, 'index': course_index}) def print_courses(self): for i in range(len(self.courses)): @@ -167,6 +169,7 @@ def list_chapters(self, course_i): chapter_index = 0 for chapter in chapters: chapter_name = chapter.find('h3').find('a').text + chapter_index += 1 if self._config['interactive_mode']: launch_download_msg = 'Download the chapter [%s - %s]? (y/n) ' % (course_name, chapter_name) @@ -174,7 +177,6 @@ def list_chapters(self, course_i): if (launch_download.lower() == "n"): continue - chapter_index += 1 print '\t[%02i] %s' % (chapter_index, chapter_name) paragraphs = chapter.find('ul').findAll('li') paragraph_index = 0 @@ -213,11 +215,6 @@ def download(self): video_url = youtube_url + video_id k += 1 print '[%02i.%02i.%02i] %s (%s)' % (i, j, k, par_name, video_type) - #f.writelines(video_url+'\n') - #outtmpl = DIRECTORY + sanitize_filename(course_name) + '/' \ - # + sanitize_filename(chapter_name) + '/' \ - # + '%02i.%02i.%02i ' % (i,j,k) \ - # + sanitize_filename('%s (%s)' % (par_name, video_type)) + '.%(ext)s' outtmpl = os.path.join(self._config['directory'], sanitize_filename(course_name, replace_space_with_underscore), sanitize_filename(chapter_name, replace_space_with_underscore), From 4b568b3ed980327e6dd99e8a0c4cd3204562198f Mon Sep 17 00:00:00 2001 From: nettoyeur Date: Thu, 13 Jun 2013 00:52:45 +0400 Subject: [PATCH 7/7] Print message that --gui option is ignored if specified --- edx_dl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/edx_dl.py b/edx_dl.py index 70c5454..7d3f822 100644 --- a/edx_dl.py +++ b/edx_dl.py @@ -329,6 +329,8 @@ def read_config(profile): config['interactive_mode'] = args.interactive_mode config['gui_mode'] = args.gui_mode + if args.gui_mode: + print "-g or --gui are not supported yet, ignored" if args.week_numbers and args.course_names and len(args.course_names) > 1: raise Exception("You must specify only one course if you use -w option")