|
| 1 | + |
| 2 | + |
| 3 | +import json |
| 4 | +import re |
| 5 | +from .common import ( |
| 6 | + BaseChatDownloader, |
| 7 | + Chat, |
| 8 | + Remapper as r |
| 9 | +) |
| 10 | +from ..utils.core import ( |
| 11 | + time_to_seconds, |
| 12 | + regex_search, |
| 13 | + ensure_seconds |
| 14 | +) |
| 15 | +from ..errors import ( |
| 16 | + SiteError, |
| 17 | + ParsingError |
| 18 | +) |
| 19 | + |
| 20 | +# TODO add debugging options |
| 21 | +# from ..debugging import ( |
| 22 | +# log, |
| 23 | +# debug_log |
| 24 | +# ) |
| 25 | + |
| 26 | + |
class ZoomError(SiteError):
    """Site-specific error for a problem with a Zoom video."""
| 30 | + |
| 31 | + |
class ZoomChatDownloader(BaseChatDownloader):
    """Chat downloader for Zoom recordings (``.../rec/play/<id>`` pages).

    No separate chat endpoint is queried: the recording page itself embeds
    the recording metadata (``window.__data__ = {...};``) and one
    ``window.__data__.chatList.push({...})`` call per chat message. Both are
    JavaScript object literals that are located with regular expressions and
    converted to JSON by ``_parse_js_dict``.
    """

    _NAME = 'zoom.us'

    _ZOOM_HOMEPAGE = 'https://zoom.us/'
    _ZOOM_PATH_TEMPLATE = 'rec/play/{id}'

    # Recording metadata: lazily matched up to the first `};`.
    _INITIAL_INFO_REGEX = r'(?s)window\.__data__\s*=\s*({.+?});'
    # One match per chat message pushed onto the page's chat list.
    _CHAT_MESSAGES_REGEX = r'window\.__data__\.chatList\.push\((\{[\s\S]+?\})\)'

    _SITE_DEFAULT_PARAMS = {
        'format': 'default',  # TODO create zoom format
    }

    # Zoom field name -> field name used in the yielded message dicts.
    _REMAPPING = {
        'username': 'author_name',
        'time': 'time_text',
        'content': 'message',
    }

    _TESTS = [
        {
            'name': 'Get chat messages from past broadcast #1',
            'params': {
                'url': 'https://zoom.us/rec/play/6ccrIuigqG83GIaT4wSDAv59W9W5J_-s1HUe_6UPykq3V3hVN1emMucTYLEJiA87rIkEPcGptB0Dp_dH',
                'max_messages': 10
            },
            'expected_result': {
                'messages_condition': lambda messages: len(messages) > 0,
            }
        },
        {
            'name': 'Get chat messages from past broadcast #2',
            'params': {
                'url': 'https://zoom.us/rec/play/65V5deGq-Do3T9bHuASDAv4tW420f_ms1iIb-vIKzEqzUiEFNFWiYONAN-vRvNmKnlg6z95Y4mNQ9QJQ',
                'max_messages': 10
            },
            'expected_result': {
                'messages_condition': lambda messages: len(messages) > 0,
            }
        },
        {
            'name': 'Get chat messages from past broadcast #3',
            'params': {
                'url': 'https://zoom.us/rec/play/75Usc7j8rjg3E92S4gSDAf95W9S9K6-sg3dP_voImR60WiEHYVSmYrsbNwNE1_6-jwlwLx5cg1IeyjM',
                'max_messages': 10
            },
            'expected_result': {
                'messages_condition': lambda messages: len(messages) > 0,
            }
        },
        {
            'name': 'Invalid video',
            'params': {
                'url': 'https://zoom.us/rec/play/invalid',
            },
            'expected_result': {
                'error': ZoomError
            }
        },
    ]

    # Regex provided by youtube-dl. The dot in `zoom\.us` is escaped so that
    # it only matches a literal dot rather than any character.
    _VALID_URLS = {
        '_get_chat_by_video_id': r'(?P<base_url>https?://(?:[^.]+\.)?zoom\.us/)rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9_.-]+)',
    }
    _ERROR_MESSAGE_REGEX = r'<span class="error-message">\s*([^<]+?)\s*<\/span>'

    def _get_chat_by_video_id(self, match, params):
        """Handle a URL matched by ``_VALID_URLS``.

        :param match: Regex match exposing the ``id`` and ``base_url`` groups.
        :param params: Download parameters, passed through unchanged.
        :return: Whatever ``get_chat_by_video_id`` returns.
        """
        match_id = match.group('id')
        base_url = match.group('base_url')
        return self.get_chat_by_video_id(match_id, params, base_url=base_url)

    def get_chat_by_video_id(self, video_id, params, base_url=_ZOOM_HOMEPAGE):
        """Fetch a recording page and build a ``Chat`` from its embedded data.

        :param video_id: Recording identifier placed into the ``rec/play/``
            path.
        :param params: Download parameters; ``start_time`` and ``end_time``
            are read when the messages are iterated.
        :param base_url: Site root (with trailing slash) the path is appended
            to. Defaults to the Zoom homepage.
        :raises ZoomError: The page has no metadata but shows an error message.
        :raises ParsingError: The page has neither metadata nor an error
            message.
        :return: ``Chat`` wrapping the lazy message generator plus the title,
            video type, start time and id taken from the metadata.
        """
        url = base_url + self._ZOOM_PATH_TEMPLATE.format(id=video_id)
        page_data = self._session_get(url).text

        json_string = regex_search(page_data, self._INITIAL_INFO_REGEX)

        if json_string is None:
            # No metadata: prefer the message shown on the page, if any.
            error_message = regex_search(page_data, self._ERROR_MESSAGE_REGEX)
            if error_message:
                # Only the first line of the message is reported.
                raise ZoomError(error_message.split('\n')[0])
            raise ParsingError('Error parsing video')

        initial_info = self._parse_js_dict(json_string)

        video_type = 'video' if initial_info.get('isVideo') else 'not_video'

        return Chat(
            self._get_chat_messages(page_data, params),

            title=initial_info.get('topic'),
            video_type=video_type,
            start_time=initial_info.get('fileStartTime'),
            id=initial_info.get('recordingId'),
        )

    def _parse_js_dict(self, json_string):
        """Convert a JavaScript object literal into a Python object.

        The text is rewritten into JSON in three steps and then decoded:

        1. bare keys at the start of a line are wrapped in double quotes;
        2. ``\\'`` escapes are replaced by a plain ``'`` (JSON has no such
           escape);
        3. single-quoted values are re-quoted with double quotes, escaping any
           bare double quote inside the value so the result stays valid JSON.

        :param json_string: JavaScript object literal text.
        :raises json.JSONDecodeError: The rewritten text is still not valid
            JSON.
        :return: The decoded object.
        """
        # `flags` is passed by keyword: positional `count`/`flags` arguments
        # to `re.sub` are deprecated since Python 3.13.
        result = re.sub(r"^([^:\s]+):\s+", r'"\g<1>": ',
                        json_string, flags=re.MULTILINE)
        result = result.replace(r"\'", "'")

        def requote(match):
            # Walk the value one escape pair (`\x`) or one double quote at a
            # time, so only quotes that are not already escaped get escaped.
            value = re.sub(
                r'\\.|"',
                lambda m: '\\"' if m.group(0) == '"' else m.group(0),
                match.group(1))
            return ': "' + value + '"'

        result = re.sub(r":\s+'(.*)'", requote, result, flags=re.MULTILINE)
        return json.loads(result)

    def _get_chat_messages(self, page_data, params):
        """Yield the chat messages embedded in a recording page.

        Messages earlier than ``start_time`` are skipped. Iteration stops at
        the first message later than ``end_time``, so messages are assumed to
        appear on the page in chronological order.

        :param page_data: HTML text of the recording page.
        :param params: Download parameters; ``start_time`` and ``end_time``
            are passed to ``ensure_seconds`` with ``0`` and infinity as the
            second argument (presumably the fallback value; confirm in
            ``utils.core``).
        :return: Generator of message dicts, each with ``time_in_seconds``
            added.
        """
        start_time = ensure_seconds(params.get('start_time'), 0)
        end_time = ensure_seconds(params.get('end_time'), float('inf'))

        # finditer keeps the scan lazy, so stopping early skips the rest.
        for found in re.finditer(self._CHAT_MESSAGES_REGEX, page_data):
            data = self._parse_js_dict(found.group(1))
            data = r.remap_dict(data, self._REMAPPING)

            # Process time information
            data['time_in_seconds'] = time_to_seconds(data['time_text'])
            if data['time_in_seconds'] < start_time:
                continue

            if data['time_in_seconds'] > end_time:
                return

            BaseChatDownloader._move_to_dict(data, 'author')
            yield data
0 commit comments