|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 18, |
| 6 | + "id": "e222d8b3-85e8-46d6-aa36-e9f5b7227262", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [], |
| 9 | + "source": [ |
| 10 | + "import yaml\n", |
| 11 | + "\n", |
| 12 | + "def parse_metadata(content):  # YAML text in, parsed Python object out (safe loader only)\n", |
| 13 | + "    return yaml.load(content, Loader=yaml.SafeLoader)\n", |
| 14 | + "\n", |
| 15 | + "def parse_frontmatter(content):  # -> (front matter mapping, stripped body); ({}, content) when no valid front matter\n", |
| 16 | + "    lines = content.splitlines(keepends=True)\n", |
| 17 | + "    # Only a whole '---' line is a fence; a '---' inside a YAML value must not split the file\n", |
| 18 | + "    if not lines or lines[0].rstrip() != '---':\n", |
| 19 | + "        return {}, content\n", |
| 20 | + "\n", |
| 21 | + "    # Index of the closing fence line, or None when the block is never closed\n", |
| 22 | + "    closing = next((i for i, line in enumerate(lines[1:], 1) if line.rstrip() == '---'), None)\n", |
| 23 | + "    if closing is None:\n", |
| 24 | + "        return {}, content\n", |
| 25 | + "\n", |
| 26 | + "    try:\n", |
| 27 | + "        frontmatter = yaml.safe_load(''.join(lines[1:closing]))\n", |
| 28 | + "    except yaml.YAMLError:\n", |
| 29 | + "        return {}, content  # malformed YAML: hand back the input untouched\n", |
| 30 | + "    return frontmatter or {}, ''.join(lines[closing + 1:]).strip()\n" |
| 31 | + ] |
| 32 | + }, |
| 33 | + { |
| 34 | + "cell_type": "code", |
| 35 | + "execution_count": 19, |
| 36 | + "id": "8d2cb777-17ee-4cfe-98ce-72a0c545a748", |
| 37 | + "metadata": {}, |
| 38 | + "outputs": [], |
| 39 | + "source": [ |
| 40 | + "from pathlib import Path" |
| 41 | + ] |
| 42 | + }, |
| 43 | + { |
| 44 | + "cell_type": "code", |
| 45 | + "execution_count": 20, |
| 46 | + "id": "6785181d-ae77-45ed-84d4-34d575de181f", |
| 47 | + "metadata": {}, |
| 48 | + "outputs": [], |
| 49 | + "source": [ |
| 50 | + "questions_root = Path('../_questions/')" |
| 51 | + ] |
| 52 | + }, |
| 53 | + { |
| 54 | + "cell_type": "code", |
| 55 | + "execution_count": 24, |
| 56 | + "id": "666aa62b-d3c8-40d9-9062-7b9b344543f4", |
| 57 | + "metadata": {}, |
| 58 | + "outputs": [ |
| 59 | + { |
| 60 | + "data": { |
| 61 | + "text/plain": [ |
| 62 | + "'data-engineering-zoomcamp'" |
| 63 | + ] |
| 64 | + }, |
| 65 | + "execution_count": 24, |
| 66 | + "metadata": {}, |
| 67 | + "output_type": "execute_result" |
| 68 | + } |
| 69 | + ], |
| 70 | + "source": [] |
| 71 | + }, |
| 72 | + { |
| 73 | + "cell_type": "code", |
| 74 | + "execution_count": 25, |
| 75 | + "id": "255c50dd-1c01-4ceb-8295-5d0b2c1930ba", |
| 76 | + "metadata": {}, |
| 77 | + "outputs": [], |
| 78 | + "source": [ |
| 79 | + "course_sections = {}  # course directory name -> {section id: section name}\n", |
| 80 | + "\n", |
| 81 | + "for metadata_file in questions_root.glob('*/_metadata.yaml'):\n", |
| 82 | + "    course_id = metadata_file.parent.name  # the directory holding _metadata.yaml names the course\n", |
| 83 | + "    content = metadata_file.read_text(encoding='utf8')\n", |
| 84 | + "    metadata = parse_metadata(content)\n", |
| 85 | + "\n", |
| 86 | + "    # Index every entry under 'sections' by its 'id', keeping only its 'name'\n", |
| 87 | + "    sections = dict((d['id'], d['name']) for d in metadata['sections'])\n", |
| 88 | + "    course_sections[course_id] = sections" |
| 89 | + ] |
| 90 | + }, |
| 91 | + { |
| 92 | + "cell_type": "code", |
| 93 | + "execution_count": 36, |
| 94 | + "id": "f5b26175-77d1-43bb-9c7c-0a58e2cafe52", |
| 95 | + "metadata": { |
| 96 | + "scrolled": true |
| 97 | + }, |
| 98 | + "outputs": [], |
| 99 | + "source": [ |
| 100 | + "question_file = next(questions_root.glob('**/*.md'))  # one sample question file; NOTE(review): glob pattern assumed to match the documents loop - confirm" |
| 101 | + ] |
| 102 | + }, |
| 103 | + { |
| 104 | + "cell_type": "code", |
| 105 | + "execution_count": 39, |
| 106 | + "id": "f9cf0220-8447-4c54-ad08-697894297ae6", |
| 107 | + "metadata": {}, |
| 108 | + "outputs": [], |
| 109 | + "source": [] |
| 110 | + }, |
| 111 | + { |
| 112 | + "cell_type": "code", |
| 113 | + "execution_count": 43, |
| 114 | + "id": "9166b545-1203-4541-95e0-32cffed32ae8", |
| 115 | + "metadata": {}, |
| 116 | + "outputs": [], |
| 117 | + "source": [ |
| 118 | + "documents = []  # one dict per question file, appended in glob order\n", |
| 119 | + "\n", |
| 120 | + "for question_file in questions_root.glob('**/*.md'):\n", |
| 121 | + "    content = question_file.read_text(encoding='utf8')\n", |
| 122 | + "    fm, answer = parse_frontmatter(content)  # front matter mapping, markdown body\n", |
| 123 | + "\n", |
| 124 | + "    section_dir = question_file.parent  # directory containing the question file\n", |
| 125 | + "    section_id = section_dir.name\n", |
| 126 | + "    course_dir = section_dir.parent  # one level above the section directory\n", |
| 127 | + "    course_id = course_dir.name\n", |
| 128 | + "\n", |
| 129 | + "    section_name = course_sections[course_id].get(section_id, section_id)  # fall back to the raw id\n", |
| 130 | + "    document = {\n", |
| 131 | + "        'course': course_id,\n", |
| 132 | + "        'section': section_name,\n", |
| 133 | + "        'section_id': section_id,\n", |
| 134 | + "        'question': fm['question'],\n", |
| 135 | + "        'answer': answer,\n", |
| 136 | + "        'document_id': fm['id']\n", |
| 137 | + "    }\n", |
| 138 | + "\n", |
| 139 | + "    documents.append(document)" |
| 140 | + ] |
| 141 | + }, |
| 142 | + { |
| 143 | + "cell_type": "code", |
| 144 | + "execution_count": 44, |
| 145 | + "id": "14b7f275-ce0e-48a2-b237-1977107b3989", |
| 146 | + "metadata": {}, |
| 147 | + "outputs": [ |
| 148 | + { |
| 149 | + "data": { |
| 150 | + "text/plain": [ |
| 151 | + "1177" |
| 152 | + ] |
| 153 | + }, |
| 154 | + "execution_count": 44, |
| 155 | + "metadata": {}, |
| 156 | + "output_type": "execute_result" |
| 157 | + } |
| 158 | + ], |
| 159 | + "source": [ |
| 160 | + "len(documents)" |
| 161 | + ] |
| 162 | + }, |
| 163 | + { |
| 164 | + "cell_type": "code", |
| 165 | + "execution_count": 46, |
| 166 | + "id": "75f230d4-2c10-4e83-a624-08a2a70584d7", |
| 167 | + "metadata": {}, |
| 168 | + "outputs": [ |
| 169 | + { |
| 170 | + "data": { |
| 171 | + "text/plain": [ |
| 172 | + "{'course': 'mlops-zoomcamp',\n", |
| 173 | + " 'section': 'General Course-Related Questions',\n", |
| 174 | + " 'section_id': 'general',\n", |
| 175 | + " 'question': 'Is it going to be live? When?',\n", |
| 176 | + " 'answer': 'The course videos are pre-recorded, and you can start watching the course right now.\\n\\nThe zoomcamps are spread out throughout the year. See the article [Guide to Free Online Courses at DataTalks Club](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\\n\\nWe will also occasionally have office hours—live sessions where we will answer your questions. The office hours sessions are recorded too.\\n\\nYou can see the office hours (playlist with year 20xx) as well as the pre-recorded course videos in the Course Channel’s Bookmarks and/or [DTC’s YouTube channel](https://www.youtube.com/@DataTalksClub/playlists).',\n", |
| 177 | + " 'document_id': '7f10dde6f8'}" |
| 178 | + ] |
| 179 | + }, |
| 180 | + "execution_count": 46, |
| 181 | + "metadata": {}, |
| 182 | + "output_type": "execute_result" |
| 183 | + } |
| 184 | + ], |
| 185 | + "source": [ |
| 186 | + "documents[1]" |
| 187 | + ] |
| 188 | + }, |
| 189 | + { |
| 190 | + "cell_type": "code", |
| 191 | + "execution_count": null, |
| 192 | + "id": "06803853-967f-4210-8718-c9cc55ed605d", |
| 193 | + "metadata": {}, |
| 194 | + "outputs": [], |
| 195 | + "source": [] |
| 196 | + } |
| 197 | + ], |
| 198 | + "metadata": { |
| 199 | + "kernelspec": { |
| 200 | + "display_name": "Python 3 (ipykernel)", |
| 201 | + "language": "python", |
| 202 | + "name": "python3" |
| 203 | + }, |
| 204 | + "language_info": { |
| 205 | + "codemirror_mode": { |
| 206 | + "name": "ipython", |
| 207 | + "version": 3 |
| 208 | + }, |
| 209 | + "file_extension": ".py", |
| 210 | + "mimetype": "text/x-python", |
| 211 | + "name": "python", |
| 212 | + "nbconvert_exporter": "python", |
| 213 | + "pygments_lexer": "ipython3", |
| 214 | + "version": "3.13.5" |
| 215 | + } |
| 216 | + }, |
| 217 | + "nbformat": 4, |
| 218 | + "nbformat_minor": 5 |
| 219 | +} |
0 commit comments