Skip to content

Commit c45e06c

Browse files
rag
1 parent 379d107 commit c45e06c

File tree

4 files changed

+413
-3
lines changed

4 files changed

+413
-3
lines changed

_questions/mlops-zoomcamp/general/0010_fde155ddfb_course-i-forgot-if-i-registered-can-i-still-join-t.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
---
22
id: fde155ddfb
3-
question: 'Course: I forgot if I registered, can I still join the zoomcampation of
4-
this course or that for each module?'
3+
question: 'I forgot if I registered, can I still join the zoomcamp?'
54
sort_order: 10
65
---
76

_questions/mlops-zoomcamp/general/0020_7f10dde6f8_is-it-going-to-be-live-when.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@ The course videos are pre-recorded, and you can start watching the course right
88

99
The zoomcamps are spread out throughout the year. See the article [Guide to Free Online Courses at DataTalks Club](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).
1010

11-
We will also occasionally have office hourslive sessions where we will answer your questions. The office hours sessions are recorded too.
11+
We will also occasionally have office hours—live sessions where we will answer your questions. The office hours sessions are recorded too.
1212

1313
You can see the office hours (playlist with year 20xx) as well as the pre-recorded course videos in the Course Channel’s Bookmarks and/or [DTC’s YouTube channel](https://www.youtube.com/@DataTalksClub/playlists).

notebooks/rag.ipynb

Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 18,
6+
"id": "e222d8b3-85e8-46d6-aa36-e9f5b7227262",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"import yaml\n",
11+
"\n",
12+
"def parse_metadata(content):\n",
13+
" return yaml.safe_load(content)\n",
14+
"\n",
15+
"def parse_frontmatter(content):\n",
16+
" if not content.startswith('---'):\n",
17+
" return {}, content\n",
18+
"\n",
19+
" try:\n",
20+
" # Split frontmatter and content\n",
21+
" parts = content.split('---', 2)\n",
22+
" if len(parts) < 3:\n",
23+
" return {}, content\n",
24+
" \n",
25+
" frontmatter = yaml.safe_load(parts[1])\n",
26+
" markdown_content = parts[2].strip()\n",
27+
" \n",
28+
" return frontmatter or {}, markdown_content\n",
29+
" except yaml.YAMLError:\n",
30+
" return {}, content\n"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 19,
36+
"id": "8d2cb777-17ee-4cfe-98ce-72a0c545a748",
37+
"metadata": {},
38+
"outputs": [],
39+
"source": [
40+
"from pathlib import Path"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": 20,
46+
"id": "6785181d-ae77-45ed-84d4-34d575de181f",
47+
"metadata": {},
48+
"outputs": [],
49+
"source": [
50+
"questions_root = Path('../_questions/')"
51+
]
52+
},
53+
{
54+
"cell_type": "code",
55+
"execution_count": 24,
56+
"id": "666aa62b-d3c8-40d9-9062-7b9b344543f4",
57+
"metadata": {},
58+
"outputs": [
59+
{
60+
"data": {
61+
"text/plain": [
62+
"'data-engineering-zoomcamp'"
63+
]
64+
},
65+
"execution_count": 24,
66+
"metadata": {},
67+
"output_type": "execute_result"
68+
}
69+
],
70+
"source": []
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": 25,
75+
"id": "255c50dd-1c01-4ceb-8295-5d0b2c1930ba",
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"course_sections = {}\n",
80+
"\n",
81+
"for metadata_file in questions_root.glob('*/_metadata.yaml'):\n",
82+
" content = metadata_file.read_text(encoding='utf8')\n",
83+
" metadata = parse_metadata(content)\n",
84+
"\n",
85+
" course_id = metadata_file.parent.name\n",
86+
" sections = {d['id']: d['name'] for d in metadata['sections']}\n",
87+
"\n",
88+
" course_sections[course_id] = sections"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": 36,
94+
"id": "f5b26175-77d1-43bb-9c7c-0a58e2cafe52",
95+
"metadata": {
96+
"scrolled": true
97+
},
98+
"outputs": [],
99+
"source": [
100+
"question_file = list(questions_root.glob('**/*.md'))[0]  # peek at one question file; list()[0] on an empty list raised IndexError"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": 39,
106+
"id": "f9cf0220-8447-4c54-ad08-697894297ae6",
107+
"metadata": {},
108+
"outputs": [],
109+
"source": []
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": 43,
114+
"id": "9166b545-1203-4541-95e0-32cffed32ae8",
115+
"metadata": {},
116+
"outputs": [],
117+
"source": [
118+
"documents = []\n",
119+
"\n",
120+
"for question_file in questions_root.glob('**/*.md'):\n",
121+
" content = question_file.read_text(encoding='utf8')\n",
122+
" fm, answer = parse_frontmatter(content)\n",
123+
"\n",
124+
" section_dir = question_file.parent\n",
125+
" section_id = section_dir.name\n",
126+
"    course_dir = section_dir.parent  # course directory is the parent of the section directory (module_dir was undefined)\n",
127+
" course_id = course_dir.name\n",
128+
"\n",
129+
" section_name = course_sections[course_id].get(section_id, section_id)\n",
130+
" document = {\n",
131+
" 'course': course_id,\n",
132+
" 'section': section_name,\n",
133+
" 'section_id': section_id,\n",
134+
" 'question': fm['question'],\n",
135+
" 'answer': answer,\n",
136+
" 'document_id': fm['id']\n",
137+
" }\n",
138+
"\n",
139+
" documents.append(document)"
140+
]
141+
},
142+
{
143+
"cell_type": "code",
144+
"execution_count": 44,
145+
"id": "14b7f275-ce0e-48a2-b237-1977107b3989",
146+
"metadata": {},
147+
"outputs": [
148+
{
149+
"data": {
150+
"text/plain": [
151+
"1177"
152+
]
153+
},
154+
"execution_count": 44,
155+
"metadata": {},
156+
"output_type": "execute_result"
157+
}
158+
],
159+
"source": [
160+
"len(documents)"
161+
]
162+
},
163+
{
164+
"cell_type": "code",
165+
"execution_count": 46,
166+
"id": "75f230d4-2c10-4e83-a624-08a2a70584d7",
167+
"metadata": {},
168+
"outputs": [
169+
{
170+
"data": {
171+
"text/plain": [
172+
"{'course': 'mlops-zoomcamp',\n",
173+
" 'section': 'General Course-Related Questions',\n",
174+
" 'section_id': 'general',\n",
175+
" 'question': 'Is it going to be live? When?',\n",
176+
" 'answer': 'The course videos are pre-recorded, and you can start watching the course right now.\\n\\nThe zoomcamps are spread out throughout the year. See the article [Guide to Free Online Courses at DataTalks Club](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\\n\\nWe will also occasionally have office hours—live sessions where we will answer your questions. The office hours sessions are recorded too.\\n\\nYou can see the office hours (playlist with year 20xx) as well as the pre-recorded course videos in the Course Channel’s Bookmarks and/or [DTC’s YouTube channel](https://www.youtube.com/@DataTalksClub/playlists).',\n",
177+
" 'document_id': '7f10dde6f8'}"
178+
]
179+
},
180+
"execution_count": 46,
181+
"metadata": {},
182+
"output_type": "execute_result"
183+
}
184+
],
185+
"source": [
186+
"documents[1]"
187+
]
188+
},
189+
{
190+
"cell_type": "code",
191+
"execution_count": null,
192+
"id": "06803853-967f-4210-8718-c9cc55ed605d",
193+
"metadata": {},
194+
"outputs": [],
195+
"source": []
196+
}
197+
],
198+
"metadata": {
199+
"kernelspec": {
200+
"display_name": "Python 3 (ipykernel)",
201+
"language": "python",
202+
"name": "python3"
203+
},
204+
"language_info": {
205+
"codemirror_mode": {
206+
"name": "ipython",
207+
"version": 3
208+
},
209+
"file_extension": ".py",
210+
"mimetype": "text/x-python",
211+
"name": "python",
212+
"nbconvert_exporter": "python",
213+
"pygments_lexer": "ipython3",
214+
"version": "3.13.5"
215+
}
216+
},
217+
"nbformat": 4,
218+
"nbformat_minor": 5
219+
}

0 commit comments

Comments
 (0)