Commit f1036a8

feat: Add the necessary structure for the feature pipelines and MongoDB SQL
1 parent c4d7849 commit f1036a8

File tree

16 files changed: +154 -228 lines changed

configs/compute_rag_vector_index.yaml
Lines changed: 3 additions & 1 deletion

@@ -1 +1,3 @@
-parameters:
+parameters:
+  limit: 100
+  extract_collection_name: raw_data
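
(Not part of this diff: these YAML files follow ZenML's run-configuration format, in which the `parameters:` block is forwarded as keyword arguments to the pipeline function. A minimal runner sketch, assuming the standard with_options(config_path=...) wiring — the actual entrypoint script is not shown in this commit:)

# hypothetical runner script, not included in this commit
from pipelines.compute_rag_vector_index import compute_rag_vector_index

if __name__ == "__main__":
    # ZenML reads the `parameters:` block from the YAML and passes it to the
    # pipeline, i.e. limit=100, extract_collection_name="raw_data".
    compute_rag_vector_index.with_options(
        config_path="configs/compute_rag_vector_index.yaml"
    )()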

configs/etl.yaml
Lines changed: 3 additions & 1 deletion

@@ -1 +1,3 @@
-parameters:
+parameters:
+  data_directory: data/
+  load_collection_name: raw_data

configs/generate_dataset.yaml
Lines changed: 3 additions & 1 deletion

@@ -1 +1,3 @@
-parameters:
+parameters:
+  limit: 100
+  extract_collection_name: raw_data

pipelines/compute_rag_vector_index.py
Lines changed: 2 additions & 10 deletions

@@ -1,18 +1,10 @@
 from zenml import pipeline

-from second_brain.config import settings
 from steps.infrastructure import (
     fetch_from_mongodb,
 )


 @pipeline
-def compute_rag_vector_index() -> None:
-    fetch_documents_config = {
-        "mongodb_uri": settings.MONGODB_OFFLINE_URI,
-        "database_name": settings.MONGODB_OFFLINE_DATABASE,
-        "collection_name": settings.MONGODB_OFFLINE_COLLECTION,
-        "limit": 100,
-    }
-
-    fetch_from_mongodb(**fetch_documents_config)
+def compute_rag_vector_index(limit: int = 100, extract_collection_name: str = "raw_data") -> None:
+    fetch_from_mongodb(limit=limit, collection_name=extract_collection_name)
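
(The new call site no longer passes mongodb_uri or database_name, which implies fetch_from_mongodb now resolves both from settings internally. A hypothetical step sketch consistent with the new call — the real step lives in steps/infrastructure and is not part of this diff:)

from pymongo import MongoClient
from zenml import step

from second_brain.config import settings


@step
def fetch_from_mongodb(collection_name: str, limit: int) -> list[dict]:
    # Hypothetical: the URI and database name come from settings rather than
    # being threaded through every pipeline as before.
    client = MongoClient(settings.MONGODB_URI)
    collection = client[settings.MONGODB_DATABASE_NAME][collection_name]
    return list(collection.find().limit(limit))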

pipelines/etl.py
Lines changed: 5 additions & 18 deletions

@@ -1,27 +1,14 @@
 from zenml import pipeline

-from second_brain.config import settings
+from steps.etl import crawl, read_pages_from_disk
 from steps.infrastructure import (
-    fetch_from_mongodb,
     ingest_to_mongodb,
 )


 @pipeline
-def etl() -> None:
-    ingest_json_config = {
-        "mongodb_uri": settings.MONGODB_OFFLINE_URI,
-        "database_name": settings.MONGODB_OFFLINE_DATABASE,
-        "collection_name": settings.MONGODB_OFFLINE_COLLECTION,
-        "data_directory": settings.DATA_DIRECTORY,
-    }
-    fetch_documents_config = {
-        "mongodb_uri": settings.MONGODB_OFFLINE_URI,
-        "database_name": settings.MONGODB_OFFLINE_DATABASE,
-        "collection_name": settings.MONGODB_OFFLINE_COLLECTION,
-        "limit": 100,
-    }
+def etl(data_directory: str, load_collection_name: str) -> None:
+    pages = read_pages_from_disk(data_directory=data_directory)
+    documents = crawl(pages=pages)
+    ingest_to_mongodb(documents=documents, collection_name=load_collection_name)

-    ingest_to_mongodb(**ingest_json_config)
-
-    fetch_from_mongodb(**fetch_documents_config)
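
(The ETL pipeline now composes three steps. read_pages_from_disk and crawl are new and their definitions are not shown in this diff; hypothetical signatures inferred from the call sites above:)

from pathlib import Path

from zenml import step

from second_brain.entities.page import Page


@step
def read_pages_from_disk(data_directory: str) -> list[Page]:
    # Load every page dump saved under the data directory, using the new
    # Page.from_file() added in this commit (the file layout is an assumption).
    return [Page.from_file(path) for path in Path(data_directory).glob("*.json")]


@step
def crawl(pages: list[Page]) -> list[dict]:
    # Placeholder body: the real step presumably also crawls each page's
    # linked URLs before producing documents ready for ingestion.
    return [page.model_dump() for page in pages]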

pipelines/generate_dataset.py
Lines changed: 2 additions & 10 deletions

@@ -1,18 +1,10 @@
 from zenml import pipeline

-from second_brain.config import settings
 from steps.infrastructure import (
     fetch_from_mongodb,
 )


 @pipeline
-def generate_dataset() -> None:
-    fetch_documents_config = {
-        "mongodb_uri": settings.MONGODB_OFFLINE_URI,
-        "database_name": settings.MONGODB_OFFLINE_DATABASE,
-        "collection_name": settings.MONGODB_OFFLINE_COLLECTION,
-        "limit": 100,
-    }
-
-    fetch_from_mongodb(**fetch_documents_config)
+def generate_dataset(limit: int = 100, extract_collection_name: str = "raw_data") -> None:
+    fetch_from_mongodb(limit=limit, collection_name=extract_collection_name)
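
(Both parameters carry defaults, so configs/generate_dataset.yaml merely restates them; the pipeline can equally be invoked directly:)

from pipelines.generate_dataset import generate_dataset

# Equivalent to running with configs/generate_dataset.yaml:
generate_dataset()  # limit=100, extract_collection_name="raw_data"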

src/second_brain/config.py
Lines changed: 14 additions & 60 deletions

@@ -33,14 +33,6 @@ class Settings(BaseSettings):
     COMET_API_KEY: Optional[str] = None  # API key for CometML integration.
     COMET_PROJECT_NAME: str = "twin"  # CometML project name for tracking experiments.

-    # --- Default Genre ---
-    DEFAULT_GENRE: str = Field("Western", description="Default genre for querying.")
-
-    # --- Docker and Network Configuration ---
-    DOCKER_NETWORK_NAME: str = Field(
-        "zenml-network", description="Docker network for the application."
-    )
-
     # --- Enable Flags ---
     ENABLE_OFFLINE_MODE: bool = Field(
         True, description="Flag to enable offline mode (disables online ingestion)."
@@ -56,32 +48,17 @@ class Settings(BaseSettings):
     # --- Hugging Face Configuration ---
     HUGGINGFACE_ACCESS_TOKEN: Optional[str] = None  # Token for Hugging Face API.

-    # --- Local Data File Path ---
-    DATA_DIRECTORY: str = Field(
-        "./data",
-        description="Path to the local JSON file for offline processing.",
-    )
-
-    # --- MongoDB Atlas Local Configuration ---
-    MONGODB_OFFLINE_COLLECTION: str = (
-        "offline_documents"  # Name of the collection in the offline database.
-    )
-    MONGODB_OFFLINE_DATABASE: str = "rag_pipeline"  # Name of the offline database.
+    # --- MongoDB Atlas Configuration ---
+    MONGODB_DATABASE_NAME: str = "second_brain"
     MONGODB_OFFLINE_URI: str = Field(
         default_factory=lambda: os.getenv(
             "MONGODB_OFFLINE_URI", "mongodb://127.0.0.1:27017"
         ),
-        description="Connection URI for local MongoDB Atlas instance.",
-    )
-
-    # --- MongoDB Atlas Cloud Configuration ---
-    MONGODB_ONLINE_COLLECTION: str = (
-        "movies"  # Name of the collection in the online database.
+        description="Connection URI for the local MongoDB Atlas instance.",
     )
-    MONGODB_ONLINE_DATABASE: str = "sample_mflix"  # Name of the online database.
     MONGODB_ONLINE_URI: str | None = Field(
         default=None,
-        description="Connection URI for cloud MongoDB Atlas instance.",
+        description="Connection URI for the Cloud MongoDB Atlas instance.",
     )

     # --- Notion API Configuration ---
@@ -93,9 +70,8 @@ class Settings(BaseSettings):

     # --- Docker Runtime ---
     IS_RUNNING_IN_DOCKER: bool = Field(
-        default_factory=lambda: os.getenv("IS_RUNNING_IN_DOCKER", "false").lower()
-        in ["true", "1"],
         description="Flag to indicate if the application is running inside a Docker container.",
+        default=False,
     )

     def __init__(self, **kwargs):
@@ -104,45 +80,23 @@ def __init__(self, **kwargs):
         """
         super().__init__(**kwargs)

-        # Adjust MongoDB URI based on runtime conditions, but respect .env
-        if os.getenv("IS_RUNNING_IN_DOCKER", "false").lower() == "true":
-            self.MONGODB_OFFLINE_URI = os.getenv(
-                "MONGODB_OFFLINE_URI", "mongodb://mongodb-atlas-local:27017"
-            )
-
     @property
     def MONGODB_URI(self) -> str:
         """
         Returns the appropriate MongoDB URI based on ENABLE_OFFLINE_MODE.
         """

-        return (
-            self.MONGODB_OFFLINE_URI
-            if self.ENABLE_OFFLINE_MODE
-            else self.MONGODB_ONLINE_URI
-        )
+        if self.IS_RUNNING_IN_DOCKER is True:
+            self.MONGODB_OFFLINE_URI = "mongodb://mongodb-atlas-local:27017"

-    @property
-    def DATABASE_NAME(self) -> str:
-        """
-        Returns the appropriate database name based on ENABLE_OFFLINE_MODE.
-        """
-        return (
-            self.MONGODB_OFFLINE_DATABASE
-            if self.ENABLE_OFFLINE_MODE
-            else self.MONGODB_ONLINE_DATABASE
-        )
+        if self.ENABLE_OFFLINE_MODE is True:
+            return self.MONGODB_OFFLINE_URI

-    @property
-    def COLLECTION_NAME(self) -> str:
-        """
-        Returns the appropriate collection name based on ENABLE_OFFLINE_MODE.
-        """
-        return (
-            self.MONGODB_OFFLINE_COLLECTION
-            if self.ENABLE_OFFLINE_MODE
-            else self.MONGODB_ONLINE_COLLECTION
-        )
+        assert (
+            self.MONGODB_ONLINE_URI is not None
+        ), "MONGODB_ONLINE_URI is not set, while ENABLE_OFFLINE_MODE is False."
+
+        return self.MONGODB_ONLINE_URI

     @property
     def OPENAI_MAX_TOKEN_WINDOW(self) -> int:
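
(A quick sketch of how the consolidated MONGODB_URI property now resolves, assuming the Settings class above is instantiated directly:)

from second_brain.config import Settings

settings = Settings(ENABLE_OFFLINE_MODE=True)
print(settings.MONGODB_URI)
# -> "mongodb://127.0.0.1:27017" locally, or
#    "mongodb://mongodb-atlas-local:27017" when IS_RUNNING_IN_DOCKER is set

# With offline mode disabled, a missing online URI now fails fast:
settings = Settings(ENABLE_OFFLINE_MODE=False)  # MONGODB_ONLINE_URI stays None
settings.MONGODB_URI  # AssertionError: "MONGODB_ONLINE_URI is not set, ..."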

src/second_brain/entities/page.py
Lines changed: 24 additions & 5 deletions

@@ -14,10 +14,29 @@ class PageMetadata(BaseModel):


 class Page(BaseModel):
-    page_metadata: PageMetadata
+    metadata: PageMetadata
     content: str
     urls: list[str]

+    @classmethod
+    def from_file(cls, file_path: Path) -> "Page":
+        """Read a Page object from a JSON file.
+
+        Args:
+            file_path: Path to the JSON file containing page data.
+
+        Returns:
+            Page: A new Page instance constructed from the file data.
+
+        Raises:
+            FileNotFoundError: If the specified file doesn't exist.
+            ValidationError: If the JSON data doesn't match the expected model structure.
+        """
+
+        json_data = file_path.read_text(encoding="utf-8")
+
+        return cls.model_validate_json(json_data)
+
     def write(
         self, file_path: Path, obfuscate: bool = False, also_save_as_txt: bool = False
     ) -> None:
@@ -44,18 +63,18 @@ def write(
     def _obfuscate_data(self, data: dict) -> dict:
         """Obfuscate sensitive IDs in the page data."""

-        original_id = data["page_metadata"]["id"]
+        original_id = data["metadata"]["id"]
         fake_id = self._generate_random_hex(32)

         obfuscated_data = data.copy()

         # Obfuscate the page ID (32-char hex)
-        obfuscated_data["page_metadata"]["id"] = fake_id
+        obfuscated_data["metadata"]["id"] = fake_id

         # Obfuscate UUID in URL if present
-        url = data["page_metadata"]["url"]
+        url = data["metadata"]["url"]
         flattened_original_id = original_id.replace("-", "")
-        obfuscated_data["page_metadata"]["url"] = url.replace(
+        obfuscated_data["metadata"]["url"] = url.replace(
             flattened_original_id, fake_id
         )