Skip to content

Commit 411f215

Browse files
committed
Split files
1 parent 14e70ee commit 411f215

File tree

3 files changed

+202
-184
lines changed

3 files changed

+202
-184
lines changed

vector_store/knowledge_source.py

Lines changed: 20 additions & 180 deletions
Original file line numberDiff line numberDiff line change
@@ -1,179 +1,5 @@
1-
# import logging
2-
# import os
3-
4-
# import pyigloo
5-
6-
# from constants import SOURCE_RESPOSITORY_PATH
7-
8-
# logger = logging.getLogger(__name__)
9-
10-
11-
# class Igloo:
12-
# """Class for connecting to igloo."""
13-
14-
# def __init__(self, endpoint: str):
15-
# """Initialize."""
16-
# self.endpoint: str = endpoint
17-
# # TODO: Raise an error if any of these are None
18-
# self.api_user: str = os.environ.get("IGLOO_USER", None)
19-
# self.api_pass: str = os.environ.get("IGLOO_PASS", None)
20-
# self.api_key: str = os.environ.get("IGLOO_API_KEY", None)
21-
# self.access_key: str = os.environ.get("IGLOO_ACCESS_KEY", None)
22-
23-
# info = {
24-
# "ACCESS_KEY": self.access_key,
25-
# "API_KEY": self.api_key,
26-
# "API_USER": self.api_user,
27-
# "API_PASSWORD": self.api_pass,
28-
# "API_ENDPOINT": self.endpoint,
29-
# }
30-
# self.session = pyigloo.igloo(info=info)
31-
32-
# def get_object(self, object_id: str):
33-
# """Get a single object."""
34-
# result = self.session.objects_view(objectid=object_id)
35-
# return result
36-
37-
# def get_children_from_parent(
38-
# self,
39-
# parent_path: str | None = None,
40-
# parent_object_id: str | None = None,
41-
# recursive: bool = False,
42-
# ):
43-
# """Get all children from a parent url path."""
44-
# # Get the parent object id
45-
# if parent_path is None and parent_object_id is None:
46-
# raise ValueError("Must set one of 'parent_path' or 'parent_object_id'")
47-
# if parent_path is not None:
48-
# logger.info(f"Fetching objects under path {parent_path}")
49-
# response = self.session.objects_bypath(path=parent_path)
50-
# if response is None:
51-
# raise ValueError(
52-
# f"Parent path {parent_path} does not exist. Please check the path and try again."
53-
# )
54-
# parent_object_id = response["id"]
55-
56-
# # Get all the children
57-
# all_children = []
58-
# for child in self.session.get_all_children_from_object(
59-
# parent_object_id, pagesize=100
60-
# ):
61-
# children = [child]
62-
# if recursive:
63-
# try:
64-
# child_object_id = child["id"]
65-
# childs_children = self.get_children_from_parent(
66-
# parent_object_id=child_object_id, recursive=True
67-
# )
68-
# except TypeError:
69-
# continue
70-
# children.extend(childs_children)
71-
# all_children.extend(children)
72-
73-
# return all_children
74-
75-
# def get_document_binary(self, document_id: str) -> bytes:
76-
# """Get the contents of a document."""
77-
# # Send a request to the /documents/document_id/view_binary endpoint to get file contents
78-
# endpoint = self.session.endpoint
79-
# api_root = self.session.IGLOO_API_ROOT_V1
80-
# url = "{0}{1}/documents/{2}/view_binary".format(endpoint, api_root, document_id)
81-
# headers = {b"Accept": "application/json"}
82-
# response = self.session.igloo.get(url=url, headers=headers)
83-
# return response.content
84-
85-
# def get_attachments(self, object_id: str):
86-
# """Get all attachments on an object."""
87-
# # Get page metadata
88-
# page = self.get_object(object_id=object_id)
89-
# # List the attachments
90-
# page_attachments = self.session.attachments_view(objectid=object_id)
91-
# items = page_attachments.get("items", [])
92-
# # Get information about each attachment
93-
# attachments = []
94-
# for item in items:
95-
# document_id = item["ToId"]
96-
# document_metadata = self.session.objects_view(document_id)
97-
# document_binary = self.get_document_binary(document_id=document_id)
98-
# attachment = document_metadata | {
99-
# "contentBinary": document_binary,
100-
# "attachedToHref": page["href"],
101-
# }
102-
# attachments.append(attachment)
103-
# return attachments
104-
105-
106-
# def fetchall(
107-
# url_fragment: str,
108-
# recursive: bool = False,
109-
# attachments: bool = True,
110-
# metadata: dict = {},
111-
# **kwargs,
112-
# ):
113-
# """
114-
# Fetch pages from the Source.
115-
116-
# Args:
117-
# ----
118-
# url_fragment (str): URL fragment to pull all children from.
119-
# For example, to pull all pages under https://source.redhat.com/departments/operations/travel,
120-
# set url_fragment="/departments/operations/travel"
121-
# recursive (bool): Whether or not to recurse into child pages. Defaults to False.
122-
# attachments (bool): Whether or not to fetch page attachments. Defaults to True.
123-
# metadata (dict): Metadata to attach to each page chunk. Defaults to {}.
124-
# **kwargs: Additional arguments not used.
125-
126-
# """
127-
# endpoint = "https://source.redhat.com/"
128-
129-
# # Connect to Igloo
130-
# igloo = Igloo(endpoint=endpoint)
131-
132-
# # Get all documents under parent path
133-
# fragment_documents = igloo.get_children_from_parent(
134-
# parent_path=url_fragment, recursive=recursive
135-
# )
136-
137-
# # Fetch all attachments
138-
# if attachments:
139-
# for document in fragment_documents:
140-
# object_id = document["id"]
141-
# object_attachments = igloo.get_attachments(object_id=object_id)
142-
# fragment_documents += object_attachments
143-
144-
# # Convert to files and save locally
145-
# meta_lookup = {}
146-
# for document in fragment_documents:
147-
# if document["isPublished"] and not document["IsArchived"]:
148-
# # Write the document in it's URL path locally
149-
# doc_href: str = document.get("attachedToHref", document["href"])
150-
# extension = document.get("fileExtension", ".html")
151-
# doc_title: str = document["title"].replace(extension, "")
152-
# doc_path = doc_href.lstrip("/") + "/" + doc_title + extension
153-
# path = SOURCE_RESPOSITORY_PATH / doc_path
154-
# folder_path = path.parent
155-
# if document["content"].strip() != "" or "contentBinary" in document:
156-
# if not os.path.exists(folder_path):
157-
# os.makedirs(folder_path)
158-
# if "contentBinary" in document:
159-
# with open(path, "wb") as f:
160-
# f.write(document["contentBinary"])
161-
# else:
162-
# with open(path, "w") as f:
163-
# f.write(document["content"])
164-
165-
# # Save metadata
166-
# used_columns = ["content", "contentBinary"]
167-
# file_metadata = {
168-
# key: value for key, value in document.items() if key not in used_columns
169-
# }
170-
# file_metadata["url"] = endpoint + doc_href.lstrip("/")
171-
# file_metadata = file_metadata | metadata
172-
# meta_lookup[path] = file_metadata
173-
174-
# return meta_lookup
175-
176-
# TODO (@abhikdps): Remove the below code once the above one starts working and uncomment the above
1+
# TODO (@abhikdps): Remove this file once the Igloo API keys
2+
# are acquired and rename the knowledge_source_igloo.py file to knowledge_source.py
1773
import pathlib
1784
import time
1795
import logging
@@ -186,6 +12,7 @@
18612

18713
logger = logging.getLogger(__name__)
18814

15+
18916
class SourceScraper:
19017
def __init__(self, base_url: str = "https://source.redhat.com/"):
19118
chrome_options = Options()
@@ -243,20 +70,32 @@ def download_attachments(self, attachments: list[str], base_path: pathlib.Path):
24370
file_name = link.split("/")[-1]
24471
full_path = base_path / file_name
24572
try:
246-
self.driver.get(link if link.startswith("http") else self.base_url.rstrip("/") + link)
73+
self.driver.get(
74+
link
75+
if link.startswith("http")
76+
else self.base_url.rstrip("/") + link
77+
)
24778
with open(full_path, "wb") as f:
24879
f.write(self.driver.page_source.encode("utf-8"))
24980
except Exception as e:
25081
logger.warning(f"Failed to download attachment {link}: {e}")
25182

252-
def scrape(self, url_fragment: str, recursive: bool, attachments: bool, metadata: dict[str, Any]):
83+
def scrape(
84+
self,
85+
url_fragment: str,
86+
recursive: bool,
87+
attachments: bool,
88+
metadata: dict[str, Any],
89+
):
25390
meta_lookup = {}
25491
pages = self.fetch_all_pages(url_fragment, recursive)
25592

25693
for i, soup in enumerate(pages):
25794
title = soup.title.string if soup.title else f"page_{i}"
25895
safe_title = title.replace("/", "_").replace(" ", "_")[:50]
259-
page_path = SOURCE_RESPOSITORY_PATH / url_fragment.strip("/") / f"{safe_title}.html"
96+
page_path = (
97+
SOURCE_RESPOSITORY_PATH / url_fragment.strip("/") / f"{safe_title}.html"
98+
)
26099
page_path.parent.mkdir(parents=True, exist_ok=True)
261100

262101
self.save_page(soup, page_path)
@@ -271,7 +110,8 @@ def scrape(self, url_fragment: str, recursive: bool, attachments: bool, metadata
271110

272111
return meta_lookup
273112

274-
def fetch_source(
113+
114+
def fetchall(
275115
url_fragment: str,
276116
recursive: bool = False,
277117
attachments: bool = True,

0 commit comments

Comments
 (0)