1- # import logging
2- # import os
3-
4- # import pyigloo
5-
6- # from constants import SOURCE_RESPOSITORY_PATH
7-
8- # logger = logging.getLogger(__name__)
9-
10-
11- # class Igloo:
12- # """Class for connecting to igloo."""
13-
14- # def __init__(self, endpoint: str):
15- # """Initialize."""
16- # self.endpoint: str = endpoint
17- # # TODO: Raise an error if any of these are None
18- # self.api_user: str = os.environ.get("IGLOO_USER", None)
19- # self.api_pass: str = os.environ.get("IGLOO_PASS", None)
20- # self.api_key: str = os.environ.get("IGLOO_API_KEY", None)
21- # self.access_key: str = os.environ.get("IGLOO_ACCESS_KEY", None)
22-
23- # info = {
24- # "ACCESS_KEY": self.access_key,
25- # "API_KEY": self.api_key,
26- # "API_USER": self.api_user,
27- # "API_PASSWORD": self.api_pass,
28- # "API_ENDPOINT": self.endpoint,
29- # }
30- # self.session = pyigloo.igloo(info=info)
31-
32- # def get_object(self, object_id: str):
33- # """Get a single object."""
34- # result = self.session.objects_view(objectid=object_id)
35- # return result
36-
37- # def get_children_from_parent(
38- # self,
39- # parent_path: str | None = None,
40- # parent_object_id: str | None = None,
41- # recursive: bool = False,
42- # ):
43- # """Get all children from a parent url path."""
44- # # Get the parent object id
45- # if parent_path is None and parent_object_id is None:
46- # raise ValueError("Must set one of 'parent_path' or 'parent_object_id'")
47- # if parent_path is not None:
48- # logger.info(f"Fetching objects under path {parent_path}")
49- # response = self.session.objects_bypath(path=parent_path)
50- # if response is None:
51- # raise ValueError(
52- # f"Parent path {parent_path} does not exist. Please check the path and try again."
53- # )
54- # parent_object_id = response["id"]
55-
56- # # Get all the children
57- # all_children = []
58- # for child in self.session.get_all_children_from_object(
59- # parent_object_id, pagesize=100
60- # ):
61- # children = [child]
62- # if recursive:
63- # try:
64- # child_object_id = child["id"]
65- # childs_children = self.get_children_from_parent(
66- # parent_object_id=child_object_id, recursive=True
67- # )
68- # except TypeError:
69- # continue
70- # children.extend(childs_children)
71- # all_children.extend(children)
72-
73- # return all_children
74-
75- # def get_document_binary(self, document_id: str) -> bytes:
76- # """Get the contents of a document."""
77- # # Send a request to the /documents/document_id/view_binary endpoint to get file contents
78- # endpoint = self.session.endpoint
79- # api_root = self.session.IGLOO_API_ROOT_V1
80- # url = "{0}{1}/documents/{2}/view_binary".format(endpoint, api_root, document_id)
81- # headers = {b"Accept": "application/json"}
82- # response = self.session.igloo.get(url=url, headers=headers)
83- # return response.content
84-
85- # def get_attachments(self, object_id: str):
86- # """Get all attachments on an object."""
87- # # Get page metadata
88- # page = self.get_object(object_id=object_id)
89- # # List the attachments
90- # page_attachments = self.session.attachments_view(objectid=object_id)
91- # items = page_attachments.get("items", [])
92- # # Get information about each attachment
93- # attachments = []
94- # for item in items:
95- # document_id = item["ToId"]
96- # document_metadata = self.session.objects_view(document_id)
97- # document_binary = self.get_document_binary(document_id=document_id)
98- # attachment = document_metadata | {
99- # "contentBinary": document_binary,
100- # "attachedToHref": page["href"],
101- # }
102- # attachments.append(attachment)
103- # return attachments
104-
105-
106- # def fetchall(
107- # url_fragment: str,
108- # recursive: bool = False,
109- # attachments: bool = True,
110- # metadata: dict = {},
111- # **kwargs,
112- # ):
113- # """
114- # Fetch pages from the Source.
115-
116- # Args:
117- # ----
118- # url_fragment (str): URL fragment to pull all children from.
119- # For example, to pull all pages under https://source.redhat.com/departments/operations/travel,
120- # set url_fragment="/departments/operations/travel"
121- # recursive (bool): Whether or not to recurse into child pages. Defaults to False.
122- # attachments (bool): Whether or not to fetch page attachments. Defaults to True.
123- # metadata (dict): Metadata to attach to each page chunk. Defaults to {}.
124- # **kwargs: Additional arguments not used.
125-
126- # """
127- # endpoint = "https://source.redhat.com/"
128-
129- # # Connect to Igloo
130- # igloo = Igloo(endpoint=endpoint)
131-
132- # # Get all documents under parent path
133- # fragment_documents = igloo.get_children_from_parent(
134-    #         parent_path=url_fragment, recursive=recursive
135- # )
136-
137- # # Fetch all attachments
138- # if attachments:
139- # for document in fragment_documents:
140- # object_id = document["id"]
141- # object_attachments = igloo.get_attachments(object_id=object_id)
142- # fragment_documents += object_attachments
143-
144- # # Convert to files and save locally
145- # meta_lookup = {}
146- # for document in fragment_documents:
147- # if document["isPublished"] and not document["IsArchived"]:
148-    #             # Write the document in its URL path locally
149- # doc_href: str = document.get("attachedToHref", document["href"])
150- # extension = document.get("fileExtension", ".html")
151- # doc_title: str = document["title"].replace(extension, "")
152- # doc_path = doc_href.lstrip("/") + "/" + doc_title + extension
153- # path = SOURCE_RESPOSITORY_PATH / doc_path
154- # folder_path = path.parent
155- # if document["content"].strip() != "" or "contentBinary" in document:
156- # if not os.path.exists(folder_path):
157- # os.makedirs(folder_path)
158- # if "contentBinary" in document:
159- # with open(path, "wb") as f:
160- # f.write(document["contentBinary"])
161- # else:
162- # with open(path, "w") as f:
163- # f.write(document["content"])
164-
165- # # Save metadata
166- # used_columns = ["content", "contentBinary"]
167- # file_metadata = {
168- # key: value for key, value in document.items() if key not in used_columns
169- # }
170- # file_metadata["url"] = endpoint + doc_href.lstrip("/")
171- # file_metadata = file_metadata | metadata
172- # meta_lookup[path] = file_metadata
173-
174- # return meta_lookup
175-
176- # TODO (@abhikdps): Remove the below code once the above one starts working and uncomment the above
1+ # TODO (@abhikdps): Remove this file once the Igloo API keys
2+ # are acquired and rename the knowledge_source_igloo.py file to knowledge_source.py
1773import pathlib
1784import time
1795import logging
18612
18713logger = logging .getLogger (__name__ )
18814
15+
18916class SourceScraper :
19017 def __init__ (self , base_url : str = "https://source.redhat.com/" ):
19118 chrome_options = Options ()
@@ -243,20 +70,32 @@ def download_attachments(self, attachments: list[str], base_path: pathlib.Path):
24370 file_name = link .split ("/" )[- 1 ]
24471 full_path = base_path / file_name
24572 try :
246- self .driver .get (link if link .startswith ("http" ) else self .base_url .rstrip ("/" ) + link )
73+ self .driver .get (
74+ link
75+ if link .startswith ("http" )
76+ else self .base_url .rstrip ("/" ) + link
77+ )
24778 with open (full_path , "wb" ) as f :
24879 f .write (self .driver .page_source .encode ("utf-8" ))
24980 except Exception as e :
25081 logger .warning (f"Failed to download attachment { link } : { e } " )
25182
252- def scrape (self , url_fragment : str , recursive : bool , attachments : bool , metadata : dict [str , Any ]):
83+ def scrape (
84+ self ,
85+ url_fragment : str ,
86+ recursive : bool ,
87+ attachments : bool ,
88+ metadata : dict [str , Any ],
89+ ):
25390 meta_lookup = {}
25491 pages = self .fetch_all_pages (url_fragment , recursive )
25592
25693 for i , soup in enumerate (pages ):
25794 title = soup .title .string if soup .title else f"page_{ i } "
25895 safe_title = title .replace ("/" , "_" ).replace (" " , "_" )[:50 ]
259- page_path = SOURCE_RESPOSITORY_PATH / url_fragment .strip ("/" ) / f"{ safe_title } .html"
96+ page_path = (
97+ SOURCE_RESPOSITORY_PATH / url_fragment .strip ("/" ) / f"{ safe_title } .html"
98+ )
26099 page_path .parent .mkdir (parents = True , exist_ok = True )
261100
262101 self .save_page (soup , page_path )
@@ -271,7 +110,8 @@ def scrape(self, url_fragment: str, recursive: bool, attachments: bool, metadata
271110
272111 return meta_lookup
273112
274- def fetch_source (
113+
114+ def fetchall (
275115 url_fragment : str ,
276116 recursive : bool = False ,
277117 attachments : bool = True ,
0 commit comments