forked from tvox15/RAG-translation
main.py
import os
import shutil
import openai
import time
from dotenv import load_dotenv
load_dotenv()
from get_embedding_function import get_embedding_function
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
openai.api_key = os.environ['OPENAI_API_KEY']
CHROMA_PATH = os.environ['CHROMA_PATH']
DATA_PATH = os.environ['DATA_PATH']
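# The three values above are expected to come from the .env file loaded by
# load_dotenv() (or from the process environment). Example entries, with
# placeholder values that are assumptions and not taken from the repository:
#   OPENAI_API_KEY=sk-...
#   CHROMA_PATH=chroma
#   DATA_PATH=data
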
def main():
    init_db_with_docs()

def init_db_with_docs():
    docs = load_documents()
    split_docs = split_documents(docs)
    add_to_chroma(split_docs)

def load_documents():
    loader = DirectoryLoader(
        DATA_PATH,
        glob="*.xls",
        loader_cls=UnstructuredExcelLoader,
    )
    docs = loader.load()
    return docs

def split_documents(docs: list[Document]):
    split_docs = []
    for doc in docs:
        text = doc.page_content
        chunks = text.split("\n\n\n")[2:]
        # For each chunk, split on "\n" and convert it into a dict of the form
        # {idx, start_tc, end_tc, source_lang, target_lang}.
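        # Illustrative example (assumed data, for clarity only): a chunk such as
        #   "12\n00:00:04,100\n00:00:06,800\nHello there.\nBonjour."
        # would be parsed into
        #   {"idx": "12", "start_tc": "00:00:04,100", "end_tc": "00:00:06,800",
        #    "source_lang": "Hello there.", "target_lang": "Bonjour."}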
        split_doc = []
        for chunk in chunks:
            split_chunk = chunk.split("\n")
            if len(split_chunk) == 5:
                split_doc.append({
                    "idx": split_chunk[0],
                    "start_tc": split_chunk[1],
                    "end_tc": split_chunk[2],
                    "source_lang": split_chunk[3],
                    "target_lang": split_chunk[4],
                })

        # Write the parsed entries back into page_content as windows of 10 lines of
        # source_lang followed by 10 lines of target_lang, separated by "\n", with
        # consecutive windows overlapping by 5 lines.
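        # For example (assumed entry count, for clarity only): with 23 parsed
        # entries, the windows emitted below cover entries 0-9, 5-14, and 10-19;
        # entries 20-22 never appear, because the loop stops as soon as end_range
        # is no longer strictly less than the number of entries.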
        start_range = 0
        end_range = 10
        page = 0
        while end_range < len(split_doc):
            window_text = ""
            for i in range(start_range, end_range):
                window_text += split_doc[i]["source_lang"] + " "
            window_text += "\n"
            for i in range(start_range, end_range):
                window_text += split_doc[i]["target_lang"] + " "
            window_text += "\n\n"
            start_range += 5
            end_range += 5
            # Parse the filename out of the file path, drop the extension, and
            # build a per-window id of the form "<filename>:<page>".
            doc_id = f"{os.path.basename(doc.metadata['source']).split('.')[0]}:{page}"
            new_doc = Document(
                page_content=window_text,
                metadata={
                    "source": doc.metadata["source"],
                    "page": page,
                    "id": doc_id,
                },
            )
            split_docs.append(new_doc)
            page += 1
    return split_docs

def clear_database():
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

def add_to_chroma(chunks: list[Document]):
    # Load the existing database.
    db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embedding_function()
    )

    # Add or update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents whose IDs are not already in the DB.
    new_chunks = []
    for chunk in chunks:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if new_chunks:
        # Add one document every 0.2 seconds to avoid rate limits, because each
        # add_documents() call hits OpenAI's embedding API.
        for i, chunk in enumerate(new_chunks):
            print(f"Adding document {i+1}/{len(new_chunks)}")
            db.add_documents([chunk])
            db.persist()
            time.sleep(0.2)
    else:
        print("✅ No new documents to add")

if __name__ == "__main__":
    main()