Skip to content

Commit 962e097

Browse files
authored
Refactor web retriever (#1102)
1 parent 4480d80 commit 962e097

9 files changed

Lines changed: 217 additions & 152 deletions

File tree

.github/workflows/docker/compose/web_retrievers-compose.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
# this file should be run in the root of the repo
55
services:
6-
web-retriever-chroma:
6+
web-retriever:
77
build:
8-
dockerfile: comps/web_retrievers/chroma/langchain/Dockerfile
9-
image: ${REGISTRY:-opea}/web-retriever-chroma:${TAG:-latest}
8+
dockerfile: comps/web_retrievers/src/Dockerfile
9+
image: ${REGISTRY:-opea}/web-retriever:${TAG:-latest}

comps/web_retrievers/chroma/langchain/retriever_chroma.py

Lines changed: 0 additions & 134 deletions
This file was deleted.

comps/web_retrievers/chroma/langchain/Dockerfile renamed to comps/web_retrievers/src/Dockerfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@ COPY comps /home/user/comps
1919

2020
RUN pip install --no-cache-dir --upgrade pip setuptools && \
2121
if [ ${ARCH} = "cpu" ]; then \
22-
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/web_retrievers/chroma/langchain/requirements.txt; \
22+
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/web_retrievers/src/requirements.txt; \
2323
else \
24-
pip install --no-cache-dir -r /home/user/comps/web_retrievers/chroma/langchain/requirements.txt; \
24+
pip install --no-cache-dir -r /home/user/comps/web_retrievers/src/requirements.txt; \
2525
fi
2626

2727
ENV PYTHONPATH=$PYTHONPATH:/home/user
2828

29-
WORKDIR /home/user/comps/web_retrievers/chroma/langchain
29+
WORKDIR /home/user/comps/web_retrievers/src
3030

31-
ENTRYPOINT ["python", "retriever_chroma.py"]
31+
ENTRYPOINT ["python", "opea_web_retrievers_microservice.py"]

comps/web_retrievers/chroma/langchain/README.md renamed to comps/web_retrievers/src/README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,15 @@ The Web Retriever Microservice is designed to efficiently search web pages relev
88

99
```bash
1010
cd ../../../../
11-
docker build -t opea/web-retriever-chroma:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/web_retrievers/chroma/langchain/Dockerfile .
11+
docker build -t opea/web-retriever:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/web_retrievers/src/Dockerfile .
1212
```
1313

1414
### Start TEI Service
1515

1616
```bash
1717
model=BAAI/bge-base-en-v1.5
1818
volume=$PWD/data
19-
docker run -d -p 6060:80 -v $volume:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model
19+
docker run -d -p 6060:80 -v $volume:/data -e http_proxy=$http_proxy -e https_proxy=$https_proxy --pull always ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 --model-id $model --auto-truncate
2020
```
2121

2222
### Start Web Retriever Service
@@ -31,7 +31,7 @@ export GOOGLE_CSE_ID=xxx
3131
```
3232

3333
```bash
34-
docker run -d --name="web-retriever-chroma-server" -p 7077:7077 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e GOOGLE_API_KEY=$GOOGLE_API_KEY -e GOOGLE_CSE_ID=$GOOGLE_CSE_ID opea/web-retriever-chroma:latest
34+
docker run -d --name="web-retriever-server" -p 7077:7077 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e GOOGLE_API_KEY=$GOOGLE_API_KEY -e GOOGLE_CSE_ID=$GOOGLE_CSE_ID opea/web-retriever:latest
3535
```
3636

3737
### Consume Web Retriever Service
@@ -44,6 +44,6 @@ your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) fo
4444

4545
http_proxy= curl http://${your_ip}:7077/v1/web_retrieval \
4646
-X POST \
47-
-d "{\"text\":\"What is black myth wukong?\",\"embedding\":${your_embedding}}" \
47+
-d "{\"text\":\"What is The Game of the Year 2024?\",\"embedding\":${your_embedding},\"k\":4}" \
4848
-H 'Content-Type: application/json'
4949
```
File renamed without changes.
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
# Copyright (C) 2024 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import os
5+
import time
6+
7+
from langchain.text_splitter import RecursiveCharacterTextSplitter
8+
from langchain_community.document_loaders import AsyncHtmlLoader
9+
from langchain_community.document_transformers import Html2TextTransformer
10+
from langchain_community.utilities import GoogleSearchAPIWrapper
11+
from langchain_community.vectorstores import Chroma
12+
from langchain_huggingface import HuggingFaceEndpointEmbeddings
13+
14+
from comps import (
15+
CustomLogger,
16+
EmbedDoc,
17+
OpeaComponent,
18+
OpeaComponentRegistry,
19+
SearchedDoc,
20+
ServiceType,
21+
TextDoc,
22+
statistics_dict,
23+
)
24+
25+
# Module-level logger for this component.
logger = CustomLogger("opea_google_search")
# LOGFLAG toggles verbose request/response logging.
# NOTE(review): os.getenv returns a *string*, so LOGFLAG="False" is still
# truthy — confirm callers only set it when verbose logging is wanted.
logflag = os.getenv("LOGFLAG", False)
27+
28+
29+
@OpeaComponentRegistry.register("OPEA_GOOGLE_SEARCH")
class OpeaGoogleSearch(OpeaComponent):
    """A specialized Web Retrieval component derived from OpeaComponent for Google web retriever services.

    Pipeline per request: Google Custom Search -> fetch result pages ->
    HTML-to-text + chunking -> index chunks into an in-memory Chroma store ->
    vector similarity search against the query embedding -> return top-k docs.
    The Chroma store is emptied again at the end of every invoke() call.
    """

    def __init__(self, name: str, description: str, config: dict = None):
        # Credentials for the Google Custom Search JSON API; both must be set
        # in the environment for check_health() to succeed.
        self.google_api_key = os.environ.get("GOOGLE_API_KEY")
        self.google_cse_id = os.environ.get("GOOGLE_CSE_ID")
        # Chunker for fetched web pages before indexing.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=50)
        # Create vectorstore
        # URL of the TEI embedding service used by the Chroma store.
        self.tei_embedding_endpoint = os.getenv("TEI_EMBEDDING_ENDPOINT")
        # NOTE(review): check_health() is not a pure probe — it also constructs
        # self.search and self.vector_db, which invoke() depends on. If it
        # fails, the error is only logged and construction continues with
        # those attributes unset; invoke() would then raise AttributeError.
        health_status = self.check_health()
        if not health_status:
            logger.error("OpeaGoogleSearch health check failed.")

        super().__init__(name, ServiceType.WEB_RETRIEVER.name.lower(), description, config)

    def get_urls(self, query, num_search_result=1):
        """Run a Google Custom Search for *query* and return up to *num_search_result* raw result dicts."""
        result = self.search.results(query, num_search_result)
        return result

    def dump_docs(self, docs):
        """Add *docs* to the Chroma store in batches of 32 (each batch triggers embedding via TEI)."""
        batch_size = 32
        for i in range(0, len(docs), batch_size):
            self.vector_db.add_documents(docs[i : i + batch_size])

    def retrieve_htmls(self, all_urls):
        """Fetch the HTML of every URL in *all_urls*; load errors are ignored rather than raised."""
        loader = AsyncHtmlLoader(all_urls, ignore_load_errors=True, trust_env=True)
        docs = loader.load()
        return docs

    def parse_htmls(self, docs):
        """Convert fetched HTML documents to plain text and split them into chunks for indexing."""
        if logflag:
            logger.info("Indexing new urls...")

        html2text = Html2TextTransformer()
        docs = list(html2text.transform_documents(docs))
        docs = self.text_splitter.split_documents(docs)

        return docs

    async def invoke(self, input: EmbedDoc) -> SearchedDoc:
        """Involve the Google search service to retrieve the documents related to the prompt.

        Args:
            input: carries the query text, its precomputed embedding, and k
                (used both as the number of search results and as the
                retrieval top-k).

        Returns:
            SearchedDoc with the retrieved chunks (metadata folded into the
            text) and the original query.
        """
        # Read the uploaded file
        if logflag:
            logger.info(input)
        start = time.time()
        query = input.text
        embedding = input.embedding

        # Google Search the results, parse the htmls
        search_results = self.get_urls(query=query, num_search_result=input.k)
        urls_to_look = []
        for res in search_results:
            if res.get("link", None):
                urls_to_look.append(res["link"])
        # Deduplicate URLs; note this loses the original result ordering.
        urls = list(set(urls_to_look))
        if logflag:
            logger.info(f"urls: {urls}")
        docs = self.retrieve_htmls(urls)
        docs = self.parse_htmls(docs)
        if logflag:
            logger.info(docs)
        # Remove duplicated docs
        # (keyed on page content plus sorted metadata items, so identical text
        # from different sources is kept).
        unique_documents_dict = {(doc.page_content, tuple(sorted(doc.metadata.items()))): doc for doc in docs}
        unique_documents = list(unique_documents_dict.values())
        # "opea_service@search" is registered by the microservice entrypoint;
        # this records the search+fetch+parse phase only.
        statistics_dict["opea_service@search"].append_latency(time.time() - start, None)

        # dump to vector_db
        self.dump_docs(unique_documents)

        # Do the retrieval
        search_res = await self.vector_db.asimilarity_search_by_vector(embedding=embedding, k=input.k)

        searched_docs = []

        for r in search_res:
            # include the metadata into the retrieved docs content
            description_str = f"\n description: {r.metadata['description']} \n" if "description" in r.metadata else ""
            title_str = f"\n title: {r.metadata['title']} \n" if "title" in r.metadata else ""
            source_str = f"\n source: {r.metadata['source']} \n" if "source" in r.metadata else ""
            text_with_meta = f"{r.page_content} {description_str} {title_str} {source_str}"
            searched_docs.append(TextDoc(text=text_with_meta))

        result = SearchedDoc(retrieved_docs=searched_docs, initial_query=query)
        # End-to-end latency for the whole invoke() call.
        statistics_dict["opea_service@web_retriever"].append_latency(time.time() - start, None)

        # For Now history is banned
        # (the store is wiped after every request so results never leak
        # between queries; NOTE(review): not safe under concurrent requests
        # sharing one store — confirm the service is effectively serialized).
        if self.vector_db.get()["ids"]:
            self.vector_db.delete(self.vector_db.get()["ids"])
        if logflag:
            logger.info(result)
        return result

    def check_health(self) -> bool:
        """Construct the Google search wrapper and the Chroma vector store, reporting success.

        Despite the name, this is also the component's initializer for
        self.search and self.vector_db (see __init__).

        Returns:
            bool: True if both dependencies could be constructed, False otherwise.
        """
        try:
            self.search = GoogleSearchAPIWrapper(
                google_api_key=self.google_api_key, google_cse_id=self.google_cse_id, k=10
            )
            # vectordb_persistent_directory = os.getenv("VECTORDB_PERSISTENT_DIR", "/home/user/chroma_db_oai")
            self.vector_db = Chroma(
                embedding_function=HuggingFaceEndpointEmbeddings(model=self.tei_embedding_endpoint),
                # persist_directory=vectordb_persistent_directory
            )
        except Exception as e:
            logger.error(e)
            return False
        return True
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# Copyright (C) 2024 Intel Corporation
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import os
5+
import time
6+
7+
from integrations.google_search import OpeaGoogleSearch
8+
9+
from comps import (
10+
CustomLogger,
11+
EmbedDoc,
12+
OpeaComponentLoader,
13+
SearchedDoc,
14+
ServiceType,
15+
opea_microservices,
16+
register_microservice,
17+
register_statistics,
18+
statistics_dict,
19+
)
20+
21+
logger = CustomLogger("opea_web_retriever_microservice")
22+
logflag = os.getenv("LOGFLAG", False)
23+
24+
web_retriever_component_name = os.getenv("WEB_RETRIEVER_NAME", "OPEA_GOOGLE_SEARCH")
25+
# Initialize OpeaComponentLoader
26+
loader = OpeaComponentLoader(
27+
web_retriever_component_name, description=f"OPEA WEB RETRIEVER Component: {web_retriever_component_name}"
28+
)
29+
30+
31+
@register_microservice(
    name="opea_service@web_retriever",
    service_type=ServiceType.WEB_RETRIEVER,
    endpoint="/v1/web_retrieval",
    host="0.0.0.0",
    port=7077,
    input_datatype=EmbedDoc,
    output_datatype=SearchedDoc,
)
@register_statistics(names=["opea_service@web_retriever", "opea_service@search"])
async def web_retriever(input: EmbedDoc) -> SearchedDoc:
    """Serve /v1/web_retrieval by delegating to the configured component.

    Forwards the embedded query to the loaded backend, records end-to-end
    latency on success, and re-raises (after logging) on any failure.
    """
    start = time.time()

    try:
        # Delegate the request to whichever component the loader selected.
        result = await loader.invoke(input)
        if logflag:
            logger.info(result)
        elapsed = time.time() - start
        statistics_dict["opea_service@web_retriever"].append_latency(elapsed, None)
        return result

    except Exception as err:
        logger.error(f"Error during web retriever invocation: {err}")
        raise
55+
56+
57+
if __name__ == "__main__":
    logger.info("OPEA Web Retriever Microservice is starting....")
    # Blocks here, serving the registered /v1/web_retrieval endpoint on port 7077.
    opea_microservices["opea_service@web_retriever"].start()
File renamed without changes.

0 commit comments

Comments
 (0)