-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocuments.py
More file actions
116 lines (90 loc) · 3.48 KB
/
Copy pathdocuments.py
File metadata and controls
116 lines (90 loc) · 3.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import abc
import dataclasses
from abc import ABC
from typing import List, Iterable, Iterator, Dict
@dataclasses.dataclass
class InputDocument:
    """
    Raw document as it comes out of the Text Acquisition stage.

    Objects of this type are what a DocumentCollection stores.
    """

    # Identifier of the document.
    doc_id: str
    # Body text of the document.
    text: str
    # Title of the document.
    title: str
@dataclasses.dataclass
class TransformedDocument:
    """
    Document as it looks once the Text Transformation stage has run.

    Objects of this type are what the Indexing stage consumes.
    """

    # Identifier of the document.
    # NOTE(review): presumably the doc_id of the InputDocument this was derived from - confirm.
    doc_id: str
    # Tokens of the document, in list order.
    tokens: List[str]
class DocumentCollection(ABC):
    """
    Collection of InputDocuments.

    Abstracts the Document Data Store: it is produced and updated by the indexing
    process and used by the Query Process for User Interactions.

    This is an abstract interface; concrete subclasses must implement every method below.
    """

    @abc.abstractmethod
    def insert(self, doc: InputDocument) -> None:
        """
        Add another document into this document collection.

        :param doc: An InputDocument to add to the collection.
        :return: None
        """

    @abc.abstractmethod
    def get_doc(self, doc_id: str) -> InputDocument | None:
        """
        Get a document by document ID.

        The return annotation includes None so that it agrees with the in-memory
        DictDocumentCollection implementation, which returns None for an unknown ID.

        :param doc_id: ID of the document to return.
        :return: The InputDocument for the given doc_id, or None if the collection
            has no document with that ID.
        """

    @abc.abstractmethod
    def get_docs(self, doc_ids: Iterable[str]) -> 'DocumentCollection':
        """
        Batch get.

        :param doc_ids: IDs of the documents to retrieve.
        :return: A collection of documents with the given IDs. The in-memory
            DictDocumentCollection implementation leaves out IDs it does not know;
            other implementations should document their own behaviour.
        """

    @abc.abstractmethod
    def __iter__(self) -> Iterator[InputDocument]:
        """
        :return: Iterator over all documents in the collection.
        """
class DictDocumentCollection(DocumentCollection):
    """
    In-memory DocumentCollection implementation backed by a dict that maps
    doc_ids to the corresponding InputDocuments.
    """

    def __init__(self, docs_dict: Dict[str, InputDocument] | None = None):
        """
        Create a new DictDocumentCollection.

        :param docs_dict: Optional dictionary with doc_ids as keys and InputDocuments as values.
            The dictionary is adopted as-is (not copied), so documents inserted later are
            visible through it. Omit it, or pass None, to start with a fresh empty collection.
        """
        # Compare against None instead of testing truthiness: an empty dict handed in by the
        # caller must be adopted exactly like a non-empty one, not silently replaced by a new dict.
        if docs_dict is None:
            docs_dict = {}
        self.documents = docs_dict

    @staticmethod
    def create_empty() -> 'DictDocumentCollection':
        """
        :return: An empty instance of DictDocumentCollection backed by its own new dict.
        """
        return DictDocumentCollection({})

    def insert(self, doc: InputDocument) -> None:
        """
        Store a document under its doc_id, replacing any document already stored under that ID.

        :param doc: The InputDocument to add. Anything that is not an InputDocument
            (including None) is ignored without raising.
        :return: None
        """
        if isinstance(doc, InputDocument):
            self.documents[doc.doc_id] = doc

    def get_doc(self, doc_id: str) -> InputDocument | None:
        """
        Look up a single document.

        :param doc_id: ID of the document to return.
        :return: The InputDocument stored under doc_id, or None if there is none.
        """
        return self.documents.get(doc_id)

    def get_docs(self, doc_ids: Iterable[str]) -> 'DocumentCollection':
        """
        Batch get.

        :param doc_ids: IDs of the documents to retrieve.
        :return: A new DictDocumentCollection holding the documents found for the given IDs.
            IDs with no stored document are skipped.
        """
        found = DictDocumentCollection()
        for requested_id in doc_ids:
            doc = self.documents.get(requested_id)
            # Skip unknown IDs explicitly instead of handing None to insert().
            if doc is not None:
                found.insert(doc)
        return found

    def __iter__(self) -> Iterator[InputDocument]:
        """
        :return: Iterator over all documents in the collection.
        """
        return iter(self.documents.values())