-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocument_source.py
More file actions
69 lines (53 loc) · 2.51 KB
/
Copy pathdocument_source.py
File metadata and controls
69 lines (53 loc) · 2.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import abc
import json
from abc import ABC
from documents import InputDocument, DocumentCollection, DictDocumentCollection
class DocumentSource(ABC):
    """
    Text Acquisition component of the Indexing Process.

    A source may be backed by a feed or by crawled data; the current interface,
    however, hands back every document in a single call.
    """

    @abc.abstractmethod
    def read(self) -> DocumentCollection:
        """
        Read the documents provided by this source.

        :return: A DocumentCollection containing all documents from this source.
        """
        ...
class WikiJsonDocumentSource(DocumentSource):
    """
    A DocumentSource implementation that uses JSON files formatted like wiki_small.json
    to read in the data and build a collection of documents.
    """

    def __init__(self, file_path: str):
        """
        Remember where the JSON file lives; the file is not opened until read() is called.

        :param file_path: The string path and name of the JSON file.
        """
        self.file_path = file_path

    def read(self) -> DocumentCollection:
        """
        Parse the whole JSON file and build a document collection from its records.

        The parsed JSON value is iterated, and each record is indexed with the keys
        'id', 'init_text' and 'title'.

        :return: A DictDocumentCollection with one InputDocument inserted per record.
        """
        # Decode explicitly as UTF-8 (the encoding JSON interchange mandates) instead of
        # relying on the platform's locale encoding, which mangles or rejects non-ASCII
        # text on systems whose default is not UTF-8.
        with open(self.file_path, encoding='utf-8') as fp:
            data = json.load(fp)

        # For every record in the file, construct an InputDocument with the raw text and
        # add it to the document collection.
        doc_collection = DictDocumentCollection()
        for record in data:
            doc_collection.insert(
                InputDocument(doc_id=record['id'], text=record['init_text'], title=record['title']))
        return doc_collection
class TrecCovidJsonlSource(DocumentSource):
    """
    A DocumentSource implementation that uses JSONL files formatted like the trec
    covid corpus.jsonl with a json record on each line of the file. This JSONL file
    is read in, and the data is used to build a collection of documents.
    """

    def __init__(self, file_path: str):
        """
        Remember where the JSONL file lives; the file is not opened until read() is called.

        :param file_path: The string path and name of the JSONL file.
        """
        self.file_path = file_path

    def read(self) -> DocumentCollection:
        """
        Parse the JSONL file line by line and build a document collection from its records.

        Each non-blank line is parsed as one JSON record and indexed with the keys
        '_id', 'text' and 'title'. Blank or whitespace-only lines are skipped.

        :return: A DictDocumentCollection with one InputDocument inserted per record.
        """
        doc_collection = DictDocumentCollection()
        # Decode explicitly as UTF-8 rather than with the platform's locale encoding,
        # which mangles or rejects non-ASCII text on systems whose default is not UTF-8.
        with open(self.file_path, 'r', encoding='utf-8') as fp:
            # For each line of the file, parse the line as json, and add the record to
            # the collection.
            for line in fp:
                # A blank line (commonly a trailing one) is not a record; json.loads
                # would raise JSONDecodeError on it, so skip it.
                if not line.strip():
                    continue
                record = json.loads(line)
                doc_collection.insert(
                    InputDocument(doc_id=record['_id'], text=record['text'], title=record['title']))
        return doc_collection