@@ -206,36 +206,45 @@ def gen_context_docs(context: Context) -> Generator[Document, None, None]:
206
206
for _ , docs in reversed (context ):
207
207
if docs is not None :
208
208
for doc in docs :
209
- if doc .docno != EMPTY_PLACEHOLDER_DOC .docno :
209
+ if (
210
+ doc .docno != EMPTY_PLACEHOLDER_DOC .docno
211
+ and doc .content != ""
212
+ ):
210
213
yield doc
214
+ i : int = 1
215
+ while True :
216
+ yield Document (f"{ i } " , "" )
217
+ i += 1
211
218
212
219
# if in df there are multiple rows that have the same qid and docno, keep the one with the highest score. For the ones removed, add a row each with the EMPTY_PLACEHOLDER_DOC
213
220
rank_size_per_qid : int = df .groupby ("qid" ).size ().max ()
214
- print (f"Rank size per qid: { rank_size_per_qid } " )
221
+ logging . info (f"Rank size per qid: { rank_size_per_qid } " )
215
222
df = df .sort_values (["qid" , "docno" , "score" ], ascending = [True , True , False ])
216
223
total_size = df .shape [0 ]
217
224
df = df .drop_duplicates (subset = ["qid" , "docno" ], keep = "first" )
218
225
dropped_any : bool = total_size != df .shape [0 ]
219
- print (f"Dropped any: { dropped_any } " )
226
+ logging . info (f"Dropped any: { dropped_any } " )
220
227
df = df .reset_index (drop = True )
221
228
df = self .pad_empty_documents (
222
229
df , df ["qid" ].unique (), rank_size_per_qid , df [["qid" , "query" ]]
223
230
)
224
- print (f"Number of max rank size per qid: { df .groupby ('qid' ).size ().max ()} " )
231
+ logging .info (
232
+ f"Number of max rank size per qid: { df .groupby ('qid' ).size ().max ()} "
233
+ )
225
234
df = df .reset_index (drop = True )
226
235
df = df .sort_values (["qid" , "rank" ], ascending = [True , True ])
227
236
228
237
for query , context in context_list :
229
238
# check if there is a row in the df with "qid" == query.query_id, where "docno" == EMPTY_PLACEHOLDER_DOC.docno
230
239
# if yes, replace it with the top document from the context
240
+ doc_gen = gen_context_docs (context )
231
241
while True :
232
242
if not df [
233
243
(df ["qid" ] == query .query_id )
234
244
& (df ["docno" ] == EMPTY_PLACEHOLDER_DOC .docno )
235
245
].empty :
236
246
# Check if gen_docs has next element
237
247
doc : Document
238
- doc_gen = gen_context_docs (context )
239
248
try :
240
249
doc = next (doc_gen )
241
250
while (
0 commit comments