@@ -24,6 +24,7 @@ use std::sync::Arc;
24
24
use itertools:: Itertools ;
25
25
use quickwit_doc_mapper:: { DocMapper , WarmupInfo } ;
26
26
use quickwit_proto:: { LeafSearchResponse , PartialHit , SearchRequest , SortOrder } ;
27
+ use serde:: Deserialize ;
27
28
use tantivy:: aggregation:: agg_req:: {
28
29
get_fast_field_names, get_term_dict_field_names, Aggregations ,
29
30
} ;
@@ -35,6 +36,7 @@ use tantivy::schema::Schema;
35
36
use tantivy:: { DocId , Score , SegmentOrdinal , SegmentReader } ;
36
37
37
38
use crate :: filters:: { TimestampFilter , TimestampFilterBuilder } ;
39
+ use crate :: jaeger_collector:: { FindTraceIdsCollector , FindTraceIdsSegmentCollector } ;
38
40
use crate :: partial_hit_sorting_key;
39
41
40
42
#[ derive( Clone , Debug ) ]
@@ -161,6 +163,11 @@ impl PartialEq for PartialHitHeapItem {
161
163
162
164
impl Eq for PartialHitHeapItem { }
163
165
166
+ enum AggregationSegmentCollectors {
167
+ FindTraceIdsSegmentCollector ( FindTraceIdsSegmentCollector ) ,
168
+ TantivyAggregationSegmentCollector ( AggregationSegmentCollector ) ,
169
+ }
170
+
164
171
/// Quickwit collector working at the scale of the segment.
165
172
pub struct QuickwitSegmentCollector {
166
173
num_hits : u64 ,
@@ -170,7 +177,7 @@ pub struct QuickwitSegmentCollector {
170
177
max_hits : usize ,
171
178
segment_ord : u32 ,
172
179
timestamp_filter_opt : Option < TimestampFilter > ,
173
- aggregation : Option < AggregationSegmentCollector > ,
180
+ aggregation : Option < AggregationSegmentCollectors > ,
174
181
}
175
182
176
183
impl QuickwitSegmentCollector {
@@ -219,8 +226,15 @@ impl SegmentCollector for QuickwitSegmentCollector {
219
226
220
227
self . num_hits += 1 ;
221
228
self . collect_top_k ( doc_id, score) ;
222
- if let Some ( aggregation_collector) = self . aggregation . as_mut ( ) {
223
- aggregation_collector. collect ( doc_id, score) ;
229
+
230
+ match self . aggregation . as_mut ( ) {
231
+ Some ( AggregationSegmentCollectors :: FindTraceIdsSegmentCollector ( collector) ) => {
232
+ collector. collect ( doc_id, score)
233
+ }
234
+ Some ( AggregationSegmentCollectors :: TantivyAggregationSegmentCollector ( collector) ) => {
235
+ collector. collect ( doc_id, score)
236
+ }
237
+ None => ( ) ,
224
238
}
225
239
}
226
240
@@ -240,15 +254,19 @@ impl SegmentCollector for QuickwitSegmentCollector {
240
254
} )
241
255
. collect ( ) ;
242
256
243
- let intermediate_aggregation_result = if let Some ( collector) = self . aggregation {
244
- Some (
245
- serde_json:: to_string ( & collector. harvest ( ) ?)
246
- . expect ( "could not serialize aggregation to json" ) ,
247
- )
248
- } else {
249
- None
257
+ let intermediate_aggregation_result = match self . aggregation {
258
+ Some ( AggregationSegmentCollectors :: FindTraceIdsSegmentCollector ( collector) ) => Some (
259
+ serde_json:: to_string ( & collector. harvest ( ) )
260
+ . expect ( "Collector fruit should be JSON serializable." ) ,
261
+ ) ,
262
+ Some ( AggregationSegmentCollectors :: TantivyAggregationSegmentCollector ( collector) ) => {
263
+ Some (
264
+ serde_json:: to_string ( & collector. harvest ( ) ?)
265
+ . expect ( "Collector fruit should be JSON serializable." ) ,
266
+ )
267
+ }
268
+ None => None ,
250
269
} ;
251
-
252
270
Ok ( LeafSearchResponse {
253
271
intermediate_aggregation_result,
254
272
num_hits : self . num_hits ,
@@ -259,6 +277,37 @@ impl SegmentCollector for QuickwitSegmentCollector {
259
277
}
260
278
}
261
279
280
+ #[ derive( Debug , Clone , Deserialize ) ]
281
+ #[ serde( untagged) ]
282
+ pub enum QuickwitAggregations {
283
+ FindTraceIdsAggregation ( FindTraceIdsCollector ) ,
284
+ TantivyAggregations ( Aggregations ) ,
285
+ }
286
+
287
+ impl QuickwitAggregations {
288
+ fn fast_field_names ( & self ) -> HashSet < String > {
289
+ match self {
290
+ QuickwitAggregations :: FindTraceIdsAggregation ( collector) => {
291
+ collector. fast_field_names ( )
292
+ }
293
+ QuickwitAggregations :: TantivyAggregations ( aggregations) => {
294
+ get_fast_field_names ( aggregations)
295
+ }
296
+ }
297
+ }
298
+
299
+ fn term_dict_field_names ( & self ) -> HashSet < String > {
300
+ match self {
301
+ QuickwitAggregations :: FindTraceIdsAggregation ( collector) => {
302
+ collector. term_dict_field_names ( )
303
+ }
304
+ QuickwitAggregations :: TantivyAggregations ( aggregations) => {
305
+ get_term_dict_field_names ( aggregations)
306
+ }
307
+ }
308
+ }
309
+ }
310
+
262
311
/// The quickwit collector is the tantivy Collector used in Quickwit.
263
312
///
264
313
/// It defines the data that should be accumulated about the documents matching
@@ -270,7 +319,7 @@ pub(crate) struct QuickwitCollector {
270
319
pub max_hits : usize ,
271
320
pub sort_by : SortBy ,
272
321
timestamp_filter_builder_opt : Option < TimestampFilterBuilder > ,
273
- pub aggregation : Option < Aggregations > ,
322
+ pub aggregation : Option < QuickwitAggregations > ,
274
323
}
275
324
276
325
impl QuickwitCollector {
@@ -282,21 +331,23 @@ impl QuickwitCollector {
282
331
fast_field_names. insert ( field_name. clone ( ) ) ;
283
332
}
284
333
}
285
- if let Some ( aggregate ) = self . aggregation . as_ref ( ) {
286
- fast_field_names. extend ( get_fast_field_names ( aggregate ) ) ;
334
+ if let Some ( aggregations ) = & self . aggregation {
335
+ fast_field_names. extend ( aggregations . fast_field_names ( ) ) ;
287
336
}
288
337
if let Some ( timestamp_filter_builder) = & self . timestamp_filter_builder_opt {
289
338
fast_field_names. insert ( timestamp_filter_builder. timestamp_field_name . clone ( ) ) ;
290
339
}
291
340
fast_field_names
292
341
}
342
+
293
343
pub fn term_dict_field_names ( & self ) -> HashSet < String > {
294
344
let mut term_dict_field_names = HashSet :: default ( ) ;
295
- if let Some ( aggregate ) = self . aggregation . as_ref ( ) {
296
- term_dict_field_names. extend ( get_term_dict_field_names ( aggregate ) ) ;
345
+ if let Some ( aggregations ) = & self . aggregation {
346
+ term_dict_field_names. extend ( aggregations . term_dict_field_names ( ) ) ;
297
347
}
298
348
term_dict_field_names
299
349
}
350
+
300
351
pub fn warmup_info ( & self ) -> WarmupInfo {
301
352
WarmupInfo {
302
353
term_dict_field_names : self . term_dict_field_names ( ) ,
@@ -323,13 +374,27 @@ impl Collector for QuickwitCollector {
323
374
// starting from 0 for every leaves.
324
375
let leaf_max_hits = self . max_hits + self . start_offset ;
325
376
326
- let timestamp_filter_opt =
327
- if let Some ( timestamp_filter_builder) = & self . timestamp_filter_builder_opt {
328
- timestamp_filter_builder. build ( segment_reader) ?
329
- } else {
330
- None
331
- } ;
332
-
377
+ let timestamp_filter_opt = match & self . timestamp_filter_builder_opt {
378
+ Some ( timestamp_filter_builder) => timestamp_filter_builder. build ( segment_reader) ?,
379
+ None => None ,
380
+ } ;
381
+ let aggregation = match & self . aggregation {
382
+ Some ( QuickwitAggregations :: FindTraceIdsAggregation ( collector) ) => {
383
+ Some ( AggregationSegmentCollectors :: FindTraceIdsSegmentCollector (
384
+ collector. for_segment ( 0 , segment_reader) ?,
385
+ ) )
386
+ }
387
+ Some ( QuickwitAggregations :: TantivyAggregations ( aggs) ) => Some (
388
+ AggregationSegmentCollectors :: TantivyAggregationSegmentCollector (
389
+ AggregationSegmentCollector :: from_agg_req_and_reader (
390
+ aggs,
391
+ segment_reader,
392
+ AGGREGATION_BUCKET_LIMIT ,
393
+ ) ?,
394
+ ) ,
395
+ ) ,
396
+ None => None ,
397
+ } ;
333
398
Ok ( QuickwitSegmentCollector {
334
399
num_hits : 0u64 ,
335
400
split_id : self . split_id . clone ( ) ,
@@ -338,17 +403,7 @@ impl Collector for QuickwitCollector {
338
403
segment_ord,
339
404
max_hits : leaf_max_hits,
340
405
timestamp_filter_opt,
341
- aggregation : self
342
- . aggregation
343
- . as_ref ( )
344
- . map ( |aggs| {
345
- AggregationSegmentCollector :: from_agg_req_and_reader (
346
- aggs,
347
- segment_reader,
348
- AGGREGATION_BUCKET_LIMIT ,
349
- )
350
- } )
351
- . transpose ( ) ?,
406
+ aggregation,
352
407
} )
353
408
}
354
409
@@ -372,7 +427,8 @@ impl Collector for QuickwitCollector {
372
427
// All leaves will return their top [0..max_hits) documents.
373
428
// We compute the overall [0..start_offset + max_hits) documents ...
374
429
let num_hits = self . start_offset + self . max_hits ;
375
- let mut merged_leaf_response = merge_leaf_responses ( segment_fruits?, num_hits) ?;
430
+ let mut merged_leaf_response =
431
+ merge_leaf_responses ( & self . aggregation , segment_fruits?, num_hits) ?;
376
432
// ... and drop the first [..start_offsets) hits.
377
433
merged_leaf_response
378
434
. partial_hits
@@ -388,31 +444,54 @@ impl Collector for QuickwitCollector {
388
444
389
445
/// Merges a set of Leaf Results.
390
446
fn merge_leaf_responses (
447
+ aggregations_opt : & Option < QuickwitAggregations > ,
391
448
leaf_responses : Vec < LeafSearchResponse > ,
392
449
max_hits : usize ,
393
450
) -> tantivy:: Result < LeafSearchResponse > {
394
451
// Optimization: No merging needed if there is only one result.
395
452
if leaf_responses. len ( ) == 1 {
396
453
return Ok ( leaf_responses. into_iter ( ) . next ( ) . unwrap_or_default ( ) ) ; //< default is actually never called
397
454
}
398
- let intermediate_aggregation_results = leaf_responses
399
- . iter ( )
400
- . flat_map ( |leaf_response| {
401
- leaf_response
402
- . intermediate_aggregation_result
403
- . as_ref ( )
404
- . map ( |res| serde_json:: from_str ( res) )
405
- } )
406
- . collect :: < Result < Vec < IntermediateAggregationResults > , _ > > ( ) ?;
407
-
408
- let intermediate_aggregation_result =
409
- intermediate_aggregation_results
410
- . into_iter ( )
411
- . reduce ( |mut res1, res2| {
412
- res1. merge_fruits ( res2) ;
413
- res1
414
- } ) ;
455
+ let merged_intermediate_aggregation_result = match aggregations_opt {
456
+ Some ( QuickwitAggregations :: FindTraceIdsAggregation ( collector) ) => {
457
+ let fruits: Vec <
458
+ <<FindTraceIdsCollector as Collector >:: Child as SegmentCollector >:: Fruit ,
459
+ > = leaf_responses
460
+ . iter ( )
461
+ . filter_map ( |leaf_response| {
462
+ leaf_response. intermediate_aggregation_result . as_ref ( ) . map (
463
+ |intermediate_aggregation_result| {
464
+ serde_json:: from_str ( intermediate_aggregation_result)
465
+ } ,
466
+ )
467
+ } )
468
+ . collect :: < Result < _ , _ > > ( ) ?;
469
+ let merged_fruit = collector. merge_fruits ( fruits) ?;
470
+ Some ( serde_json:: to_string ( & merged_fruit) ?)
471
+ }
472
+ Some ( QuickwitAggregations :: TantivyAggregations ( _) ) => {
473
+ let fruits: Vec < IntermediateAggregationResults > = leaf_responses
474
+ . iter ( )
475
+ . filter_map ( |leaf_response| {
476
+ leaf_response. intermediate_aggregation_result . as_ref ( ) . map (
477
+ |intermediate_aggregation_result| {
478
+ serde_json:: from_str ( intermediate_aggregation_result)
479
+ } ,
480
+ )
481
+ } )
482
+ . collect :: < Result < _ , _ > > ( ) ?;
415
483
484
+ fruits
485
+ . into_iter ( )
486
+ . reduce ( |mut left, right| {
487
+ left. merge_fruits ( right) ;
488
+ left
489
+ } )
490
+ . map ( |merged_fruit| serde_json:: to_string ( & merged_fruit) )
491
+ . transpose ( ) ?
492
+ }
493
+ None => None ,
494
+ } ;
416
495
let num_attempted_splits = leaf_responses
417
496
. iter ( )
418
497
. map ( |leaf_response| leaf_response. num_attempted_splits )
@@ -433,10 +512,7 @@ fn merge_leaf_responses(
433
512
// TODO optimize
434
513
let top_k_partial_hits = top_k_partial_hits ( all_partial_hits, max_hits) ;
435
514
Ok ( LeafSearchResponse {
436
- intermediate_aggregation_result : intermediate_aggregation_result
437
- . as_ref ( )
438
- . map ( serde_json:: to_string)
439
- . transpose ( ) ?,
515
+ intermediate_aggregation_result : merged_intermediate_aggregation_result,
440
516
num_hits,
441
517
partial_hits : top_k_partial_hits,
442
518
failed_splits,
@@ -465,12 +541,10 @@ pub(crate) fn make_collector_for_split(
465
541
search_request : & SearchRequest ,
466
542
split_schema : & Schema ,
467
543
) -> crate :: Result < QuickwitCollector > {
468
- let aggregation = if let Some ( agg) = & search_request. aggregation_request {
469
- Some ( serde_json:: from_str ( agg) ?)
470
- } else {
471
- None
544
+ let aggregation = match & search_request. aggregation_request {
545
+ Some ( aggregation) => Some ( serde_json:: from_str ( aggregation) ?) ,
546
+ None => None ,
472
547
} ;
473
-
474
548
let timestamp_field_opt = doc_mapper. timestamp_field ( split_schema) ;
475
549
let timestamp_filter_builder_opt = TimestampFilterBuilder :: new (
476
550
doc_mapper. timestamp_field_name ( ) ,
0 commit comments