Skip to content

Commit faac031

Browse files
committed
Add support for non-aggregated rows & other feedback improvements
1 parent f838523 commit faac031

File tree

3 files changed

+281
-58
lines changed

3 files changed

+281
-58
lines changed

core/utils/metadata-utils/src/main/java/datawave/query/util/MetadataHelper.java

Lines changed: 45 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import java.io.IOException;
66
import java.nio.charset.CharacterCodingException;
77
import java.time.format.DateTimeParseException;
8+
import java.util.ArrayList;
89
import java.util.Arrays;
910
import java.util.Calendar;
1011
import java.util.Collection;
@@ -40,6 +41,7 @@
4041
import org.apache.accumulo.core.data.PartialKey;
4142
import org.apache.accumulo.core.data.Range;
4243
import org.apache.accumulo.core.data.Value;
44+
import org.apache.accumulo.core.iterators.FirstEntryInRowIterator;
4345
import org.apache.accumulo.core.iterators.ValueFormatException;
4446
import org.apache.accumulo.core.iterators.user.RegExFilter;
4547
import org.apache.accumulo.core.iterators.user.SummingCombiner;
@@ -1714,6 +1716,7 @@ public Set<String> getMissingFieldsInDateRange(Set<String> fields, Set<String> d
17141716
fields = Sets.difference(fields, specialFields);
17151717
Set<Range> ranges = createExactFieldCountRanges(fields);
17161718
StringBuilder dataTypeRegex = new StringBuilder();
1719+
List<IteratorSetting> settings = new ArrayList<>();
17171720

17181721
if (ranges.isEmpty()) {
17191722
return Collections.emptySet();
@@ -1735,22 +1738,55 @@ public Set<String> getMissingFieldsInDateRange(Set<String> fields, Set<String> d
17351738
}
17361739

17371740
try (BatchScanner bs = ScannerHelper.createBatchScanner(client, getMetadataTableName(), getAuths(), fields.size())) {
1738-
IteratorSetting regexIter = new IteratorSetting(50, "regexFilter", RegExFilter.class);
1741+
settings.add(new IteratorSetting(51, "FirstEntryInRow", FirstEntryInRowIterator.class));
1742+
settings.add(new IteratorSetting(50, "regexFilter", RegExFilter.class));
17391743
if (!dataTypeRegex.toString().isEmpty()) {
1740-
regexIter.addOption(RegExFilter.COLQ_REGEX, dataTypeRegex.toString());
1744+
for (IteratorSetting setting : settings) {
1745+
if (setting.getName().equals("regexFilter")) {
1746+
setting.addOption(RegExFilter.COLQ_REGEX, dataTypeRegex.toString());
1747+
break;
1748+
}
1749+
}
17411750
}
17421751
bs.setRanges(ranges);
1743-
bs.addScanIterator(regexIter);
1752+
for (IteratorSetting setting : settings) {
1753+
bs.addScanIterator(setting);
1754+
}
17441755

17451756
for (Entry<Key,Value> entry : bs) {
1757+
Text colq = entry.getKey().getColumnQualifier();
1758+
int colqIndex = colq.find(NULL_BYTE);
1759+
1760+
String remainder;
17461761
try {
1747-
DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get());
1748-
if (!map.subMap(beginDate, endDate).isEmpty()) {
1749-
foundFields.add(entry.getKey().getRow().toString());
1762+
remainder = Text.decode(colq.getBytes(), colqIndex + 1, colq.getLength() - (colqIndex + 1));
1763+
} catch (CharacterCodingException e) {
1764+
log.warn("Could not deserialize colqual: {} ", entry.getKey());
1765+
continue;
1766+
}
1767+
if (remainder.equals(FrequencyMetadataAggregator.AGGREGATED)) {
1768+
// This is an aggregated entry.
1769+
try {
1770+
DateFrequencyMap map = new DateFrequencyMap(entry.getValue().get());
1771+
if (!map.subMap(beginDate, endDate).isEmpty()) {
1772+
foundFields.add(entry.getKey().getRow().toString());
1773+
}
1774+
} catch (IOException e) {
1775+
log.error("Failed to convert Value to DateFrequencyMap", e);
1776+
}
1777+
} else {
1778+
// This is an entry with a count for a single date.
1779+
try {
1780+
Date date = DateHelper.parse(remainder);
1781+
// Add the field if we fall within beginDate and endDate, inclusively.
1782+
if (date.compareTo(DateHelper.parse(beginDate)) >= 0 && date.compareTo(DateHelper.parse(endDate)) <= 0) {
1783+
foundFields.add(entry.getKey().getRow().toString());
1784+
}
1785+
} catch (ValueFormatException e) {
1786+
log.warn("Could not convert the Value to a long: {}", entry.getValue());
1787+
} catch (DateTimeParseException e) {
1788+
log.warn("Could not convert date string: {}", remainder);
17501789
}
1751-
} catch (IOException e) {
1752-
log.trace("Could not convert the Value to a DateFrequencyMap: {}", entry.getValue());
1753-
log.error("Failed to convert Value to DateFrequencyMap", e);
17541790
}
17551791
}
17561792
} catch (TableNotFoundException e) {

core/utils/metadata-utils/src/test/java/datawave/query/util/MetadataHelperTest.java

Lines changed: 96 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -392,4 +392,100 @@ void testMixedEntryFormats() {
392392
Assertions.assertEquals(DateHelper.parse("20200103"), helper.getEarliestOccurrenceOfFieldWithType("NAME", "maze", accumuloClient, null));
393393
}
394394
}
395+
396+
/**
397+
* Tests for {@link MetadataHelper#getMissingFieldsInDateRange(Set, Set, String, String, Set)}.
* <p>
* Each test registers frequency rows in one entry format through the {@code given*} helpers, calls {@code writeMutations()}, and then asserts the same four
* scenarios: an empty datatype set, a single-datatype filter, a field that was never given any rows, and a 2019 date range while every row is dated in 2020.
398+
*/
399+
@Nested
400+
public class GetMissingFieldsInDateRangeTest {
401+
/**
402+
* Test against a table that has only non-aggregated entries as matches.
403+
*/
404+
@Test
405+
void testNonAggregatedEntriesOnly() throws TableNotFoundException {
406+
givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200103", "20200120", 1L);
407+
givenNonAggregatedFrequencyRows("NAME", COLF_F, "wiki", "20200101", "20200120", 2L);
408+
givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200105", "20200120", 3L);
409+
// NOTE(review): the two date arguments here are "20200107" then "20200102", i.e. the first is later than the second, unlike every other
// givenNonAggregatedFrequencyRows call in this class. Confirm this is intentional and not a typo; the "Using DataTypes" assertion below
// depends on NAME being found for the "data" datatype.
givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200107", "20200102", 3L);
410+
givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L);
411+
givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L);
412+
givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L);
413+
writeMutations();
414+
415+
// No DataTypes: an empty datatype set is passed and no field is expected to be reported missing.
416+
Assertions.assertEquals(Collections.emptySet(), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Collections.emptySet(), "20200101",
417+
"20200120", Collections.emptySet()));
418+
// Using DataTypes: only NAME is given rows for the "data" datatype, so EVENT_DATE is expected to be reported missing.
419+
Assertions.assertEquals(Set.of("EVENT_DATE"),
420+
helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Set.of("data"), "20200101", "20200120", Collections.emptySet()));
421+
// Fictitious field: FOO is never given any rows, so it is the only field expected to be reported missing.
422+
Assertions.assertEquals(Set.of("FOO"), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE", "FOO"),
423+
Set.of("wiki", "data", "csv", "maze"), "20200101", "20200120", Collections.emptySet()));
424+
// Missing because of date range: every row above is dated in 2020, so a 2019 range is expected to miss both fields.
425+
Assertions.assertEquals(Set.of("NAME", "EVENT_DATE"), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Set.of("wiki", "data"),
426+
"20190101", "20191231", Collections.emptySet()));
427+
}
428+
429+
/**
430+
* Test against a table that has only aggregated entries as matches.
431+
*/
432+
@Test
433+
void testAggregatedEntriesOnly() throws TableNotFoundException {
434+
givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200113", 1L, "20200115", 5L, "20200116", 3L));
435+
givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200111", 1L, "20200112", 15L, "20200113", 3L));
436+
givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200102", 1L, "20200104", 55L, "20200105", 3L));
437+
givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200101", 1L, "20200103", 3L));
438+
givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L));
439+
givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L));
440+
givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L));
441+
writeMutations();
442+
443+
// No DataTypes: an empty datatype set is passed and no field is expected to be reported missing.
444+
Assertions.assertEquals(Collections.emptySet(), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Collections.emptySet(), "20200101",
445+
"20200120", Collections.emptySet()));
446+
// Using DataTypes: only NAME is given a row for the "data" datatype, so EVENT_DATE is expected to be reported missing.
447+
Assertions.assertEquals(Set.of("EVENT_DATE"),
448+
helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Set.of("data"), "20200101", "20200120", Collections.emptySet()));
449+
// Fictitious field: FOO is never given any rows, so it is the only field expected to be reported missing.
450+
Assertions.assertEquals(Set.of("FOO"), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE", "FOO"),
451+
Set.of("wiki", "data", "csv", "maze"), "20200101", "20200120", Collections.emptySet()));
452+
// Missing because of date range: every date in the frequency maps above is in 2020, so a 2019 range is expected to miss both fields.
453+
Assertions.assertEquals(Set.of("NAME", "EVENT_DATE"), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Set.of("wiki", "data"),
454+
"20190101", "20191231", Collections.emptySet()));
455+
}
456+
457+
/**
458+
* Test against a table that has both aggregated and non-aggregated entries as matches.
459+
*/
460+
@Test
461+
void testMixedEntryFormats() throws TableNotFoundException {
462+
givenAggregatedFrequencyRow("NAME", COLF_F, "csv", createDateFrequencyMap("20200111", 1L, "20200112", 5L, "20200113", 3L));
463+
givenNonAggregatedFrequencyRows("NAME", COLF_F, "csv", "20200111", "20200120", 1L);
464+
givenAggregatedFrequencyRow("NAME", COLF_F, "wiki", createDateFrequencyMap("20200111", 1L, "20200112", 15L, "20200113", 3L));
465+
givenAggregatedFrequencyRow("NAME", COLF_F, "maze", createDateFrequencyMap("20200111", 1L, "20200112", 55L, "20200113", 3L));
466+
givenNonAggregatedFrequencyRows("NAME", COLF_F, "maze", "20200103", "20200120", 3L);
467+
givenAggregatedFrequencyRow("NAME", COLF_F, "data", createDateFrequencyMap("20200111", 1L, "20200113", 3L));
468+
givenNonAggregatedFrequencyRows("NAME", COLF_F, "data", "20200101", "20200115", 3L);
469+
givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "csv", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L));
470+
givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "wiki", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L));
471+
givenAggregatedFrequencyRow("EVENT_DATE", COLF_F, "maze", createDateFrequencyMap("20200101", 2L, "20200102", 3L, "20200103", 4L));
472+
givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "csv", "20200101", "20200120", 4L);
473+
givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "wiki", "20200101", "20200120", 5L);
474+
givenNonAggregatedFrequencyRows("EVENT_DATE", COLF_F, "maze", "20200101", "20200120", 6L);
475+
writeMutations();
476+
477+
// No DataTypes: an empty datatype set is passed and no field is expected to be reported missing.
478+
Assertions.assertEquals(Collections.emptySet(), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Collections.emptySet(), "20200101",
479+
"20200120", Collections.emptySet()));
480+
// Using DataTypes: only NAME is given rows (in both formats) for the "data" datatype, so EVENT_DATE is expected to be reported missing.
481+
Assertions.assertEquals(Set.of("EVENT_DATE"),
482+
helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Set.of("data"), "20200101", "20200120", Collections.emptySet()));
483+
// Fictitious field: FOO is never given any rows, so it is the only field expected to be reported missing.
484+
Assertions.assertEquals(Set.of("FOO"), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE", "FOO"),
485+
Set.of("wiki", "data", "csv", "maze"), "20200101", "20200120", Collections.emptySet()));
486+
// Missing because of date range: every row above is dated in 2020, so a 2019 range is expected to miss both fields.
487+
Assertions.assertEquals(Set.of("NAME", "EVENT_DATE"), helper.getMissingFieldsInDateRange(Set.of("NAME", "EVENT_DATE"), Set.of("wiki", "data"),
488+
"20190101", "20191231", Collections.emptySet()));
489+
}
490+
}
395491
}

0 commit comments

Comments
 (0)