Skip to content
This repository was archived by the owner on Nov 11, 2022. It is now read-only.

Commit baa8e2f

Browse files
lukecwikdavorbonaci
authored andcommitted
Handle when Dataflow service tells us that values are sorted in GBK
----Release Notes---- [] ------------- Created by MOE: https://github.com/google/moe MOE_MIGRATED_REVID=112725244
1 parent 054f1e6 commit baa8e2f

File tree

9 files changed

+263
-154
lines changed

9 files changed

+263
-154
lines changed

sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/GroupingShuffleReader.java

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@
2121
import static com.google.cloud.dataflow.sdk.runners.worker.SourceTranslationUtils.cloudProgressToReaderProgress;
2222
import static com.google.cloud.dataflow.sdk.runners.worker.SourceTranslationUtils.splitRequestToApproximateSplitRequest;
2323
import static com.google.cloud.dataflow.sdk.util.common.Counter.AggregationKind.SUM;
24+
import static com.google.common.base.Preconditions.checkState;
2425

2526
import com.google.api.services.dataflow.model.ApproximateReportedProgress;
2627
import com.google.api.services.dataflow.model.ApproximateSplitRequest;
2728
import com.google.cloud.dataflow.sdk.coders.Coder;
29+
import com.google.cloud.dataflow.sdk.coders.Coder.Context;
2830
import com.google.cloud.dataflow.sdk.coders.IterableCoder;
2931
import com.google.cloud.dataflow.sdk.coders.KvCoder;
3032
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
@@ -54,6 +56,7 @@
5456
import org.slf4j.Logger;
5557
import org.slf4j.LoggerFactory;
5658

59+
import java.io.ByteArrayInputStream;
5760
import java.io.IOException;
5861
import java.util.Iterator;
5962
import java.util.concurrent.atomic.AtomicLong;
@@ -80,7 +83,8 @@ public class GroupingShuffleReader<K, V> extends NativeReader<WindowedValue<KV<K
8083
// Counts how many bytes were from by a given operation from a given shuffle session.
8184
@Nullable Counter<Long> perOperationPerDatasetBytesCounter;
8285
Coder<K> keyCoder;
83-
Coder<V> valueCoder;
86+
Coder<?> valueCoder;
87+
@Nullable Coder<?> secondaryKeyCoder;
8488

8589
public GroupingShuffleReader(
8690
PipelineOptions options,
@@ -90,15 +94,16 @@ public GroupingShuffleReader(
9094
Coder<WindowedValue<KV<K, Iterable<V>>>> coder,
9195
BatchModeExecutionContext executionContext,
9296
CounterSet.AddCounterMutator addCounterMutator,
93-
String operationName)
97+
String operationName,
98+
boolean valuesAreSorted)
9499
throws Exception {
95100
this.shuffleReaderConfig = shuffleReaderConfig;
96101
this.startShufflePosition = startShufflePosition;
97102
this.stopShufflePosition = stopShufflePosition;
98103
this.executionContext = executionContext;
99104
this.addCounterMutator = addCounterMutator;
100105
this.operationName = operationName;
101-
initCoder(coder);
106+
initCoder(coder, valuesAreSorted);
102107
// We cannot initialize perOperationPerDatasetBytesCounter here, as it
103108
// depends on shuffleReaderConfig, which isn't populated yet.
104109
}
@@ -131,7 +136,8 @@ public GroupingShuffleReaderIterator<K, V> iterator() throws IOException {
131136
new ChunkingShuffleBatchReader(asr)));
132137
}
133138

134-
private void initCoder(Coder<WindowedValue<KV<K, Iterable<V>>>> coder) throws Exception {
139+
private void initCoder(Coder<WindowedValue<KV<K, Iterable<V>>>> coder,
140+
boolean valuesAreSorted) throws Exception {
135141
if (!(coder instanceof WindowedValueCoder)) {
136142
throw new Exception("unexpected kind of coder for WindowedValue: " + coder);
137143
}
@@ -151,7 +157,17 @@ private void initCoder(Coder<WindowedValue<KV<K, Iterable<V>>>> coder) throws Ex
151157
+ "a key-grouping shuffle");
152158
}
153159
IterableCoder<V> iterCoder = (IterableCoder<V>) kvValueCoder;
154-
this.valueCoder = iterCoder.getElemCoder();
160+
if (valuesAreSorted) {
161+
checkState(iterCoder.getElemCoder() instanceof KvCoder,
162+
"unexpected kind of coder for elements read from a "
163+
+ "key-grouping value sorting shuffle: %s", iterCoder.getElemCoder());
164+
@SuppressWarnings("rawtypes")
165+
KvCoder<?, ?> valueKvCoder = (KvCoder) iterCoder.getElemCoder();
166+
this.secondaryKeyCoder = valueKvCoder.getKeyCoder();
167+
this.valueCoder = valueKvCoder.getValueCoder();
168+
} else {
169+
this.valueCoder = iterCoder.getElemCoder();
170+
}
155171
}
156172

157173
final GroupingShuffleReaderIterator<K, V> iterator(ShuffleEntryReader reader) {
@@ -390,7 +406,20 @@ public V next() {
390406
// notify the bytes that have been read so far.
391407
notifyValueReturned(currentGroupSize.getAndSet(0L));
392408
try {
393-
return CoderUtils.decodeFromByteArray(parentReader.valueCoder, entry.getValue());
409+
if (parentReader.secondaryKeyCoder != null) {
410+
ByteArrayInputStream bais = new ByteArrayInputStream(entry.getSecondaryKey());
411+
@SuppressWarnings("unchecked")
412+
V value = (V) KV.of(
413+
// We ignore decoding the timestamp.
414+
parentReader.secondaryKeyCoder.decode(bais, Context.NESTED),
415+
CoderUtils.decodeFromByteArray(parentReader.valueCoder, entry.getValue()));
416+
return value;
417+
} else {
418+
@SuppressWarnings("unchecked")
419+
V value = (V) CoderUtils.decodeFromByteArray(parentReader.valueCoder,
420+
entry.getValue());
421+
return value;
422+
}
394423
} catch (IOException exn) {
395424
throw new RuntimeException(exn);
396425
}

sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/GroupingShuffleReaderFactory.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package com.google.cloud.dataflow.sdk.runners.worker;
1818

1919
import static com.google.api.client.util.Base64.decodeBase64;
20+
import static com.google.cloud.dataflow.sdk.util.Structs.getBoolean;
2021
import static com.google.cloud.dataflow.sdk.util.Structs.getString;
2122

2223
import com.google.cloud.dataflow.sdk.coders.Coder;
@@ -39,7 +40,6 @@
3940
* Creates a GroupingShuffleReader from a CloudObject spec.
4041
*/
4142
public class GroupingShuffleReaderFactory implements ReaderFactory {
42-
4343
@Override
4444
public NativeReader<?> create(
4545
CloudObject spec,
@@ -69,7 +69,8 @@ public <K, V> GroupingShuffleReader<K, V> createTyped(
6969
decodeBase64(getString(spec, PropertyNames.SHUFFLE_READER_CONFIG)),
7070
getString(spec, PropertyNames.START_SHUFFLE_POSITION, null),
7171
getString(spec, PropertyNames.END_SHUFFLE_POSITION, null), coder,
72-
(BatchModeExecutionContext) executionContext, addCounterMutator, operationName);
72+
(BatchModeExecutionContext) executionContext, addCounterMutator, operationName,
73+
getBoolean(spec, PropertyNames.SORT_VALUES, false));
7374
}
7475

7576
return new GroupingShuffleReader<K, V>(options,
@@ -78,7 +79,8 @@ public <K, V> GroupingShuffleReader<K, V> createTyped(
7879
getString(spec, PropertyNames.END_SHUFFLE_POSITION, null),
7980
coder,
8081
(BatchModeExecutionContext) executionContext,
81-
addCounterMutator, operationName);
82+
addCounterMutator, operationName,
83+
getBoolean(spec, PropertyNames.SORT_VALUES, false));
8284
}
8385

8486
/**

sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/GroupingShuffleReaderWithFaultyBytesReadCounter.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,11 @@ public GroupingShuffleReaderWithFaultyBytesReadCounter(
4747
Coder<WindowedValue<KV<K, Iterable<V>>>> coder,
4848
BatchModeExecutionContext executionContext,
4949
CounterSet.AddCounterMutator addCounterMutator,
50-
String operationName)
50+
String operationName,
51+
boolean sortValues)
5152
throws Exception {
5253
super(options, shuffleReaderConfig, startShufflePosition, stopShufflePosition, coder,
53-
executionContext, addCounterMutator, operationName);
54+
executionContext, addCounterMutator, operationName, sortValues);
5455
}
5556

5657
@Override

sdk/src/main/java/com/google/cloud/dataflow/sdk/runners/worker/ShuffleSink.java

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import com.google.cloud.dataflow.sdk.coders.BigEndianLongCoder;
2222
import com.google.cloud.dataflow.sdk.coders.Coder;
23+
import com.google.cloud.dataflow.sdk.coders.Coder.Context;
2324
import com.google.cloud.dataflow.sdk.coders.InstantCoder;
2425
import com.google.cloud.dataflow.sdk.coders.KvCoder;
2526
import com.google.cloud.dataflow.sdk.options.DataflowWorkerHarnessOptions;
@@ -36,6 +37,7 @@
3637
import com.google.cloud.dataflow.sdk.values.KV;
3738
import com.google.common.base.Preconditions;
3839

40+
import java.io.ByteArrayOutputStream;
3941
import java.io.IOException;
4042

4143
/**
@@ -220,14 +222,18 @@ public long add(WindowedValue<T> windowedElem) throws IOException {
220222
Object sortKey = kvValue.getKey();
221223
Object sortValue = kvValue.getValue();
222224

223-
// TODO: Need to coordinate with the
224-
// GroupingShuffleReader, to make sure it knows how to
225-
// reconstruct the value from the sortKeyBytes and
226-
// sortValueBytes. Right now, it doesn't know between
227-
// sorting and non-sorting GBKs.
228-
secondaryKeyBytes = CoderUtils.encodeToByteArray(sortKeyCoder, sortKey);
225+
// Sort values by key and then timestamp so that any GroupAlsoByWindows
226+
// can run more efficiently.
227+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
228+
sortKeyCoder.encode(sortKey, baos, Context.NESTED);
229+
if (!windowedElem.getTimestamp().equals(BoundedWindow.TIMESTAMP_MIN_VALUE)) {
230+
// Empty timestamp suffixes sort before all other sort value keys with
231+
// the same prefix. So We can omit this suffix for this common value here
232+
// for efficiency and only encode when its not the minimum timestamp.
233+
InstantCoder.of().encode(windowedElem.getTimestamp(), baos, Context.OUTER);
234+
}
235+
secondaryKeyBytes = baos.toByteArray();
229236
valueBytes = CoderUtils.encodeToByteArray(sortValueCoder, sortValue);
230-
231237
} else if (groupValues) {
232238
// Sort values by timestamp so that GroupAlsoByWindows can run efficiently.
233239
if (windowedElem.getTimestamp().equals(BoundedWindow.TIMESTAMP_MIN_VALUE)) {

sdk/src/main/java/com/google/cloud/dataflow/sdk/util/PropertyNames.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ public class PropertyNames {
8484
public static final String SHUFFLE_KIND = "shuffle_kind";
8585
public static final String SHUFFLE_READER_CONFIG = "shuffle_reader_config";
8686
public static final String SHUFFLE_WRITER_CONFIG = "shuffle_writer_config";
87+
public static final String SORT_VALUES = "sort_values";
8788
public static final String START_INDEX = "start_index";
8889
public static final String START_OFFSET = "start_offset";
8990
public static final String START_SHUFFLE_POSITION = "start_shuffle_position";

0 commit comments

Comments
 (0)