forked from gstamatelat/random-sampling
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement ParetoSampling (gstamatelat#36)
- Loading branch information
1 parent
747dd5f
commit f0075b9
Showing
6 changed files
with
295 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,261 @@ | ||
package gr.james.sampling; | ||
|
||
import java.util.*; | ||
|
||
/** | ||
* Implementation of the algorithm by Rosén in <b>On sampling with probability proportional to size</b>. | ||
* <p> | ||
* Weighted are not being assigned a particular meaning or have physical interpretation but the resulting inclusion | ||
* probabilities are an approximation of the exact model ({@link ChaoSampling}). Weights must be in the range (0,+Inf) | ||
* but not the value {@code 1.0}, otherwise an {@link IllegalWeightException} is thrown. A side effect of this is that | ||
* the signatures {@link #feed(Object)}, {@link #feed(Iterable)} and {@link #feed(Iterator)} will always throw | ||
* {@link IllegalWeightException}. | ||
* <p> | ||
* This implementation never throws {@link StreamOverflowException}. | ||
* <p> | ||
* The space complexity of this class is {@code O(k)}, where {@code k} is the sample size. | ||
* | ||
* @param <T> the item type | ||
* @author Giorgos Stamatelatos | ||
* @see <a href="https://doi.org/10.1016/S0378-3758(96)00185-1">Asymptotic theory for order sampling</a> | ||
* @see <a href="https://doi.org/10.1016/S0378-3758(96)00186-3">On sampling with probability proportional to size</a> | ||
*/ | ||
public class ParetoSampling<T> implements WeightedRandomSampling<T> { | ||
private final int sampleSize; | ||
private final Random random; | ||
private final PriorityQueue<Weighted<T>> pq; | ||
private final Collection<T> unmodifiableSample; | ||
private long streamSize; | ||
|
||
/** | ||
* Construct a new instance of {@link ParetoSampling} using the specified sample size and RNG. The implementation | ||
* assumes that {@code random} conforms to the contract of {@link Random} and will perform no checks to ensure | ||
* that. If this contract is violated, the behavior is undefined. | ||
* | ||
* @param sampleSize the sample size | ||
* @param random the RNG to use | ||
* @throws NullPointerException if {@code random} is {@code null} | ||
* @throws IllegalArgumentException if {@code sampleSize} is less than 1 | ||
*/ | ||
public ParetoSampling(int sampleSize, Random random) { | ||
if (random == null) { | ||
throw new NullPointerException("Random was null"); | ||
} | ||
if (sampleSize < 1) { | ||
throw new IllegalArgumentException("Sample size was less than 1"); | ||
} | ||
this.random = random; | ||
this.sampleSize = sampleSize; | ||
this.streamSize = 0; | ||
this.pq = new PriorityQueue<>(sampleSize, Comparator.reverseOrder()); | ||
this.unmodifiableSample = new AbstractCollection<T>() { | ||
@Override | ||
public Iterator<T> iterator() { | ||
return new Iterator<T>() { | ||
final Iterator<Weighted<T>> it = pq.iterator(); | ||
|
||
@Override | ||
public boolean hasNext() { | ||
return it.hasNext(); | ||
} | ||
|
||
@Override | ||
public T next() { | ||
return it.next().object; | ||
} | ||
}; | ||
} | ||
|
||
@Override | ||
public int size() { | ||
return pq.size(); | ||
} | ||
}; | ||
} | ||
|
||
/** | ||
* Construct a new instance of {@link ParetoSampling} using the specified sample size and a default source of | ||
* randomness. | ||
* | ||
* @param sampleSize the sample size | ||
* @throws IllegalArgumentException if {@code sampleSize} is less than 1 | ||
*/ | ||
public ParetoSampling(int sampleSize) { | ||
this(sampleSize, new Random()); | ||
} | ||
|
||
/** | ||
* Get a {@link RandomSamplingCollector} from this class. | ||
* | ||
* @param sampleSize the sample size | ||
* @param random the RNG to use | ||
* @param <E> the type of elements | ||
* @return a {@link RandomSamplingCollector} from this class | ||
*/ | ||
public static <E> RandomSamplingCollector<E> collector(int sampleSize, Random random) { | ||
return new RandomSamplingCollector<>(() -> new ParetoSampling<>(sampleSize, random)); | ||
} | ||
|
||
/** | ||
* Get a {@link WeightedRandomSamplingCollector} from this class. | ||
* | ||
* @param sampleSize the sample size | ||
* @param random the RNG to use | ||
* @param <E> the type of elements | ||
* @return a {@link WeightedRandomSamplingCollector} from this class | ||
*/ | ||
public static <E> WeightedRandomSamplingCollector<E> weightedCollector(int sampleSize, Random random) { | ||
return new WeightedRandomSamplingCollector<>(() -> new ParetoSampling<>(sampleSize, random)); | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* @param item {@inheritDoc} | ||
* @param weight {@inheritDoc} | ||
* @return {@inheritDoc} | ||
* @throws NullPointerException {@inheritDoc} | ||
* @throws IllegalWeightException if {@code weight} is outside the range (0,+Inf) | ||
*/ | ||
@Override | ||
public boolean feed(T item, double weight) { | ||
// Checks | ||
if (item == null) { | ||
throw new NullPointerException("Item was null"); | ||
} | ||
if (weight <= 0) { | ||
throw new IllegalWeightException("Weight was not positive, must be in (0,+Inf) but not 1"); | ||
} | ||
if (Double.isInfinite(weight)) { | ||
throw new IllegalWeightException("Weight was infinite, must be in (0,+Inf) but not 1"); | ||
} | ||
if (weight == 1.0) { | ||
throw new IllegalWeightException("Weight was 1, must be in (0,+Inf) but not 1"); | ||
} | ||
|
||
// Produce a random value | ||
final double r = RandomSamplingUtils.randomExclusive(random); | ||
|
||
// Increase stream size | ||
this.streamSize++; | ||
|
||
// Calculate item weight | ||
final Weighted<T> newItem = new Weighted<>(item, (r * (1 - weight)) / ((1 - r) * weight)); | ||
assert newItem.weight >= 0.0; // weight can also be 0.0 because of double precision | ||
|
||
// Add item to reservoir | ||
if (pq.size() < sampleSize) { | ||
pq.add(newItem); | ||
return true; | ||
} else if (pq.peek().weight > newItem.weight) { | ||
// Seems unfair for equal weight items to not have a chance to get in the sample | ||
// Of course in the long run it hardly matters | ||
assert pq.size() == sampleSize(); | ||
pq.poll(); | ||
pq.add(newItem); | ||
return true; | ||
} | ||
|
||
return false; | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* @param items {@inheritDoc} | ||
* @param weights {@inheritDoc} | ||
* @return {@inheritDoc} | ||
* @throws NullPointerException {@inheritDoc} | ||
* @throws IllegalArgumentException {@inheritDoc} | ||
* @throws IllegalWeightException {@inheritDoc} | ||
*/ | ||
@Override | ||
public boolean feed(Iterator<T> items, Iterator<Double> weights) { | ||
return WeightedRandomSampling.super.feed(items, weights); | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* @param items {@inheritDoc} | ||
* @return {@inheritDoc} | ||
* @throws NullPointerException {@inheritDoc} | ||
* @throws IllegalWeightException {@inheritDoc} | ||
*/ | ||
@Override | ||
public boolean feed(Map<T, Double> items) { | ||
return WeightedRandomSampling.super.feed(items); | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* @return {@inheritDoc} | ||
*/ | ||
@Override | ||
public Collection<T> sample() { | ||
return this.unmodifiableSample; | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* @return {@inheritDoc} | ||
*/ | ||
@Override | ||
public final int sampleSize() { | ||
assert this.sampleSize > 0; | ||
return this.sampleSize; | ||
} | ||
|
||
/** | ||
* Get the number of items that have been feeded to the algorithm during the lifetime of this instance. | ||
* <p> | ||
* If more than {@link Long#MAX_VALUE} items has been feeded to the instance, {@code streamSize()} will cycle the | ||
* long values, continuing from {@link Long#MIN_VALUE}. | ||
* <p> | ||
* This method runs in constant time. | ||
* | ||
* @return the number of items that have been feeded to the algorithm | ||
*/ | ||
@Override | ||
public final long streamSize() { | ||
return this.streamSize; | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* @param item {@inheritDoc} | ||
* @return {@inheritDoc} | ||
* @throws NullPointerException {@inheritDoc} | ||
*/ | ||
@Override | ||
public boolean feed(T item) { | ||
return WeightedRandomSampling.super.feed(item); | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* @param items {@inheritDoc} | ||
* @return {@inheritDoc} | ||
* @throws NullPointerException {@inheritDoc} | ||
*/ | ||
@Override | ||
public boolean feed(Iterator<T> items) { | ||
return WeightedRandomSampling.super.feed(items); | ||
} | ||
|
||
/** | ||
* {@inheritDoc} | ||
* | ||
* @param items {@inheritDoc} | ||
* @return {@inheritDoc} | ||
* @throws NullPointerException {@inheritDoc} | ||
*/ | ||
@Override | ||
public boolean feed(Iterable<T> items) { | ||
return WeightedRandomSampling.super.feed(items); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters