From e10db8aba06b1c9a6746efd451c9cfb995806b8f Mon Sep 17 00:00:00 2001 From: Giorgos Stamatelatos Date: Mon, 2 Jul 2018 12:29:42 +0300 Subject: [PATCH] Implement Li's Algorithm L as LiLSampling (#22) --- README.md | 12 +++- .../java/gr/james/sampling/LiLSampling.java | 63 +++++++++++++++++++ .../gr/james/sampling/VitterXSampling.java | 10 +-- .../gr/james/sampling/VitterZSampling.java | 10 +-- .../gr/james/sampling/WatermanSampling.java | 6 +- .../java/gr/james/sampling/package-info.java | 1 + .../java/gr/james/sampling/Benchmark.java | 2 + .../gr/james/sampling/RandomSamplingTest.java | 3 + 8 files changed, 92 insertions(+), 15 deletions(-) create mode 100644 src/main/java/gr/james/sampling/LiLSampling.java diff --git a/README.md b/README.md index 1ba16f4..a25fa8c 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ System.out.println(sample); | `WatermanSampling` | Algorithm R by Waterman | `O(k)` | | | `VitterXSampling` | Algorithm X by Vitter | `O(k)` | | | `VitterZSampling` | Algorithm Z by Vitter | `O(k)` | | +| `LiLSampling` | Algorithm L by Li | `O(k)` | | | `EfraimidisSampling` | Algorithm A-Res by Efraimidis | `O(k)` | ✔ | | `ChaoSampling` | Algorithm by Chao | `O(k)` | ✔ | @@ -135,14 +136,21 @@ Signature: `VitterZSampling` implements `RandomSampling` #### References - [Vitter, Jeffrey S. "Random sampling with a reservoir." ACM Transactions on Mathematical Software (TOMS) 11.1 (1985): 37-57.](https://doi.org/10.1145/3147.3165) -### 4 Algorithm A-Res by Efraimidis +### 4 Algorithm L by Li + +Signature: `LiLSampling` implements `RandomSampling` + +#### References +- [Li, Kim-Hung. "Reservoir-sampling algorithms of time complexity O (n (1+ log (N/n)))." ACM Transactions on Mathematical Software (TOMS) 20.4 (1994): 481-493.](https://doi.org/10.1145/198429.198435) + +### 5 Algorithm A-Res by Efraimidis Signature: `EfraimidisSampling` implements `WeightedRandomSampling` #### References - [Efraimidis, Pavlos S., and Paul G. Spirakis. 
"Weighted random sampling with a reservoir." Information Processing Letters 97.5 (2006): 181-185.](https://doi.org/10.1016/j.ipl.2005.11.003) -### 5 Algorithm by Chao +### 6 Algorithm by Chao Signature: `ChaoSampling` implements `WeightedRandomSampling` diff --git a/src/main/java/gr/james/sampling/LiLSampling.java b/src/main/java/gr/james/sampling/LiLSampling.java new file mode 100644 index 0000000..55b90c6 --- /dev/null +++ b/src/main/java/gr/james/sampling/LiLSampling.java @@ -0,0 +1,63 @@ +package gr.james.sampling; + +import java.util.Random; + +/** + * Implementation of Algorithm L by Li in Reservoir-sampling algorithms of time complexity + * O(n(1 + log(N/n))). + *

+ * Unlike {@link WatermanSampling}, the {@link VitterXSampling}, {@link VitterZSampling} and {@code LiLSampling} + * algorithms decide how many items to skip, rather than deciding whether or not to skip an item each time it is fed. + * This property allows these algorithms to perform better by efficiently calculating the number of items that need to + * be skipped, while making fewer calls to the RNG. + *

+ * This implementation throws {@link StreamOverflowException} if more than {@link Long#MAX_VALUE} items are fed. + *

+ * The space complexity of this class is {@code O(k)}, where {@code k} is the sample size.
+ *
+ * @param <T> the item type
+ * @author Giorgos Stamatelatos
+ * @see <a href="https://doi.org/10.1145/198429.198435">Reservoir-sampling algorithms of time complexity
+ * O(n(1 + log(N/n)))</a>
+ */
+public class LiLSampling<T> extends AbstractRandomSampling<T> {
+    /**
+     * State variable {@code W} of Algorithm L. It is seeded in {@link #init(int, Random)} and multiplied by a fresh
+     * {@code u^(1/sampleSize)} factor, with {@code u} uniform in (0, 1), on every call of
+     * {@link #skipLength(long, int, Random)}, so it only ever decreases.
+     */
+    private double W;
+
+    /**
+     * Construct a new instance of {@link LiLSampling} using the specified sample size and RNG. The implementation
+     * assumes that {@code random} conforms to the contract of {@link Random} and will perform no checks to ensure that.
+     * If this contract is violated, the behavior is undefined.
+     *
+     * @param sampleSize the sample size
+     * @param random     the RNG to use
+     * @throws NullPointerException     if {@code random} is {@code null}
+     * @throws IllegalArgumentException if {@code sampleSize} is less than 1
+     */
+    public LiLSampling(int sampleSize, Random random) {
+        super(sampleSize, random);
+    }
+
+    /**
+     * Get a {@link RandomSamplingCollector} from this class.
+     *
+     * @param sampleSize the sample size
+     * @param random     the RNG to use
+     * @param <E>        the type of elements
+     * @return a {@link RandomSamplingCollector} from this class
+     */
+    public static <E> RandomSamplingCollector<E> collector(int sampleSize, Random random) {
+        return new RandomSamplingCollector<>(() -> new LiLSampling<>(sampleSize, random));
+    }
+
+    /**
+     * Draw a uniform value strictly inside (0, 1).
+     * <p>
+     * {@link Random#nextDouble()} may return exactly {@code 0.0}, whose logarithm is negative infinity. Such a draw
+     * would collapse {@code W} to zero or turn the skip quotient into a non-finite value, so zero is rejected and
+     * redrawn. Any other draw is returned unchanged, so the sequence of values consumed from a seeded RNG only differs
+     * when a zero is actually produced.
+     *
+     * @param random the RNG to use
+     * @return a value {@code u} with {@code 0 < u < 1}
+     */
+    private static double nextPositiveDouble(Random random) {
+        double u;
+        do {
+            u = random.nextDouble();
+        } while (u == 0.0);
+        return u;
+    }
+
+    /**
+     * Seed {@code W} with {@code u^(1/sampleSize)}, where {@code u} is uniform in (0, 1).
+     *
+     * @param sampleSize the sample size
+     * @param random     the RNG to use
+     */
+    @Override
+    void init(int sampleSize, Random random) {
+        W = Math.exp(Math.log(nextPositiveDouble(random)) / sampleSize);
+    }
+
+    /**
+     * Compute the next skip length as {@code floor(log(u) / log(1 - W))} and then advance {@code W}.
+     * <p>
+     * The result is never negative: the numerator is a finite negative number and the denominator is never positive.
+     * A quotient too large for a {@code long} (including positive infinity) saturates at {@link Long#MAX_VALUE} through
+     * the narrowing cast.
+     *
+     * @param streamSize the stream size (not used by this algorithm)
+     * @param sampleSize the sample size
+     * @param random     the RNG to use
+     * @return the non-negative skip length
+     */
+    @Override
+    long skipLength(long streamSize, int sampleSize, Random random) {
+        // log1p(-W) instead of log(1 - W): once W drops below the spacing of doubles near 1.0, the expression 1 - W
+        // rounds to exactly 1.0 and log(1.0) is +0.0, which would flip the quotient to -Infinity (Long.MIN_VALUE
+        // after the cast). log1p keeps the denominator negative (about -W) for small W.
+        final double quotient = Math.log(nextPositiveDouble(random)) / Math.log1p(-W);
+        // W is advanced after the quotient is taken, exactly as before; the two RNG draws keep their order.
+        W = W * Math.exp(Math.log(nextPositiveDouble(random)) / sampleSize);
+        return (long) quotient;
+    }
+}
diff --git a/src/main/java/gr/james/sampling/VitterXSampling.java b/src/main/java/gr/james/sampling/VitterXSampling.java
index 9c34303..bc81771 100644
--- a/src/main/java/gr/james/sampling/VitterXSampling.java
+++ b/src/main/java/gr/james/sampling/VitterXSampling.java
@@ -5,12 +5,12 @@
 /**
* Implementation of Algorithm X by Vitter in Random Sampling with a Reservoir. *

- * Unlike {@link WatermanSampling}, the {@code VitterXSampling} and {@link VitterZSampling} algorithms decide how many - * items to skip, rather than deciding whether or not to skip an item each time it is feeded. This property allows these - * algorithms to perform better by efficiently calculating the number of items that need to be skipped, while making - * fewer calls to the RNG. + * Unlike {@link WatermanSampling}, the {@code VitterXSampling}, {@link VitterZSampling} and {@link LiLSampling} + * algorithms decide how many items to skip, rather than deciding whether or not to skip an item each time it is feeded. + * This property allows these algorithms to perform better by efficiently calculating the number of items that need to + * be skipped, while making fewer calls to the RNG. *

- * This implementations throws {@link StreamOverflowException} if more than {@link Long#MAX_VALUE} items are feeded. + * This implementation throws {@link StreamOverflowException} if more than {@link Long#MAX_VALUE} items are feeded. *

* The space complexity of this class is {@code O(k)}, where {@code k} is the sample size. * diff --git a/src/main/java/gr/james/sampling/VitterZSampling.java b/src/main/java/gr/james/sampling/VitterZSampling.java index 88a1983..68c008c 100644 --- a/src/main/java/gr/james/sampling/VitterZSampling.java +++ b/src/main/java/gr/james/sampling/VitterZSampling.java @@ -5,12 +5,12 @@ /** * Implementation of Algorithm Z by Vitter in Random Sampling with a Reservoir. *

- * Unlike {@link WatermanSampling}, the {@link VitterXSampling} and {@code VitterZSampling} algorithms decide how many - * items to skip, rather than deciding whether or not to skip an item each time it is feeded. This property allows these - * algorithms to perform better by efficiently calculating the number of items that need to be skipped, while making - * fewer calls to the RNG. + * Unlike {@link WatermanSampling}, the {@link VitterXSampling}, {@code VitterZSampling} and {@link LiLSampling} + * algorithms decide how many items to skip, rather than deciding whether or not to skip an item each time it is feeded. + * This property allows these algorithms to perform better by efficiently calculating the number of items that need to + * be skipped, while making fewer calls to the RNG. *

- * This implementations throws {@link StreamOverflowException} if more than {@link Long#MAX_VALUE} items are feeded. + * This implementation throws {@link StreamOverflowException} if more than {@link Long#MAX_VALUE} items are feeded. *

* The space complexity of this class is {@code O(k)}, where {@code k} is the sample size. * diff --git a/src/main/java/gr/james/sampling/WatermanSampling.java b/src/main/java/gr/james/sampling/WatermanSampling.java index 3460283..b55b7a6 100644 --- a/src/main/java/gr/james/sampling/WatermanSampling.java +++ b/src/main/java/gr/james/sampling/WatermanSampling.java @@ -8,10 +8,10 @@ *

* The implementation is the simplest unweighted sampling algorithm that each time a new element is feeded, it * determines whether is should be accepted in the sample by producing a random number. The more efficient - * {@link VitterXSampling} and {@link VitterZSampling} decide how many items to skip, rather than deciding whether or - * not to skip an item each time it is feeded. + * {@link VitterXSampling}, {@link VitterZSampling} and {@link LiLSampling} decide how many items to skip, rather than + * deciding whether or not to skip an item each time it is feeded. *

- * This implementations throws {@link StreamOverflowException} if more than {@link Long#MAX_VALUE} items are feeded. + * This implementation throws {@link StreamOverflowException} if more than {@link Long#MAX_VALUE} items are feeded. *

* The space complexity of this class is {@code O(k)}, where {@code k} is the sample size. * diff --git a/src/main/java/gr/james/sampling/package-info.java b/src/main/java/gr/james/sampling/package-info.java index 45bd470..312b6cc 100644 --- a/src/main/java/gr/james/sampling/package-info.java +++ b/src/main/java/gr/james/sampling/package-info.java @@ -9,6 +9,7 @@ *

  • {@link gr.james.sampling.WatermanSampling}
  • *
  • {@link gr.james.sampling.VitterXSampling}
  • *
  • {@link gr.james.sampling.VitterZSampling}
  • + *
  • {@link gr.james.sampling.LiLSampling}
  • * *

    WeightedRandomSampling implementations

    *