From ee189595734485382c2687094def013cf8c08fd8 Mon Sep 17 00:00:00 2001
From: George Datseris
Date: Fri, 16 Dec 2022 21:19:02 +0200
Subject: [PATCH 01/16] Finish new API declaration in probabilities.jl

---
 src/entropy.jl       | 27 ++++++++++----------
 src/probabilities.jl | 60 +++++++++++++-------------------------------
 2 files changed, 32 insertions(+), 55 deletions(-)

diff --git a/src/entropy.jl b/src/entropy.jl
index 5feb60651..834905482 100644
--- a/src/entropy.jl
+++ b/src/entropy.jl
@@ -21,12 +21,11 @@ These entropy types are given as inputs to [`entropy`](@ref) and [`entropy_norma
 Mathematically speaking, generalized entropies are just nonnegative functions of
 probability distributions that verify certain (entropy-type-dependent) axioms.
-Amigó et al., 2018's
-[summary paper](https://www.mdpi.com/1099-4300/20/11/813) gives a nice overview.
+The summary paper by Amigó et al.[^Amigó2018] gives a nice overview.

 [^Amigó2018]:
     Amigó, J. M., Balogh, S. G., & Hernández, S. (2018). A brief review of
-    generalized entropies. Entropy, 20(11), 813.
+    generalized entropies. [Entropy, 20(11), 813.](https://www.mdpi.com/1099-4300/20/11/813)
 """
 abstract type Entropy <: AbstractEntropy end

@@ -57,19 +56,20 @@ abstract type EntropyEstimator <: AbstractEntropy end
 ###########################################################################################
 # Notice that StatsBase.jl exports `entropy` and Wavelets.jl exports `Entropy`.
 """
-    entropy([e::Entropy,] probs::Probabilities) → h::Real ∈ [0, ∞)
-    entropy([e::Entropy,] est::ProbabilitiesEstimator, x) → h::Real ∈ [0, ∞)
-    entropy([e::Entropy,] est::EntropyEstimator, x) → h::Real ∈ [0, ∞)
+    entropy([e::Entropy,] probs::Probabilities)
+    entropy([e::Entropy,] est::ProbabilitiesEstimator, x)
+    entropy([e::Entropy,] est::EntropyEstimator, x)

-Compute `h`, a (generalized) [`Entropy`](@ref) of type `e`, in one of three ways:
+Compute `h`, with `h::Real ∈ [0, ∞)`, which is
+a (generalized) [`Entropy`](@ref) of type `e`, in one of three ways:

 1. Directly from existing [`Probabilities`](@ref) `probs`.
 2. From input data `x`, by first estimating a probability distribution using the provided
-   [`ProbabilitiesEstimator`](@ref), then computing entropy from that distribution.
-   In fact, the second method is just a 2-lines-of-code wrapper that calls
-   [`probabilities`](@ref) and gives the result to the first method.
+   [`ProbabilitiesEstimator`](@ref), then computing entropy from that distribution.
+   In fact, the second method is just a 2-lines-of-code wrapper that calls
+   [`probabilities`](@ref) and gives the result to the first method.
 3. From input data `x`, by using a dedicated [`EntropyEstimator`](@ref) that computes
-   entropy in a way that doesn't involve explicitly computing probabilities first.
+   entropy in a way that doesn't involve explicitly computing probabilities first.

 The entropy (first argument) is optional. When `est` is a probability estimator,
 `Shannon()` is used by default.
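
For concreteness, here is a minimal sketch of the three routes (assuming the estimator constructors as they stand by the end of this patch series, where probabilities estimators receive the input data `x` at construction time):

```julia
using Entropies

x = rand(1:4, 10_000)            # a discrete-valued timeseries

# 1. Directly from an existing probability distribution:
probs = probabilities(x)         # relative frequency of each unique value
h1 = entropy(Shannon(), probs)

# 2. From data plus a probabilities estimator; `Shannon()` is the default `e`:
h2 = entropy(CountOccurrences(x), x)

# 3. From data plus a dedicated entropy estimator, which never computes
#    explicit probabilities:
y = Dataset(randn(10_000, 2))
h3 = entropy(Kraskov(k = 3), y)
```
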
When `est` is a dedicated entropy estimator, @@ -123,8 +123,9 @@ function entropy!(s::AbstractVector{Int}, e::Entropy, est::ProbabilitiesEstimato entropy(e, probs) end -entropy!(s::AbstractVector{Int}, est::ProbabilitiesEstimator, x) = +function entropy!(s::AbstractVector{Int}, est::ProbabilitiesEstimator, x) entropy!(s, Shannon(), est, x) +end ########################################################################################### # API: entropy from entropy estimators @@ -132,7 +133,7 @@ entropy!(s::AbstractVector{Int}, est::ProbabilitiesEstimator, x) = # Dispatch for these functions is implemented in individual estimator files in # `entropies/estimators/`. function entropy(e::Entropy, est::EntropyEstimator, x) - t = string(typeof(e).name.name) + t = string(nameof(typeof(e))) throw(ArgumentError("$t entropy not implemented for $(typeof(est)) estimator")) end diff --git a/src/probabilities.jl b/src/probabilities.jl index d72b13efd..97078464f 100644 --- a/src/probabilities.jl +++ b/src/probabilities.jl @@ -32,7 +32,6 @@ function Probabilities(x::AbstractVector{<:Integer}) return Probabilities(x ./ s, true) end - # extend base Vector interface: for f in (:length, :size, :eachindex, :eltype, :lastindex, :firstindex, :vec, :getindex, :iterate) @@ -42,7 +41,8 @@ Base.IteratorSize(::Probabilities) = Base.HasLength() @inline Base.sum(::Probabilities{T}) where T = one(T) """ -ProbabilitiesEstimator + ProbabilitiesEstimator + The supertype for all probabilities estimators. In Entropies.jl, probability distributions are estimated from data by defining a set of @@ -110,13 +110,14 @@ function probabilities(est::ProbabilitiesEstimator, x) end """ - probabilities_and_outcomes(x, est) → (probs, Ω::Vector) + probabilities_and_outcomes(est, x) -Return `probs, Ω`, where `probs = probabilities(x, est)` and -`Ω[i]` is the outcome with probability `probs[i]`. -The element type of `Ω` depends on the estimator. +Return `probs, outs`, where `probs = probabilities(x, est)` and +`outs[i]` is the outcome with probability `probs[i]`. +The element type of `outs` depends on the estimator. +`outs` is a subset of the [`outcome_space`](@ref) of `est`. -See also [`outcomes`](@ref), [`total_outcomes`](@ref), and [`outcome_space`](@ref). +See also [`outcomes`](@ref), [`total_outcomes`](@ref). """ function probabilities_and_outcomes(est::ProbabilitiesEstimator, x) error("`probabilities_and_outcomes` not implemented for estimator $(typeof(est)).") @@ -136,7 +137,7 @@ function probabilities! end # Outcome space ########################################################################################### """ - outcome_space([x,] est::ProbabilitiesEstimator) → Ω + outcome_space(est::ProbabilitiesEstimator) → Ω Return a container (typically `Vector`) containing all _possible_ outcomes of `est`, i.e., the outcome space `Ω`. @@ -144,44 +145,19 @@ Only possible for estimators that implement [`total_outcomes`](@ref), and similarly, for some estimators `x` is not needed. The _values_ of `x` are never needed; but some times the type and dimensional layout of `x` is. """ -function outcome_space(x, est::ProbabilitiesEstimator) - outcome_space(est) -end function outcome_space(est::ProbabilitiesEstimator) - error( - "`outcome_space(est)` not known/implemented for estimator $(typeof(est))."* - "Try providing some input data, e.g. `outcomes_space(x, est)`."* - "In some cases, this gives the dimensional layout/type information needed "* - "to define the outcome space." 
- ) + error("`outcome_space` not implemented for estimator $(typeof(est)).") end """ - total_outcomes([x::Array_or_Dataset,] est::ProbabilitiesEstimator) → Int - -Return the size/cardinality of the outcome space ``\\Omega`` defined by the probabilities -estimator `est` imposed on the input data `x`. + total_outcomes(est::ProbabilitiesEstimator) -For some estimators, the total number of outcomes is independent of `x`, in which case -the input `x` is ignored and the method `total_outcomes(est)` is called. If the total -number of states cannot be known a priori, an error is thrown. Primarily used in -[`entropy_normalized`](@ref). - -## Examples - -```jldoctest setup = :(using Entropies) -julia> est = SymbolicPermutation(m = 4); - -julia> total_outcomes(rand(42), est) # same as `factorial(m)` for any `x` -24 -``` +Return the length (cardinality) of the outcome space ``\\Omega`` of `est`. """ -function total_outcomes(x, est::ProbabilitiesEstimator) - return length(outcome_space(x, est)) -end +total_outcomes(est::ProbabilitiesEstimator) = length(outcome_space(est)) """ - missing_outcomes(x, est::ProbabilitiesEstimator) → n_missing::Int + missing_outcomes(est::ProbabilitiesEstimator, x) → n_missing::Int Estimate a probability distribution for `x` using the given estimator, then count the number of missing (i.e. zero-probability) outcomes. @@ -190,19 +166,19 @@ Works for estimators that implement [`total_outcomes`](@ref). See also: [`MissingDispersionPatterns`](@ref). """ -function missing_outcomes(x::Array_or_Dataset, est::ProbabilitiesEstimator) +function missing_outcomes(est::ProbabilitiesEstimator, x::Array_or_Dataset) probs = probabilities(x, est) - L = total_outcomes(x, est) + L = total_outcomes(est) O = count(!iszero, probs) return L - O end """ - outcomes(x, est::ProbabilitiesEstimator) + outcomes(est::ProbabilitiesEstimator, x) Return all (unique) outcomes contained in `x` according to the given estimator. Equivalent with `probabilities_and_outcomes(x, est)[2]`, but for some estimators it may be explicitly extended for better performance. """ -function outcomes(x, est::ProbabilitiesEstimator) +function outcomes(est::ProbabilitiesEstimator, x) return probabilities_and_outcomes(est, x)[2] end From 8f39592c514bb0552a9c7a7bc56dc77eee4f9519 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Fri, 16 Dec 2022 21:23:52 +0200 Subject: [PATCH 02/16] actually finish better --- src/probabilities.jl | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/probabilities.jl b/src/probabilities.jl index 97078464f..096a8efe2 100644 --- a/src/probabilities.jl +++ b/src/probabilities.jl @@ -33,11 +33,12 @@ function Probabilities(x::AbstractVector{<:Integer}) end # extend base Vector interface: -for f in (:length, :size, :eachindex, :eltype, +for f in (:length, :size, :eachindex, :eltype, :parent, :lastindex, :firstindex, :vec, :getindex, :iterate) @eval Base.$(f)(d::Probabilities, args...) = $(f)(d.p, args...) end Base.IteratorSize(::Probabilities) = Base.HasLength() +# Special extension due to the rules of the API @inline Base.sum(::Probabilities{T}) where T = one(T) """ @@ -139,11 +140,7 @@ function probabilities! end """ outcome_space(est::ProbabilitiesEstimator) → Ω -Return a container (typically `Vector`) containing all _possible_ outcomes of `est`, -i.e., the outcome space `Ω`. -Only possible for estimators that implement [`total_outcomes`](@ref), -and similarly, for some estimators `x` is not needed. 
The _values_ of `x` are never needed; -but some times the type and dimensional layout of `x` is. +Return a container containing all _possible_ outcomes of `est`. """ function outcome_space(est::ProbabilitiesEstimator) error("`outcome_space` not implemented for estimator $(typeof(est)).") @@ -162,8 +159,6 @@ total_outcomes(est::ProbabilitiesEstimator) = length(outcome_space(est)) Estimate a probability distribution for `x` using the given estimator, then count the number of missing (i.e. zero-probability) outcomes. -Works for estimators that implement [`total_outcomes`](@ref). - See also: [`MissingDispersionPatterns`](@ref). """ function missing_outcomes(est::ProbabilitiesEstimator, x::Array_or_Dataset) From a4b3d6b2353202ab7a46d77d0b1b4f40d2f5effa Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 17 Dec 2022 11:21:10 +0200 Subject: [PATCH 03/16] finish new API for spectral entropy --- .../timescales/power_spectrum.jl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/probabilities_estimators/timescales/power_spectrum.jl b/src/probabilities_estimators/timescales/power_spectrum.jl index e159c1bdc..83534c94b 100644 --- a/src/probabilities_estimators/timescales/power_spectrum.jl +++ b/src/probabilities_estimators/timescales/power_spectrum.jl @@ -2,7 +2,7 @@ export PowerSpectrum import FFTW """ - PowerSpectrum() <: ProbabilitiesEstimator + PowerSpectrum(x_or_length(x)) <: ProbabilitiesEstimator Calculate the power spectrum of a timeseries (amplitude square of its Fourier transform), and return the spectrum normalized to sum = 1 as probabilities. @@ -17,6 +17,8 @@ in spectral space depends on the length of the input. The outcome space `Ω` for `PowerSpectrum` is the set of frequencies in Fourier space. They should be multiplied with the sampling rate of the signal, which is assumed to be `1`. +The length of the input is therefore required for this estimator to have a well-defined +outcome space. [^Llanos2016]: Llanos et al., _Power spectral entropy as an information-theoretic correlate of manner @@ -28,24 +30,27 @@ should be multiplied with the sampling rate of the signal, which is assumed to b by Short-Time Training in the Delayed-Match-to-Sample Task_, [Front. Hum. Neurosci.](https://doi.org/10.3389/fnhum.2017.00437) """ -struct PowerSpectrum <: ProbabilitiesEstimator end +struct PowerSpectrum <: ProbabilitiesEstimator + length::Int +end +PowerSpectrum(x::AbstractVector) = PowerSpectrum(length(x)) function probabilities_and_outcomes(est::PowerSpectrum, x) probs = probabilities(est, x) - events = FFTW.rfftfreq(length(x)) + events = FFTW.rfftfreq(est.length) return probs, events end -outcome_space(x, ::PowerSpectrum) = FFTW.rfftfreq(length(x)) +outcome_space(est::PowerSpectrum) = FFTW.rfftfreq(est.length) function probabilities(::PowerSpectrum, x) @assert x isa AbstractVector{<:Real} "`PowerSpectrum` only works for timeseries input!" f = FFTW.rfft(x) - Probabilities(abs2.(f)) + return Probabilities(abs2.(f)) end -function total_outcomes(x, ::PowerSpectrum) - n = length(x) +function total_outcomes(est::PowerSpectrum) + n = est.length # From the docstring of `AbstractFFTs.rfftfreq`: iseven(n) ? 
length(0:(n÷2)) : length(0:((n-1)÷2)) end From fb5c776d4addc491a46eeee9a8a8d280833ae20c Mon Sep 17 00:00:00 2001 From: George Datseris Date: Sat, 17 Dec 2022 12:36:56 +0200 Subject: [PATCH 04/16] fix a bunch more tests --- src/encodings.jl | 7 ++++--- .../counting/count_occurences.jl | 9 ++++++--- test/convenience.jl | 4 ++-- .../estimators/count_occurrences.jl | 16 ++++++++-------- test/probabilities/estimators/timescales.jl | 10 +++++----- test/probabilities/estimators/value_histogram.jl | 2 +- test/probabilities/interface.jl | 2 +- 7 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/encodings.jl b/src/encodings.jl index adf7a85de..5bd0e88c8 100644 --- a/src/encodings.jl +++ b/src/encodings.jl @@ -3,8 +3,8 @@ export Encoding, encode, decode """ Encoding -The supertype for all encoding schemes. Encodings **always encode elements of -input data into the positive integers**. The encoding API is defined by the +The supertype for all encoding schemes. Encodings always encode elements of +input data into the positive integers. The encoding API is defined by the functions [`encode`](@ref) and [`decode`](@ref). Some probability estimators utilize encodings internally. @@ -19,7 +19,8 @@ abstract type Encoding end """ encode(χ, e::Encoding) -> i::Int Encoding an element `χ ∈ x` of input data `x` (those given to [`probabilities`](@ref)) -using encoding `e`. +using encoding `e`. The special value of `-1` is reserved as a return value for +inappropriate elements `χ` that cannot be encoded according to `e`. """ function encode end diff --git a/src/probabilities_estimators/counting/count_occurences.jl b/src/probabilities_estimators/counting/count_occurences.jl index f145a16f1..fd96a696c 100644 --- a/src/probabilities_estimators/counting/count_occurences.jl +++ b/src/probabilities_estimators/counting/count_occurences.jl @@ -1,7 +1,7 @@ export CountOccurrences """ - CountOccurrences() + CountOccurrences(x) A probabilities/entropy estimator based on straight-forward counting of distinct elements in a univariate time series or multivariate dataset. This is the same as giving no @@ -9,8 +9,11 @@ estimator to [`probabilities`](@ref). ## Outcome space The outcome space is the unique sorted values of the input. +Hence, input `x` is needed for a well-defined outcome space. 
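
For example (a small sketch; the probabilities are ordered by the sorted unique values of `x`):

```julia
using Entropies

x = [1, 3, 2, 1, 1, 3]
est = CountOccurrences(x)

probabilities(est, x)   # → [1/2, 1/6, 1/3] for the outcomes 1, 2, 3
outcome_space(est)      # → [1, 2, 3]; well defined because `est` stores `x`
total_outcomes(est)     # → 3
```
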
""" -struct CountOccurrences <: ProbabilitiesEstimator end +struct CountOccurrences{X} <: ProbabilitiesEstimator + x::X +end function probabilities_and_outcomes(::CountOccurrences, x::Array_or_Dataset) z = copy(x) @@ -19,7 +22,7 @@ function probabilities_and_outcomes(::CountOccurrences, x::Array_or_Dataset) return probs, unique!(z) end -outcome_space(x, ::CountOccurrences) = sort!(unique(x)) +outcome_space(est::CountOccurrences) = sort!(unique(est.x)) probabilities(::CountOccurrences, x::Array_or_Dataset) = probabilities(x) function probabilities(x::Array_or_Dataset) diff --git a/test/convenience.jl b/test/convenience.jl index 0f3a3023f..3a067cc22 100644 --- a/test/convenience.jl +++ b/test/convenience.jl @@ -26,7 +26,7 @@ import Statistics end end -@testset "`probabilities` defaults to `CountOccurrences`" begin +@testset "probabilities(x)" begin x = [1, 1, 2, 2, 3, 3] - @test probabilities(x) == probabilities(CountOccurrences(), x) == [1/3, 1/3, 1/3] + @test probabilities(x) == probabilities(CountOccurrences(x), x) == [1/3, 1/3, 1/3] end diff --git a/test/probabilities/estimators/count_occurrences.jl b/test/probabilities/estimators/count_occurrences.jl index 0a21defdc..2aba2ce6e 100644 --- a/test/probabilities/estimators/count_occurrences.jl +++ b/test/probabilities/estimators/count_occurrences.jl @@ -6,23 +6,23 @@ rng = Random.MersenneTwister(1234) x = [rand(rng, Bool) for _ in 1:10000] probs1 = probabilities(x) -probs2 = probabilities(CountOccurrences(), x) -probs3, outs = probabilities_and_outcomes(CountOccurrences(), x) +probs2 = probabilities(CountOccurrences(x), x) +probs3, outs = probabilities_and_outcomes(CountOccurrences(x), x) for ps in (probs1, probs2, probs3) for p in ps; @test 0.49 < p < 0.51; end end -@test outs == [false, true] -@test outcome_space(x, CountOccurrences()) == [false, true] -@test total_outcomes(x, CountOccurrences()) == 2 +@test sort(outs) == [false, true] +@test sort(outcome_space(CountOccurrences(x))) == [false, true] +@test total_outcomes(CountOccurrences(x)) == 2 # Same for 2D sets (outcomes not tested here) y = [rand(rng, Bool) for _ in 1:10000] D = Dataset(x, y) probs1 = probabilities(D) -probs2 = probabilities(CountOccurrences(), D) +probs2 = probabilities(CountOccurrences(D), D) for ps in (probs1, probs2) for p in ps; @test 0.24 < p < 0.26; end end @@ -30,8 +30,8 @@ end # Renyi of coin toss is 1 bit, and for two coin tosses is two bits # Result doesn't depend on `q` due to uniformity of the PDF. for q in (0.5, 1.0, 2.0) - h = entropy(Renyi(q), CountOccurrences(), x) + h = entropy(Renyi(q), CountOccurrences(x), x) @test 0.99 < h < 1.01 - h = entropy(Renyi(q), CountOccurrences(), D) + h = entropy(Renyi(q), CountOccurrences(D), D) @test 1.99 < h < 2.01 end diff --git a/test/probabilities/estimators/timescales.jl b/test/probabilities/estimators/timescales.jl index 6e36ea792..e01e615d0 100644 --- a/test/probabilities/estimators/timescales.jl +++ b/test/probabilities/estimators/timescales.jl @@ -21,14 +21,14 @@ using Entropies, Test x = sin.(t) y = @. 
sin(t) + sin(sqrt(3)*t) z = randn(N) - est = PowerSpectrum() + est = PowerSpectrum(N) ents = [entropy(Renyi(), est, w) for w in (x,y,z)] @test ents[1] < ents[2] < ents[3] # Test event stuff (analytically, using sine wave) - probs, events = probabilities_and_outcomes(est, x) - @test length(events) == length(probs) == 501 - @test events[1] ≈ 0 atol=1e-16 # 0 frequency, i.e., mean value + probs, outs = probabilities_and_outcomes(est, x) + @test length(outs) == length(probs) == 501 + @test outs[1] ≈ 0 atol=1e-16 # 0 frequency, i.e., mean value @test probs[1] ≈ 0 atol=1e-16 # sine wave has 0 mean value - @test events[end] == 0.5 # Nyquist frequency, 1/2 the sampling rate (Which is 1) + @test outs[end] == 0.5 # Nyquist frequency, 1/2 the sampling rate (Which is 1) end end diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index 058f6833a..e086b029c 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -30,7 +30,7 @@ using Random @test o isa Vector{SVector{2, Float64}} @test length(o) == length(p) @test all(x -> x < 1, maximum(o)) - o2 = outcomes(x, est) + o2 = outcomes(est, x) @test o2 == o end end diff --git a/test/probabilities/interface.jl b/test/probabilities/interface.jl index f187b0b12..6445a79f7 100644 --- a/test/probabilities/interface.jl +++ b/test/probabilities/interface.jl @@ -1,3 +1,3 @@ x = ones(3) -p = probabilities(ValueHistogram(0.1), x) +p = probabilities(x) @test p isa Probabilities From f46087e5355fd12d15fca7b1b9faa07ee0ac4773 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 10:13:49 +0200 Subject: [PATCH 05/16] finish all minus TE and PE --- src/encoding/ordinal_pattern.jl | 5 ++-- src/encodings.jl | 8 ++--- .../dispersion/dispersion.jl | 4 +-- .../diversity/diversity.jl | 13 +++----- .../histograms/value_histogram.jl | 7 ++--- .../kernel/kernel_density.jl | 30 ++++++++++--------- 6 files changed, 32 insertions(+), 35 deletions(-) diff --git a/src/encoding/ordinal_pattern.jl b/src/encoding/ordinal_pattern.jl index 921c79cf5..ccc522205 100644 --- a/src/encoding/ordinal_pattern.jl +++ b/src/encoding/ordinal_pattern.jl @@ -1,12 +1,13 @@ export OrdinalPatternEncoding +#TODO: The docstring here, and probably the source code, needs a full re-write +# based on new `encode` interface. """ OrdinalPatternEncoding <: Encoding OrdinalPatternEncoding(m = 3, τ = 1; lt = est.lt) A encoding scheme that converts the input time series to ordinal patterns, which are -then encoded to integers using [`encode_motif`](@ref), used with -[`outcomes`](@ref). +then encoded to integers using [`encode`](@ref). !!! note `OrdinalPatternEncoding` is intended for symbolizing *time series*. If providing a short vector, diff --git a/src/encodings.jl b/src/encodings.jl index 5bd0e88c8..3d1497e20 100644 --- a/src/encodings.jl +++ b/src/encodings.jl @@ -17,16 +17,16 @@ Current available encodings are: abstract type Encoding end """ - encode(χ, e::Encoding) -> i::Int + encode(χ, c::Encoding) -> i::Int Encoding an element `χ ∈ x` of input data `x` (those given to [`probabilities`](@ref)) -using encoding `e`. The special value of `-1` is reserved as a return value for +using encoding `c`. The special value of `-1` is reserved as a return value for inappropriate elements `χ` that cannot be encoded according to `e`. 
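
To make the contract concrete, here is a hypothetical toy encoding (the type `TenBins` and its rule are invented purely for illustration; note also that a later patch in this series flips the argument order to `encode(c, χ)`):

```julia
using Entropies

# A toy encoding: reals in [0, 1) map to the integers 1:10,
# anything else cannot be encoded.
struct TenBins <: Encoding end

function Entropies.encode(χ::Real, ::TenBins)
    0 ≤ χ < 1 || return -1         # reserved sentinel for un-encodable elements
    return floor(Int, 10χ) + 1     # bin index in 1:10
end

# Decoding returns the left edge of the bin, an element of the outcome space.
Entropies.decode(i::Int, ::TenBins) = (i - 1) / 10

encode(0.23, TenBins())   # → 3
decode(3, TenBins())      # → 0.2
encode(1.5, TenBins())    # → -1
```
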
""" function encode end """ - decode(i::Int, e::Encoding) -> ω + decode(i::Int, c::Encoding) -> ω Decode an encoded element `i::Int` into the outcome it corresponds to `ω ∈ Ω`. -`Ω` is the [`outcome_space`](@ref) of a probabilities estimator that uses encoding `e`. +`Ω` is the [`outcome_space`](@ref) of a probabilities estimator that uses encoding `c`. """ function decode end \ No newline at end of file diff --git a/src/probabilities_estimators/dispersion/dispersion.jl b/src/probabilities_estimators/dispersion/dispersion.jl index 1d9422203..e55addef8 100644 --- a/src/probabilities_estimators/dispersion/dispersion.jl +++ b/src/probabilities_estimators/dispersion/dispersion.jl @@ -118,11 +118,11 @@ function probabilities_and_outcomes(est::Dispersion, x::AbstractVector{<:Real}) return Probabilities(hist), dispersion_patterns end -total_outcomes(est::Dispersion)::Int = est.encoding.c ^ est.m - function outcome_space(est::Dispersion) c, m = 1:est.encoding.c, est.m cart = CartesianIndices(ntuple(i -> c, m)) V = SVector{m, Int} return map(i -> V(Tuple(i)), vec(cart)) end +# Performance extension +total_outcomes(est::Dispersion)::Int = total_outcomes(est.encoding) ^ est.m diff --git a/src/probabilities_estimators/diversity/diversity.jl b/src/probabilities_estimators/diversity/diversity.jl index 3ccde602a..31b3c7ccb 100644 --- a/src/probabilities_estimators/diversity/diversity.jl +++ b/src/probabilities_estimators/diversity/diversity.jl @@ -7,7 +7,7 @@ export Diversity A [`ProbabilitiesEstimator`](@ref) based on the cosine similarity. It can be used with [`entropy`](@ref) to -compute diversity entropy (Wang et al., 2020)[^Wang2020]. +compute the diversity entropy of an input timeseries[^Wang2020]. The implementation here allows for `τ != 1`, which was not considered in the original paper. @@ -26,12 +26,8 @@ Diversity probabilities are computed as follows. 5. Sum-normalize the histogram to obtain probabilities. ## Outcome space -The outcome space for `Diversity` is the bins of the `[-1, 1]` interval. -The left side of each bin is returned in [`outcome_space`](@ref). - -- [`probabilities_and_outcomes`](@ref). Events are the corners of the cosine similarity bins. - Each bin has width `nextfloat(2 / nbins)`. -- [`total_outcomes`](@ref). The total number of states is given by `nbins`. +The outcome space for `Diversity` is the bins of the `[-1, 1]` interval, +and the return configuration is the same as in [`ValueHistogram`](@ref) (left bin edge). [^Wang2020]: Wang, X., Si, S., & Li, Y. (2020). Multiscale diversity entropy: A novel @@ -54,10 +50,9 @@ function probabilities_and_outcomes(est::Diversity, x::AbstractVector{T}) where return probabilities_and_outcomes(ValueHistogram(binning), ds) end +outcome_space(est::Diversity) = outcome_space(binning_for_diversity(est)) total_outcomes(est::Diversity) = est.nbins -outcome_space(x, est::Diversity) = outcome_space(x, binning_for_diversity(est)) - function similarities_and_binning(est::Diversity, x::AbstractVector{T}) where T <: Real # embed and then calculate cosine similary for each consecutive pair of delay vectors τs = 0:est.τ:(est.m - 1)*est.τ diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index 4613e1ffa..06e32e89a 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -41,7 +41,7 @@ See also: [`RectangularBinning`](@ref). 
 struct ValueHistogram{H<:HistogramEncoding} <: ProbabilitiesEstimator
     encoding::H
 end
-ValueHistogram(ϵ::Union{Real,Vector}) = ValueHistogram(RectangularBinning(ϵ))
+ValueHistogram(x, ϵ::Union{Real,Vector}) = ValueHistogram(x, RectangularBinning(ϵ))
 function ValueHistogram(x, b::AbstractBinning)
     encoding = RectangularBinEncoding(x, b)
     return ValueHistogram(encoding)
 end
@@ -63,7 +63,7 @@ const VisitationFrequency = ValueHistogram
 # the underlying encoding and the `fasthist` function and extensions.
 # See the `rectangular_binning.jl` file for more.
 function probabilities(est::ValueHistogram, x)
-    Probabilities(fasthist(x, est.encoding)[1])
+    Probabilities(fasthist(est.encoding, x)[1])
 end

 function probabilities_and_outcomes(est::ValueHistogram, x)
@@ -74,5 +74,4 @@ function probabilities_and_outcomes(est::ValueHistogram, x)
     return Probabilities(probs), vec(outcomes)
 end

-outcome_space(x, est::ValueHistogram) = outcome_space(est.encoding)
-total_outcomes(x, est::ValueHistogram) = total_outcomes(est.encoding)
+outcome_space(est::ValueHistogram) = outcome_space(est.encoding)
diff --git a/src/probabilities_estimators/kernel/kernel_density.jl b/src/probabilities_estimators/kernel/kernel_density.jl
index b007574bc..ed3c31a1c 100644
--- a/src/probabilities_estimators/kernel/kernel_density.jl
+++ b/src/probabilities_estimators/kernel/kernel_density.jl
@@ -3,7 +3,7 @@ using Distances: Metric, Euclidean
 export NaiveKernel, KDTree, BruteForce

 """
-    NaiveKernel(ϵ::Real, ss = KDTree; w = 0, metric = Euclidean()) <: ProbabilitiesEstimator
+    NaiveKernel(x, ϵ::Real; method = KDTree, w = 0, metric = Euclidean()) <: ProbabilitiesEstimator

 Estimate probabilities/entropy using a "naive" kernel density estimation approach
 (KDE), as discussed in Prichard and Theiler (1995) [^PrichardTheiler1995].

@@ -18,16 +18,17 @@ P_i( X, \\epsilon) \\approx \\dfrac{1}{N} \\sum_{s} B(||X_i - X_s|| < \\epsilon)

 where ``B`` gives 1 if the argument is `true`. Probabilities are then normalized.

-The search structure `ss` is any search structure supported by Neighborhood.jl.
-Specifically, use `KDTree` to use a tree-based neighbor search, or `BruteForce` for
-the direct distances between all points. KDTrees heavily outperform direct distances
-when the dimensionality of the data is much smaller than the data length.
-
-The keyword `w` stands for the Theiler window, and excludes indices ``s``
-that are within ``|i - s| ≤ w`` from the given point ``X_i``.
+## Keyword arguments
+- `method = KDTree`: the search structure supported by Neighborhood.jl.
+  Specifically, use `KDTree` to use a tree-based neighbor search, or `BruteForce` for
+  the direct distances between all points. KDTrees heavily outperform direct distances
+  when the dimensionality of the data is much smaller than the data length.
+- `w = 0`: the Theiler window, which excludes indices ``s`` that are within
+  ``|i - s| ≤ w`` from the given point ``x_i``.
+- `metric = Euclidean()`: the distance metric.

 ## Outcome space
-The outcome space `Ω` for `NaiveKernel` are the indices of the input data, `1:length(x)`.
+The outcome space `Ω` for `NaiveKernel` is the set of indices of the input data, `eachindex(x)`.
 The reason to not return the data points themselves is because duplicate data points may
 not have the same probabilities (due to having different neighbors).

 [^PrichardTheiler1995]:
     Prichard, D., & Theiler, J. (1995). Generalized redundancies for time series
     analysis. Physica D: Nonlinear Phenomena, 84(3-4), 476-493.
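
A minimal usage sketch (note that `x` is required at construction so that the outcome space, the data indices, is known up front):

```julia
using Entropies

x = Dataset(randn(5_000, 2))
est = NaiveKernel(x, 0.5; method = KDTree, w = 0)

probs, outs = probabilities_and_outcomes(est, x)
outs == eachindex(x)   # outcomes are the data indices, not the points
```
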
""" -struct NaiveKernel{KM, M <: Metric} <: ProbabilitiesEstimator +struct NaiveKernel{KM, M <: Metric, I} <: ProbabilitiesEstimator ϵ::Float64 method::KM w::Int metric::M + indices::I end -function NaiveKernel(ϵ::Real, method = KDTree; w = 0, metric = Euclidean()) +function NaiveKernel(x, ϵ::Real; method = KDTree, w = 0, metric = Euclidean()) ϵ > 0 || error("Radius ϵ must be larger than zero!") - return NaiveKernel(ϵ, method, w, metric) + return NaiveKernel(ϵ, method, w, metric, eachindex(x)) end function probabilities_and_outcomes(est::NaiveKernel, x::AbstractDataset) @@ -51,7 +53,7 @@ function probabilities_and_outcomes(est::NaiveKernel, x::AbstractDataset) ss = searchstructure(est.method, x.data, est.metric) idxs = bulkisearch(ss, x.data, WithinRange(est.ϵ), theiler) p = Float64.(length.(idxs)) - return Probabilities(p), 1:length(x) + return Probabilities(p), est.indices end -outcome_space(x::AbstractDataset, ::NaiveKernel) = 1:length(x) +outcome_space(est::NaiveKernel) = est.indices From 13746dc4254a89519af8e9aadc94f77649c24511 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 10:16:29 +0200 Subject: [PATCH 06/16] alter docstring of `entropy` --- src/entropy.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/entropy.jl b/src/entropy.jl index 834905482..1a5bfa088 100644 --- a/src/entropy.jl +++ b/src/entropy.jl @@ -60,8 +60,8 @@ abstract type EntropyEstimator <: AbstractEntropy end entropy([e::Entropy,] est::ProbabilitiesEstimator, x) entropy([e::Entropy,] est::EntropyEstimator, x) -Compute `h`, with `h::Real ∈ [0, ∞)`, which is -a (generalized) [`Entropy`](@ref) of type `e`, in one of three ways: +Compute `h::Real`, which is +a (generalized) entropy defined by `e`, in one of three ways: 1. Directly from existing [`Probabilities`](@ref) `probs`. 2. From input data `x`, by first estimating a probability distribution using the provided From 0c351e7cadcc73dda62e9e0ffc8688e0ee82f8b4 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 10:21:59 +0200 Subject: [PATCH 07/16] fix value hist tests --- src/entropy.jl | 3 ++- .../histograms/rectangular_binning.jl | 4 ++-- src/probabilities_estimators/histograms/value_histogram.jl | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/entropy.jl b/src/entropy.jl index 1a5bfa088..b904b1b15 100644 --- a/src/entropy.jl +++ b/src/entropy.jl @@ -71,7 +71,8 @@ a (generalized) entropy defined by `e`, in one of three ways: 3. From input data `x`, by using a dedicated [`EntropyEstimator`](@ref) that computes entropy in a way that doesn't involve explicitly computing probabilities first. -The entropy (first argument) is optional. When `est` is a probability estimator, +The entropy definition (first argument) is optional. +When `est` is a probability estimator, `Shannon()` is used by default. When `est` is a dedicated entropy estimator, the default entropy type is inferred from the estimator (e.g. [`Kraskov`](@ref) estimates the [`Shannon`](@ref) entropy). 
diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 1de4e7bf1..7cf9707d7 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -210,12 +210,12 @@ end ################################################################## # This method is called by `probabilities(est::ValueHistogram, x::Array_or_Dataset)` """ - fasthist(x::Vector_or_Dataset, ϵ::RectangularBinEncoding) + fasthist(c::RectangularBinEncoding, x::Vector_or_Dataset) Intermediate method that runs `fasthist!` in the encoded space and returns the encoded space histogram (counts) and corresponding bins. Also skips any instances of out-of-bound points for the histogram. """ -function fasthist(x, encoder::RectangularBinEncoding) +function fasthist(encoder::RectangularBinEncoding, x) bins = map(y -> encode(y, encoder), x) # We discard `-1`, as it encodes points outside the histogram limit # (which should only happen for `Fixed` binnings) diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index 06e32e89a..b7b5b8441 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -67,7 +67,7 @@ function probabilities(est::ValueHistogram, x) end function probabilities_and_outcomes(est::ValueHistogram, x) - probs, bins = fasthist(x, est.encoding) # bins are integers here + probs, bins = fasthist(est.encoding, x) # bins are integers here unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: outcomes = map(b -> decode(b, est.encoding), bins) From d477feab16a99e4dff7769dd5ba08fe6701ae7e9 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 11:18:03 +0200 Subject: [PATCH 08/16] more fixes --- src/entropy.jl | 14 +++++--------- src/probabilities.jl | 5 +++++ .../diversity/diversity.jl | 13 +++++++------ test/probabilities/estimators/dispersion.jl | 6 ++++-- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/src/entropy.jl b/src/entropy.jl index b904b1b15..47d0f1a3e 100644 --- a/src/entropy.jl +++ b/src/entropy.jl @@ -152,20 +152,16 @@ entropy(est::EntropyEstimator, x; base = 2) = entropy(Shannon(; base), est, x) # Normalize API ########################################################################################### """ - entropy_maximum(e::Entropy, est::ProbabilitiesEstimator, x) → m::Real + entropy_maximum(e::Entropy, est::ProbabilitiesEstimator) → m::Real -Return the maximum value `m` of the given entropy type based on the given estimator -and the given input `x` (whose values are not important, but layout and type are). - -This function only works if the maximum value is deducable, which is possible only -when the estimator has a known [`total_outcomes`](@ref). +Return the maximum value `m` of the given entropy definition based on the given estimator. entropy_maximum(e::Entropy, L::Int) → m::Real Alternatively, compute the maximum entropy from the number of total outcomes `L` directly. 
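
For instance, with the fixed-binning constructors introduced later in this series, the bound is deducible from the estimator alone (a sketch):

```julia
using Entropies

est = ValueHistogram(FixedRectangularBinning(0.0, 1.0, 10, 1))

entropy_maximum(Shannon(), est)   # → log2(10) ≈ 3.32, from `est` alone
entropy_maximum(Shannon(), 10)    # same bound, from the outcome count directly

x = rand(100_000)                 # ≈ uniform on [0, 1]
entropy_normalized(Shannon(), est, x)   # ≈ 1.0
```
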
""" -function entropy_maximum(e::Entropy, est::ProbabilitiesEstimator, x) - L = total_outcomes(x, est) +function entropy_maximum(e::Entropy, est::ProbabilitiesEstimator) + L = total_outcomes(est) return entropy_maximum(e, L) end function entropy_maximum(e::Entropy, ::Int) @@ -184,7 +180,7 @@ Notice that unlike for [`entropy`](@ref), here there is no method the amount of _possible_ events (i.e., the [`total_outcomes`](@ref)) from `probs`. """ function entropy_normalized(e::Entropy, est::ProbabilitiesEstimator, x) - return entropy(e, est, x) / entropy_maximum(e, est, x) + return entropy(e, est, x) / entropy_maximum(e, est) end function entropy_normalized(est::ProbabilitiesEstimator, x::Array_or_Dataset) return entropy_normalized(Shannon(), est, x) diff --git a/src/probabilities.jl b/src/probabilities.jl index 096a8efe2..0ff3c6f15 100644 --- a/src/probabilities.jl +++ b/src/probabilities.jl @@ -67,6 +67,11 @@ across experimental realizations, by using the outcome as a dictionary key and t probability as the value for that key (or, alternatively, the key remains the outcome and one has a vector of probabilities, one for each experimental realization). +We have made the design decision that all probabilities estimators have a well defined +outcome space when instantiated. For some estimators this means that the input data +`x` must be provided both when instantiating the estimator, but also when computing +the probabilities. + All currently implemented probability estimators are: - [`CountOccurrences`](@ref). diff --git a/src/probabilities_estimators/diversity/diversity.jl b/src/probabilities_estimators/diversity/diversity.jl index 31b3c7ccb..d22750e8d 100644 --- a/src/probabilities_estimators/diversity/diversity.jl +++ b/src/probabilities_estimators/diversity/diversity.jl @@ -41,13 +41,13 @@ Base.@kwdef struct Diversity <: ProbabilitiesEstimator end function probabilities(est::Diversity, x::AbstractVector{T}) where T <: Real - ds, binning = similarities_and_binning(est, x) - return fasthist(ds, binning)[1] + ds, rbc = similarities_and_binning(est, x) + return fasthist(rbc, ds)[1] end function probabilities_and_outcomes(est::Diversity, x::AbstractVector{T}) where T <: Real - ds, binning = similarities_and_binning(est, x) - return probabilities_and_outcomes(ValueHistogram(binning), ds) + ds, rbc = similarities_and_binning(est, x) + return probabilities_and_outcomes(ValueHistogram(rbc), ds) end outcome_space(est::Diversity) = outcome_space(binning_for_diversity(est)) @@ -63,9 +63,10 @@ function similarities_and_binning(est::Diversity, x::AbstractVector{T}) where T end # Cosine similarities are all on [-1.0, 1.0], so just discretize this interval. binning = binning_for_diversity(est) - return ds, binning + rbc = RectangularBinEncoding(binning) + return ds, rbc end cosine_similarity(xᵢ, xⱼ) = sum(xᵢ .* xⱼ) / (sqrt(sum(xᵢ .^ 2)) * sqrt(sum(xⱼ .^ 2))) -binning_for_diversity(est::Diversity) = FixedRectangularBinning(-1.0, 1.0, est.nbins) +binning_for_diversity(est::Diversity) = FixedRectangularBinning((-1.0,), (1.0,), est.nbins) diff --git a/test/probabilities/estimators/dispersion.jl b/test/probabilities/estimators/dispersion.jl index e67aaee0c..8beacef06 100644 --- a/test/probabilities/estimators/dispersion.jl +++ b/test/probabilities/estimators/dispersion.jl @@ -1,4 +1,6 @@ -using Entropies, Test +using Entropies +using Test +import DelayEmbeddings @testset "Dispersion" begin @@ -12,7 +14,7 @@ using Entropies, Test s = GaussianCDFEncoding(c) # Symbols should be in the set [1, 2, …, c]. 
- symbols = Entropies.outcomes(x, s) + symbols = outcomes(x, s) @test all([s ∈ collect(1:c) for s in symbols]) # Dispersion patterns should have a normalized histogram that sums to 1.0. From 53e5b9530f9b96e939627da00474874db360a7cc Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 11:38:21 +0200 Subject: [PATCH 09/16] change FixedRectangularBinning to really be fixed always --- .../histograms/rectangular_binning.jl | 66 ++++++------------- 1 file changed, 21 insertions(+), 45 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 7cf9707d7..13cd8ef98 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -7,7 +7,7 @@ abstract type HistogramEncoding <: Encoding end ################################################################## # Structs and docstrings ################################################################## -# Notice that the binning types are intermediate structs that are NOT retained +# Notice that the binning types are intermediate structs that are not used directly # in the source code. Their only purpose is instructions of how to create a # `RectangularBinEncoder`. All actual source code functionality of `ValueHistogram` # is implemented based on `RectangularBinEncoder`. @@ -39,43 +39,34 @@ const ValidFixedBinInputs = Union{Number, NTuple} """ FixedRectangularBinning <: AbstractBinning - FixedRectangularBinning(ϵmin::E, ϵmax::E, N::Int) where E + FixedRectangularBinning(ϵmin::NTuple, ϵmax::NTuple, N::Int) Rectangular box partition of state space where the extent of the grid is explicitly specified by `ϵmin` and `emax`, and along each dimension, the grid is subdivided into `N` subintervals. Points falling outside the partition do not attribute to probabilities. +This binning type leads to a well-defined outcome space without knowledge of input, +see [`ValueHistogram`](@ref). -Binning instructions are deduced from the types of `ϵmin`/`emax` as follows: +`ϵmin`/`emax` must be `NTuple{D, <:Real}` for input of `D`-dimensional data. -1. `E<:Real` sets the grid range along all dimensions to to `[ϵmin, ϵmax]`. -2. `E::NTuple{D, <:Real}` sets ranges along each dimension individually, i.e. - `[ϵmin[i], ϵmax[i]]` is the range along the `i`-th dimension. + FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N::Int, D::Int = 1) -If the grid spans the range `[r1, r2]` along a particular dimension, then this range -is subdivided into `N` subintervals of equal length `nextfloat((r2 - r1)/N)`. -Thus, for `D`-dimensional data, there are `N^D` boxes. - -Generally it is preferred to use the second method (`E::NTuple{D, <:Real}`) which -has a well defined outcome space irrespectively of input data. +This is a convenience method where each dimension of the binning has the same extent +and the input data are `D` dimensional, which defaults to 1 (timeseries). 
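
For example (using the `ValueHistogram(::FixedRectangularBinning)` constructor from the next patch in this series):

```julia
using Entropies

# Explicit per-dimension extent: a 5 × 5 grid over [0, 1]²:
bin2d = FixedRectangularBinning((0.0, 0.0), (1.0, 1.0), 5)

# Convenience form: same extent along every dimension, D defaults to 1:
bin1d = FixedRectangularBinning(0.0, 1.0, 5)

# The outcome space is known without any input data:
total_outcomes(ValueHistogram(bin2d))   # → 5^2 = 25
```
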
""" -struct FixedRectangularBinning{E} <: AbstractBinning - ϵmin::E - ϵmax::E +struct FixedRectangularBinning{D,T<:Real} <: AbstractBinning + ϵmin::NTuple{D,T} + ϵmax::NTuple{D,T} N::Int - - function FixedRectangularBinning(ϵmin::E1, ϵmax::E2, N::Int) where {E1 <: ValidFixedBinInputs, E2 <: ValidFixedBinInputs} - f_ϵmin = float.(ϵmin) - f_ϵmax = float.(ϵmax) - return new{typeof(f_ϵmin)}(f_ϵmin, f_ϵmax, N) - end end - -const Floating_or_Fixed_RectBinning = Union{RectangularBinning, FixedRectangularBinning} +function FixedRectangularBinning(ϵmin::Real, ϵmax::Real, N::Int, D::Int = 1) + FixedRectangularBinning(ntuple(x->ϵmin, D), ntuple(x->ϵmax, D), N) +end """ RectangularBinEncoding <: Encoding - RectangularBinEncoding(x, binning::AbstractBinning) - RectangularBinEncoding(binning::FixedRectangularBinning{<:NTuple{D}}) + RectangularBinEncoding(binning::RectangularBinning, x) + RectangularBinEncoding(binning::FixedRectangularBinning) Struct used in [`outcomes`](@ref) to map points of `x` into their respective bins. It finds the minima along each dimension, and computes appropriate @@ -134,8 +125,9 @@ end ################################################################## # Initialization of encodings ################################################################## +# TODO: Document `n_eps` # Data-controlled grid -function RectangularBinEncoding(x, b::RectangularBinning; n_eps = 2) +function RectangularBinEncoding(b::RectangularBinning, x; n_eps = 2) # This function always returns static vectors and is type stable D = dimension(x) T = eltype(x) @@ -166,21 +158,11 @@ function RectangularBinEncoding(x, b::RectangularBinning; n_eps = 2) end # fixed grid -function RectangularBinEncoding(x, b::FixedRectangularBinning{E}; n_eps = 2) where {E} - D = dimension(x) - T = eltype(x) - D ≠ length(E) && error("Dimension of data and fixed rectangular binning don't match!") +function RectangularBinEncoding(b::FixedRectangularBinning{D,T}; n_eps = 2) where {D,T} # This function always returns static vectors and is type stable ϵmin, ϵmax = b.ϵmin, b.ϵmax - if E <: Real - mini = SVector{D, T}(repeat([ϵmin], D)) - maxi = SVector{D, T}(repeat([ϵmax], D)) - elseif E <: NTuple{D} - mini = SVector{D, T}(ϵmin) - maxi = SVector{D, T}(ϵmax) - else - error("Invalid ϵmin or ϵmax for binning of a dataset") - end + mini = SVector{D, T}(ϵmin) + maxi = SVector{D, T}(ϵmax) edgelengths_nonadjusted = @. (maxi .- mini) / b.N edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) histsize = SVector{D,Int}(fill(b.N, D)) @@ -189,12 +171,6 @@ function RectangularBinEncoding(x, b::FixedRectangularBinning{E}; n_eps = 2) whe RectangularBinEncoding(b, mini, edgelengths, histsize, ci, li) end -# This version exists if the given `ϵ`s are already tuples. -# Then, the dataset doesn't need to be provided. 
-function RectangularBinEncoding(b::FixedRectangularBinning{<:NTuple{D,T}}; n_eps = 2) where {D,T} - return RectangularBinEncoding(Dataset{D,T}(), b; n_eps) -end - ################################################################## # Outcomes / total outcomes ################################################################## From 2ba7ef512a2df8f20e700264e88d39ef7491ae58 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 11:48:13 +0200 Subject: [PATCH 10/16] fix valuehistogram tests --- .../histograms/rectangular_binning.jl | 4 +-- .../histograms/value_histogram.jl | 6 ++-- .../estimators/value_histogram.jl | 35 ++++++++++++++++--- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 13cd8ef98..e3ff0e28b 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -158,7 +158,7 @@ function RectangularBinEncoding(b::RectangularBinning, x; n_eps = 2) end # fixed grid -function RectangularBinEncoding(b::FixedRectangularBinning{D,T}; n_eps = 2) where {D,T} +function RectangularBinEncoding(b::FixedRectangularBinning{D, T}; n_eps = 2) where {D,T} # This function always returns static vectors and is type stable ϵmin, ϵmax = b.ϵmin, b.ϵmax mini = SVector{D, T}(ϵmin) @@ -168,7 +168,7 @@ function RectangularBinEncoding(b::FixedRectangularBinning{D,T}; n_eps = 2) wher histsize = SVector{D,Int}(fill(b.N, D)) ci = CartesianIndices(Tuple(histsize)) li = LinearIndices(ci) - RectangularBinEncoding(b, mini, edgelengths, histsize, ci, li) + RectangularBinEncoding(b, typeof(edgelengths)(mini), edgelengths, histsize, ci, li) end ################################################################## diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index b7b5b8441..c558ebf92 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -4,7 +4,7 @@ include("rectangular_binning.jl") include("fasthist.jl") """ - ValueHistogram(x, b::AbstractBinning) <: ProbabilitiesEstimator + ValueHistogram(x, b::RectangularBinning) <: ProbabilitiesEstimator ValueHistogram(b::FixedRectangularBinning) <: ProbabilitiesEstimator A probability estimator based on binning the values of the data as dictated by @@ -42,8 +42,8 @@ struct ValueHistogram{H<:HistogramEncoding} <: ProbabilitiesEstimator encoding::H end ValueHistogram(x, ϵ::Union{Real,Vector}) = ValueHistogram(x, RectangularBinning(ϵ)) -function ValueHistogram(x, b::AbstractBinning) - encoding = RectangularBinEncoding(x, b) +function ValueHistogram(x, b::RectangularBinning) + encoding = RectangularBinEncoding(b, x) return ValueHistogram(encoding) end function ValueHistogram(b::FixedRectangularBinning) diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index e086b029c..0bd471a14 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -63,15 +63,42 @@ using Random x2 = [0.4125754262679051, 0.52844411982339560, 0.4535277505543355, 0.25502420827802674, 0.77862522996085940, 0.6081939026664078, 0.2628674795466387, 0.18846258495465185, 0.93320375283233840, 0.40093871561247874, 0.8032730760974603, 0.3531608285217499, 0.018436525139752136, 0.55541857934068420, 
0.9907521337888632, 0.15382361136212420, 0.01774321666660561, 0.67569337507728300, 0.06130971689608822, 0.31417161558476836] N = 10 b = RectangularBinning(N) - rb1 = RectangularBinEncoding(x1, b, n_eps = 1) - rb2 = RectangularBinEncoding(x1, b, n_eps = 2) + rb1 = RectangularBinEncoding(b, x1; n_eps = 1) + rb2 = RectangularBinEncoding(b, x1; n_eps = 2) @test encode(maximum(x1), rb1) == -1 # shouldn't occur, but does when tolerance is too low @test encode(maximum(x1), rb2) == 10 - rb1 = RectangularBinEncoding(x2, b, n_eps = 1) - rb2 = RectangularBinEncoding(x2, b, n_eps = 2) + rb1 = RectangularBinEncoding(b, x2; n_eps = 1) + rb2 = RectangularBinEncoding(b, x2; n_eps = 2) @test encode(maximum(x2), rb1) == -1 # shouldn't occur, but does when tolerance is too low @test encode(maximum(x2), rb2) == 10 end end + + +@testset "Fixed Rectangular binning" begin + + x = Dataset(rand(Random.MersenneTwister(1234), 100_000, 2)) + push!(x, SVector(0, 0)) # ensure both 0 and 1 have values in, exactly. + push!(x, SVector(1, 1)) + # All these binnings should give approximately same probabilities + n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 + ε = nextfloat(0.1, 2) # this guarantees that we get the same as the `n` above! + + bin = FixedRectangularBinning(0, 1, n, 2) + + est = ValueHistogram(bin) + p = probabilities(est, x) + @test length(p) == 100 + @test all(e -> 0.009 ≤ e ≤ 0.011, p) + + p2, o = probabilities_and_outcomes(est, x) + @test p2 == p + @test o isa Vector{SVector{2, Float64}} + @test length(o) == length(p) + @test all(x -> x < 1, maximum(o)) + o2 = outcomes(est, x) + @test o2 == o + +end From c84669d3a33183114e0d159403c00ea8a5f9a6c3 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 11:59:31 +0200 Subject: [PATCH 11/16] wavelet overlap needs input for well defined outcome --- .../timescales/wavelet_overlap.jl | 14 ++++++++------ test/probabilities/estimators/timescales.jl | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/probabilities_estimators/timescales/wavelet_overlap.jl b/src/probabilities_estimators/timescales/wavelet_overlap.jl index 3dfa50aa4..42bec4563 100644 --- a/src/probabilities_estimators/timescales/wavelet_overlap.jl +++ b/src/probabilities_estimators/timescales/wavelet_overlap.jl @@ -2,12 +2,13 @@ export WaveletOverlap import Wavelets """ - WaveletOverlap([wavelet]) <: ProbabilitiesEstimator + WaveletOverlap(x [,wavelet]) <: ProbabilitiesEstimator Apply the maximal overlap discrete wavelet transform (MODWT) to a signal, then compute probabilities as the (normalized) energies at different wavelet scales. These probabilities are used to compute the wavelet entropy, according to Rosso et al. (2001)[^Rosso2001]. +Input timeseries `x` is needed for a well-defined outcome space. By default the wavelet `Wavelets.WT.Daubechies{12}()` is used. Otherwise, you may choose a wavelet from the `Wavelets` package @@ -25,19 +26,20 @@ As such, this estimator only works for timeseries input. Rosso et al. (2001). Wavelet entropy: a new tool for analysis of short duration brain electrical signals. Journal of neuroscience methods, 105(1), 65-75. 
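
A short usage sketch (the `entropy_wavelet` convenience wrapper is updated to this constructor in a later patch of this series):

```julia
using Entropies

t = range(0, 10π; length = 2^12)
x = @. sin(t) + 0.3sin(10t)

est = WaveletOverlap(x)   # default Daubechies{12} wavelet
probs, scales = probabilities_and_outcomes(est, x)   # energy fraction per scale
h = entropy_wavelet(x)    # Shannon entropy of `probs`, base 2
```
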
""" -struct WaveletOverlap{W<:Wavelets.WT.OrthoWaveletClass} <: ProbabilitiesEstimator +struct WaveletOverlap{X, W<:Wavelets.WT.OrthoWaveletClass} <: ProbabilitiesEstimator + x::X wl::W end -WaveletOverlap() = WaveletOverlap(Wavelets.WT.Daubechies{12}()) +WaveletOverlap(x) = WaveletOverlap(x, Wavelets.WT.Daubechies{12}()) function probabilities_and_outcomes(est::WaveletOverlap, x) - @assert x isa AbstractVector{<:Real} "`WaveletOverlap` only works for timeseries input!" + x isa AbstractVector{<:Real} || error("`WaveletOverlap` only works for timeseries input!") p = Probabilities(time_scale_density(x, est.wl)) return p, 1:length(p) end -function outcome_space(x, ::WaveletOverlap) - nscales = Wavelets.WT.maxmodwttransformlevels(x) +function outcome_space(est::WaveletOverlap) + nscales = Wavelets.WT.maxmodwttransformlevels(est.x) return 1:nscales end diff --git a/test/probabilities/estimators/timescales.jl b/test/probabilities/estimators/timescales.jl index e01e615d0..dfd66b8ef 100644 --- a/test/probabilities/estimators/timescales.jl +++ b/test/probabilities/estimators/timescales.jl @@ -7,12 +7,12 @@ using Entropies, Test x = sin.(t .+ cos.(t/0.1)) .- 0.1; @testset "WaveletOverlap" begin - wl = Entropies.Wavelets.WT.Daubechies{4}() + wl = Entropies.Wavelets.WT.Daubechies{4}(x) est = WaveletOverlap(wl) ps = probabilities(est, x) @test length(ps) == 8 @test ps isa Probabilities - @test entropy(Renyi( q = 1, base = 2), WaveletOverlap(), x) isa Real + @test entropy(Renyi( q = 1, base = 2), WaveletOverlap(x), x) isa Real end @testset "Fourier Spectrum" begin From ed3e8b330fd4f747a514c4fff0d9631a9227df67 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 12:04:10 +0200 Subject: [PATCH 12/16] fix diversity --- src/probabilities_estimators/diversity/diversity.jl | 5 +++-- test/probabilities/estimators/diversity.jl | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/probabilities_estimators/diversity/diversity.jl b/src/probabilities_estimators/diversity/diversity.jl index d22750e8d..603435adb 100644 --- a/src/probabilities_estimators/diversity/diversity.jl +++ b/src/probabilities_estimators/diversity/diversity.jl @@ -42,7 +42,8 @@ end function probabilities(est::Diversity, x::AbstractVector{T}) where T <: Real ds, rbc = similarities_and_binning(est, x) - return fasthist(rbc, ds)[1] + bins = fasthist(rbc, ds)[1] + return Probabilities(bins) end function probabilities_and_outcomes(est::Diversity, x::AbstractVector{T}) where T <: Real @@ -69,4 +70,4 @@ end cosine_similarity(xᵢ, xⱼ) = sum(xᵢ .* xⱼ) / (sqrt(sum(xᵢ .^ 2)) * sqrt(sum(xⱼ .^ 2))) -binning_for_diversity(est::Diversity) = FixedRectangularBinning((-1.0,), (1.0,), est.nbins) +binning_for_diversity(est::Diversity) = FixedRectangularBinning(-1.0, 1.0, est.nbins) diff --git a/test/probabilities/estimators/diversity.jl b/test/probabilities/estimators/diversity.jl index 943c3f93b..c04cde704 100644 --- a/test/probabilities/estimators/diversity.jl +++ b/test/probabilities/estimators/diversity.jl @@ -1,3 +1,5 @@ +using Entropies, Test + # Analytical example from Wang et al. (2020) x = [1, 2, 13, 7, 9, 5, 4] nbins = 10 @@ -22,5 +24,5 @@ bins = floor.(Int, (ds .- (-1.0)) / binsize) # The coordinates corresponding to the left corners of these bins are as # follows, and should correspond with the events. 
-coords = ((bins .- 1) .* binsize) .+ (-1.0) -@test all(round.(events, digits = 13) == round.(unique(coords), digits = 13)) +coords = (bins .* binsize) .+ (-1.0) +@test all(round.(only.(events), digits = 13) == round.(unique(coords), digits = 13)) From f36bef6595a9b0fbcc0b824814c2a5763b8ad9d0 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 12:07:17 +0200 Subject: [PATCH 13/16] fix wavelet again --- test/probabilities/estimators/timescales.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/probabilities/estimators/timescales.jl b/test/probabilities/estimators/timescales.jl index dfd66b8ef..650651e6e 100644 --- a/test/probabilities/estimators/timescales.jl +++ b/test/probabilities/estimators/timescales.jl @@ -7,8 +7,8 @@ using Entropies, Test x = sin.(t .+ cos.(t/0.1)) .- 0.1; @testset "WaveletOverlap" begin - wl = Entropies.Wavelets.WT.Daubechies{4}(x) - est = WaveletOverlap(wl) + wl = Entropies.Wavelets.WT.Daubechies{4}() + est = WaveletOverlap(x, wl) ps = probabilities(est, x) @test length(ps) == 8 @test ps isa Probabilities From ca8af4fd3a9ddf4a690e2148a638467c1f969116 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 12:09:53 +0200 Subject: [PATCH 14/16] fix convenience --- src/entropies/convenience_definitions.jl | 2 +- test/convenience.jl | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/entropies/convenience_definitions.jl b/src/entropies/convenience_definitions.jl index 415150139..104c93827 100644 --- a/src/entropies/convenience_definitions.jl +++ b/src/entropies/convenience_definitions.jl @@ -58,7 +58,7 @@ entropy(Shannon(base), est, x) See [`WaveletOverlap`](@ref) for more info. """ function entropy_wavelet(x; wavelet = Wavelets.WT.Daubechies{12}(), base = 2) - est = WaveletOverlap(wavelet) + est = WaveletOverlap(x, wavelet) entropy(Shannon(; base), est, x) end diff --git a/test/convenience.jl b/test/convenience.jl index 3a067cc22..312c40b63 100644 --- a/test/convenience.jl +++ b/test/convenience.jl @@ -1,17 +1,16 @@ using Entropies using Test -import Statistics @testset "Common literature names" begin x = randn(1000) - σ = 0.2*Statistics.std(x) + σ = 0.2 @testset "Permutation entropy" begin @test entropy_permutation(x) == entropy(SymbolicPermutation(), x) end @testset "Wavelet entropy" begin - @test entropy_wavelet(x) == entropy(WaveletOverlap(), x) + @test entropy_wavelet(x) == entropy(WaveletOverlap(x), x) end @testset "Dispersion entropy" begin From f08cd34e5e6d7b2de844d5a9363607c3ba37bf12 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 12:24:42 +0200 Subject: [PATCH 15/16] reverse order in encode/decode --- src/encodings.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/encodings.jl b/src/encodings.jl index 3d1497e20..f65c741c0 100644 --- a/src/encodings.jl +++ b/src/encodings.jl @@ -17,16 +17,16 @@ Current available encodings are: abstract type Encoding end """ - encode(χ, c::Encoding) -> i::Int + encode(c::Encoding, χ) -> i::Int Encoding an element `χ ∈ x` of input data `x` (those given to [`probabilities`](@ref)) using encoding `c`. The special value of `-1` is reserved as a return value for -inappropriate elements `χ` that cannot be encoded according to `e`. +inappropriate elements `χ` that cannot be encoded according to `c`. """ function encode end """ - decode(i::Int, c::Encoding) -> ω -Decode an encoded element `i::Int` into the outcome it corresponds to `ω ∈ Ω`. 
+ decode(c::Encoding, i::Int) -> ω +Decode an encoded element `i` into the outcome it corresponds to `ω ∈ Ω`. `Ω` is the [`outcome_space`](@ref) of a probabilities estimator that uses encoding `c`. """ function decode end \ No newline at end of file From 5a96a073f33a5a4c08617505d8f0465a45208062 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Mon, 19 Dec 2022 12:24:53 +0200 Subject: [PATCH 16/16] apply encode decode for rect binning --- .../histograms/rectangular_binning.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index e3ff0e28b..d40362e90 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -95,7 +95,7 @@ function Base.show(io::IO, x::RectangularBinEncoding) ) end -function encode(point, e::RectangularBinEncoding) +function encode(e::RectangularBinEncoding, point) (; mini, edgelengths) = e # Map a data point to its bin edge (plus one because indexing starts from 1) bin = floor.(Int, (point .- mini) ./ edgelengths) .+ 1 @@ -110,7 +110,7 @@ function encode(point, e::RectangularBinEncoding) end end -function decode(bin::Int, e::RectangularBinEncoding{B, D, T}) where {B, D, T} +function decode(e::RectangularBinEncoding{B, D, T}, bin::Int) where {B, D, T} V = SVector{D,T} if checkbounds(Bool, e.ci, bin) @inbounds cartesian = e.ci[bin] @@ -178,7 +178,7 @@ total_outcomes(e::RectangularBinEncoding) = prod(e.histsize) function outcome_space(e::RectangularBinEncoding) # this is super simple :P could be optimized but its not a frequent operation - return [decode(i, e) for i in 1:total_outcomes(e)] + return [decode(e, i) for i in 1:total_outcomes(e)] end ################################################################## @@ -192,7 +192,7 @@ and returns the encoded space histogram (counts) and corresponding bins. Also skips any instances of out-of-bound points for the histogram. """ function fasthist(encoder::RectangularBinEncoding, x) - bins = map(y -> encode(y, encoder), x) + bins = map(y -> encode(encoder, y), x) # We discard `-1`, as it encodes points outside the histogram limit # (which should only happen for `Fixed` binnings) discard_minus_ones!(bins)