From daa64f1f33ceac53853ff11e7eefa4982ac7ea88 Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 10:31:43 +0100 Subject: [PATCH 01/17] only convert to probabilities at the very last step --- .../histograms/rectangular_binning.jl | 2 +- src/probabilities_estimators/histograms/value_histogram.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index d765653d9..ca1e875e1 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -248,5 +248,5 @@ function fasthist(x::Vector_or_Dataset, ϵ::AbstractBinning) encoder = RectangularBinEncoding(x, ϵ) bins = map(y -> encode_as_bin(y, encoder), x) hist = fasthist!(bins) - return Probabilities(hist), bins, encoder + return hist, bins, encoder end diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index b0425fd6a..ccd41f0d1 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -62,7 +62,7 @@ const VisitationFrequency = ValueHistogram function probabilities(x::Array_or_Dataset, est::ValueHistogram) # and the `fasthist` actually just makes an encoding, # this function is in `rectangular_binning.jl` - fasthist(x, est.binning)[1] + Probabilities(fasthist(x, est.binning)[1]) end function probabilities_and_outcomes(x::Array_or_Dataset, est::ValueHistogram) @@ -70,7 +70,7 @@ function probabilities_and_outcomes(x::Array_or_Dataset, est::ValueHistogram) unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: outcomes = map(b -> decode_from_bin(b, encoder), bins) - return probs, outcomes + return Probabilities(probs), outcomes end outcome_space(x, est::ValueHistogram) = outcome_space(x, est.binning) From 4e63ebe036d64c111ff6d71ff284a445b282a948 Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 10:38:49 +0100 Subject: [PATCH 02/17] put the `vec` in there --- .../histograms/rectangular_binning.jl | 10 +++++++--- .../histograms/value_histogram.jl | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index ca1e875e1..e71ee49d9 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -242,11 +242,15 @@ end """ fasthist(x::Vector_or_Dataset, ϵ::AbstractBinning) Create an encoding for binning, then map `x` to bins, then call `fasthist!` on the bins. -Return the output probabilities, the bins, and the created encoder. +Return the output counts, the bins, and the created encoder. """ -function fasthist(x::Vector_or_Dataset, ϵ::AbstractBinning) +function fasthist(x, ϵ::AbstractBinning) encoder = RectangularBinEncoding(x, ϵ) + hist, bins = fasthist(x, encoder) + return hist, bins, encoder +end +function fasthist(x, encoder::RectangularBinEncoding) bins = map(y -> encode_as_bin(y, encoder), x) hist = fasthist!(bins) - return hist, bins, encoder + return hist, bins end diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index ccd41f0d1..183c030d4 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -35,7 +35,7 @@ and initializes this binning directly. ## Outcomes -The outcome space for `ValueHistogram` is the set of unique bins constructed +The outcome space for `ValueHistogram` is the unique bins constructed from `b`. Each bin is identified by its left (lowest-value) corner. The bins are in data units, not integer (cartesian indices units), and are returned as `SVector`s. @@ -70,7 +70,7 @@ function probabilities_and_outcomes(x::Array_or_Dataset, est::ValueHistogram) unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: outcomes = map(b -> decode_from_bin(b, encoder), bins) - return Probabilities(probs), outcomes + return Probabilities(probs), vec(outcomes) end outcome_space(x, est::ValueHistogram) = outcome_space(x, est.binning) From 673965b9529c2d3b08732eef35631eb4ee139237 Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 11:10:23 +0100 Subject: [PATCH 03/17] finish `encode/decode` and linear indexing --- .../histograms/rectangular_binning.jl | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index e71ee49d9..5d933ddf5 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -7,7 +7,7 @@ export RectangularBinEncoding # Notice that the binning types are intermediate structs that are NOT retained # in the source code. Their only purpose is instructions of how to create a # `RectangularBinEncoder`. All actual source code functionality of `ValueHistogram` -# is implemented based on `RectangularBinEncoder` and +# is implemented based on `RectangularBinEncoder`. """ RectangularBinning(ϵ) <: AbstractBinning @@ -79,10 +79,12 @@ information as `ϵmin/max` is already an `NTuple`. See also: [`RectangularBinning`](@ref), [`FixedRectangularBinning`](@ref). """ -struct RectangularBinEncoding{B, V, E} <: Encoding +struct RectangularBinEncoding{B, V, E, C, L} <: Encoding binning::B # either RectangularBinning or FixedRectangularBinning mini::V # fields are either static vectors or numbers edgelengths::E + ci::C # cartesian indices + li::L # linear indices end function Base.show(io::IO, x::RectangularBinEncoding) @@ -93,17 +95,20 @@ function Base.show(io::IO, x::RectangularBinEncoding) ) end -function encode_as_bin(point, b::RectangularBinEncoding) - (; mini, edgelengths) = b - # Map a data point to its bin edge - return floor.(Int, (point .- mini) ./ edgelengths) +function encode(point, e::RectangularBinEncoding) + (; mini, edgelengths) = e + # Map a data point to its bin edge (plus one because indexing starts from 1) + bin = floor.(Int, (point .- mini) ./ edgelengths) .+ 1 + return e.li[CartesianIndex(Tuple(bin))] end -function decode_from_bin(bin, b::RectangularBinEncoding{B, V}) where {B, V} - (; mini, edgelengths) = b +function decode(bin::Int, e::RectangularBinEncoding{B, V}) where {B, V} + cartesian = e.ci[bin] + (; mini, edgelengths) = e # Remove one because we want lowest value corner, and we get indices starting from 1 - return (V(Tuple(bin)) .- 1) .* edgelengths .+ mini + return (V(Tuple(cartesian)) .- 1) .* edgelengths .+ mini end + function decode_from_bin(bin, b::RectangularBinEncoding{B, T}) where {B, T<:Real} (; mini, edgelengths) = b return (T(Tuple(bin)[1]) - 1)*edgelengths + mini @@ -130,7 +135,10 @@ function RectangularBinEncoding(x::AbstractDataset{D,T}, b::RectangularBinning; error("Invalid ϵ for binning of a dataset") end - RectangularBinEncoding(b, mini, edgelengths) + # Cartesian indices of the underlying histogram + ci = CartesianIndices(ntuple(i -> length(x), D)) + li = LinearIndices(ci) + RectangularBinEncoding(b, mini, edgelengths, ci, li) end function RectangularBinEncoding(x::AbstractVector{<:Real}, b::RectangularBinning; n_eps = 2) From ef239619d38b8ca785d52c146f683ae407fa4bb0 Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 11:22:33 +0100 Subject: [PATCH 04/17] rework valuehistogram to store the encoding directly --- .../histograms/rectangular_binning.jl | 5 ++- .../histograms/value_histogram.jl | 43 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 5d933ddf5..47044c5a6 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -1,6 +1,9 @@ export RectangularBinning, FixedRectangularBinning export RectangularBinEncoding +abstract type AbstractBinning end +abstract type HistogramEncoding <: Encoding end + ################################################################## # Structs and docstrings ################################################################## @@ -79,7 +82,7 @@ information as `ϵmin/max` is already an `NTuple`. See also: [`RectangularBinning`](@ref), [`FixedRectangularBinning`](@ref). """ -struct RectangularBinEncoding{B, V, E, C, L} <: Encoding +struct RectangularBinEncoding{B, V, E, C, L} <: HistogramEncoding binning::B # either RectangularBinning or FixedRectangularBinning mini::V # fields are either static vectors or numbers edgelengths::E diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index 183c030d4..2eb85e83b 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -1,12 +1,4 @@ -export ValueHistogram, VisitationFrequency, AbstractBinning - -""" - AbstractBinning - -The supertype of all binning schemes. -""" -abstract type AbstractBinning end - +export ValueHistogram, VisitationFrequency # We need binning to be defined first to add it as a field to a struct include("rectangular_binning.jl") include("fasthist.jl") @@ -42,10 +34,19 @@ are returned as `SVector`s. See also: [`RectangularBinning`](@ref). """ -struct ValueHistogram{B<:AbstractBinning} <: ProbabilitiesEstimator - binning::B +struct ValueHistogram{H<:HistogramEncoding} <: ProbabilitiesEstimator + encoding::H end ValueHistogram(ϵ::Union{Real,Vector}) = ValueHistogram(RectangularBinning(ϵ)) +function ValueHistogram(x, b::AbstractBinning) + encoding = RectangularBinEncoding(x, b) + return ValueHistogram(encoding) +end +function ValueHistogram(b::FixedRectangularBinning) + encoding = RectangularBinEncoding(b) + return ValueHistogram(encoding) +end + """ VisitationFrequency @@ -54,24 +55,20 @@ An alias for [`ValueHistogram`](@ref). """ const VisitationFrequency = ValueHistogram -# For organizational outcomes we extend methods here. However, their -# source code in truth is in the binnings file using the bin encoding - -# This method is only valid for rectangular binnings, as `fasthist` -# is only valid for rectangular binnings. For more binnings, it needs to be extended. +# The source code of `ValueHistogram` operates as rather simple calls to +# the underlying encoding and the `fasthist` function and extensions. +# See the `rectangular_binning.jl` file for more. function probabilities(x::Array_or_Dataset, est::ValueHistogram) - # and the `fasthist` actually just makes an encoding, - # this function is in `rectangular_binning.jl` - Probabilities(fasthist(x, est.binning)[1]) + Probabilities(fasthist(x, est.encoding)[1]) end function probabilities_and_outcomes(x::Array_or_Dataset, est::ValueHistogram) - probs, bins, encoder = fasthist(x, est.binning) + probs, bins = fasthist(x, est.encoding) # bins are integers here unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: - outcomes = map(b -> decode_from_bin(b, encoder), bins) + outcomes = map(b -> decode(b, encoder), bins) return Probabilities(probs), vec(outcomes) end -outcome_space(x, est::ValueHistogram) = outcome_space(x, est.binning) -total_outcomes(x, est::ValueHistogram) = total_outcomes(x, est.binning) +outcome_space(x, est::ValueHistogram) = outcome_space(x, est.encoding) +total_outcomes(x, est::ValueHistogram) = total_outcomes(x, est.encoding) From 11346a986a4f6b819137199566cdeddcf563480a Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 11:30:10 +0100 Subject: [PATCH 05/17] remove special clauses for input timeseries --- .../histograms/rectangular_binning.jl | 81 +++++-------------- 1 file changed, 19 insertions(+), 62 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 47044c5a6..07598da67 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -112,17 +112,14 @@ function decode(bin::Int, e::RectangularBinEncoding{B, V}) where {B, V} return (V(Tuple(cartesian)) .- 1) .* edgelengths .+ mini end -function decode_from_bin(bin, b::RectangularBinEncoding{B, T}) where {B, T<:Real} - (; mini, edgelengths) = b - return (T(Tuple(bin)[1]) - 1)*edgelengths + mini -end - ################################################################## -# Encoding bins using a *floating* (i.e. controlled by data) grid +# Initialization of encodings ################################################################## -function RectangularBinEncoding(x::AbstractDataset{D,T}, b::RectangularBinning; - n_eps = 2) where {D, T} +# Data-controlled grid +function RectangularBinEncoding(x, b::RectangularBinning; n_eps = 2) # This function always returns static vectors and is type stable + D = dimension(x) + T = eltype(x) ϵ = b.ϵ mini, maxi = minmaxima(x) v = ones(SVector{D,T}) @@ -144,72 +141,32 @@ function RectangularBinEncoding(x::AbstractDataset{D,T}, b::RectangularBinning; RectangularBinEncoding(b, mini, edgelengths, ci, li) end -function RectangularBinEncoding(x::AbstractVector{<:Real}, b::RectangularBinning; n_eps = 2) - # This function always returns numbers and is type stable - ϵ = b.ϵ - mini, maxi = extrema(x) - if ϵ isa AbstractFloat - edgelength = ϵ - elseif ϵ isa Int - edgeslength_nonadjusted = (maxi - mini)/ϵ - # Round-off occurs when encoding bins. Applying `nextfloat` twice seems to still - # ensure that bins cover data. See comment above. - edgelength = nextfloat(edgeslength_nonadjusted, n_eps) - else - error("Invalid ϵ for binning of a vector") - end - - RectangularBinEncoding(b, mini, edgelength) -end - -################################################################## -# Encoding bins using a fixed (user-specified) grid -################################################################## -function RectangularBinEncoding(::AbstractVector{<:Real}, - b::FixedRectangularBinning{E}; n_eps = 2) where E - - # This function always returns numbers and is type stable - ϵmin, ϵmax = b.ϵmin, b.ϵmax - mini = ϵmin - if ϵmin isa AbstractFloat && ϵmax isa AbstractFloat - edgelength_nonadjusted = (ϵmax - ϵmin) / b.N - edgelength = nextfloat(edgelength_nonadjusted, n_eps) - else - error("Invalid ϵmin or ϵmax for binning of a vector") - end - - RectangularBinEncoding(b, mini, edgelength) -end - -function RectangularBinEncoding(::AbstractDataset{D, T}, - b::FixedRectangularBinning{E}, n_eps = 2) where {D, T, E} +# fixed grid +function RectangularBinEncoding(x, b::FixedRectangularBinning{E}; n_eps = 2) where {E} + D = dimension(x) + T = eltype(x) + D ≠ length(E) && error("Dimension of data and fixed rectangular binning don't match!") # This function always returns static vectors and is type stable ϵmin, ϵmax = b.ϵmin, b.ϵmax - if E <: Float64 - mini = SVector{D, Float64}(repeat([ϵmin], D)) - maxi = SVector{D, Float64}(repeat([ϵmax], D)) + if E <: Real + mini = SVector{D, T}(repeat([ϵmin], D)) + maxi = SVector{D, T}(repeat([ϵmax], D)) elseif E <: NTuple{D} - mini = SVector{D, Float64}(ϵmin) - maxi = SVector{D, Float64}(ϵmax) + mini = SVector{D, T}(ϵmin) + maxi = SVector{D, T}(ϵmax) else error("Invalid ϵmin or ϵmax for binning of a dataset") end - edgelengths_nonadjusted = @. (maxi .- mini) / b.N edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) - RectangularBinEncoding(b, mini, edgelengths) end + # This version exists if the given `ϵ`s are already tuples. # Then, the dataset doesn't need to be provided. -function RectangularBinEncoding(b::FixedRectangularBinning{<:NTuple}, n_eps = 2) - ϵmin, ϵmax = b.ϵmin, b.ϵmax - D = length(ϵmin) - mini = SVector{D, Float64}(ϵmin) - maxi = SVector{D, Float64}(ϵmax) - edgelengths_nonadjusted = @. (maxi .- mini) / b.N - edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) - RectangularBinEncoding(b, mini, edgelengths) +function RectangularBinEncoding(b::FixedRectangularBinning{<:NTuple}; n_eps = 2) + D = length(E) + return RectangularBinEncoding(Dataset{D,Float64}(), b; n_eps) end ################################################################## From 6e7dffa23eb4560108a40209c730b2678754aa5c Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 11:41:03 +0100 Subject: [PATCH 06/17] make histogram size part of the bin encoding --- .../histograms/rectangular_binning.jl | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 07598da67..ebdafdac3 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -82,10 +82,11 @@ information as `ϵmin/max` is already an `NTuple`. See also: [`RectangularBinning`](@ref), [`FixedRectangularBinning`](@ref). """ -struct RectangularBinEncoding{B, V, E, C, L} <: HistogramEncoding +struct RectangularBinEncoding{B, D, T, C, L} <: HistogramEncoding binning::B # either RectangularBinning or FixedRectangularBinning - mini::V # fields are either static vectors or numbers - edgelengths::E + mini::SVector{D,T} # fields are either static vectors or numbers + edgelengths::SVector{D,T} + histsize::SVector{D,Int} ci::C # cartesian indices li::L # linear indices end @@ -125,20 +126,25 @@ function RectangularBinEncoding(x, b::RectangularBinning; n_eps = 2) v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} edgelengths = SVector{D,T}(ϵ .* v) + histsize = round.(Int, (mini .- maxi) ./ edgelengths) elseif ϵ isa Int || ϵ isa Vector{Int} edgeslengths_nonadjusted = @. (maxi - mini)/ϵ # Just taking nextfloat once here isn't enough for bins to cover data when using # `encode_as_bin` later, because subtraction and division leads to loss # of precision. We need a slightly bigger number, so apply nextfloat twice. edgelengths = SVector{D,T}(nextfloat.(edgeslengths_nonadjusted, n_eps)) + if ϵ isa Vector{Int} + histsize = SVector{D, Int}(ϵ) + else + histsize = SVector{D, Int}(fill(ϵ, D)) + end else error("Invalid ϵ for binning of a dataset") end - # Cartesian indices of the underlying histogram - ci = CartesianIndices(ntuple(i -> length(x), D)) + ci = CartesianIndices(Tuple(histsize)) li = LinearIndices(ci) - RectangularBinEncoding(b, mini, edgelengths, ci, li) + RectangularBinEncoding(b, mini, edgelengths, histsize, ci, li) end # fixed grid @@ -159,7 +165,10 @@ function RectangularBinEncoding(x, b::FixedRectangularBinning{E}; n_eps = 2) whe end edgelengths_nonadjusted = @. (maxi .- mini) / b.N edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) - RectangularBinEncoding(b, mini, edgelengths) + histsize = SVector{D,Int}(fill(b.N, D)) + ci = CartesianIndices(Tuple(histsize)) + li = LinearIndices(ci) + RectangularBinEncoding(b, mini, edgelengths, histsize, ci, li) end # This version exists if the given `ϵ`s are already tuples. From 439d9b421027be597ee3e098c436f4643d5f79bb Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 11:48:54 +0100 Subject: [PATCH 07/17] massive simplification of all binning source --- .../histograms/rectangular_binning.jl | 50 ++++--------------- .../histograms/value_histogram.jl | 4 +- 2 files changed, 12 insertions(+), 42 deletions(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index ebdafdac3..23b02a12b 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -173,43 +173,18 @@ end # This version exists if the given `ϵ`s are already tuples. # Then, the dataset doesn't need to be provided. -function RectangularBinEncoding(b::FixedRectangularBinning{<:NTuple}; n_eps = 2) - D = length(E) - return RectangularBinEncoding(Dataset{D,Float64}(), b; n_eps) +function RectangularBinEncoding(b::FixedRectangularBinning{<:NTuple{D,T}}; n_eps = 2) where {D,T} + return RectangularBinEncoding(Dataset{D,T}(), b; n_eps) end ################################################################## -# Outcomes / total outcomes / extension of functions +# Outcomes / total outcomes ################################################################## -# When the grid is fixed by the user, we can always deduce the total number of bins, -# even just from the binning itself - symbolization info not needed. -total_outcomes(x, bin::AbstractBinning) = total_outcomes(RectangularBinEncoding(x, bin)) -function total_outcomes(e::RectangularBinEncoding) - if e.binning isa RectangularBinning - error("Not possible to _uniquely_ define for `RectangularBinning`.") - end - D = length(e.mini) - return e.binning.N^D -end +total_outcomes(e::RectangularBinEncoding) = prod(e.histsize) -# This function does not need `x`; all info about binning are in the encoding -outcome_space(x, bin::AbstractBinning) = outcome_space(RectangularBinEncoding(x, bin)) function outcome_space(e::RectangularBinEncoding) - if e.binning isa RectangularBinning - error("Not possible to _uniquely_ define for `RectangularBinning`.") - end - # We can be smart here. All possible bins are exactly the same thing - # as the Cartesian Indices of an array, mapped into "data" units - dims = _array_dims_from_fixed_binning(e) - bins = CartesianIndices(dims) - outcomes = map(b -> decode_from_bin(b, e), bins) - return outcomes -end - -function _array_dims_from_fixed_binning(e) - N = e.binning.N - D = length(e.mini) - return ntuple(i -> N, D) + # this is super simple :P could be optimized but its not a frequent operation + return [decode(i, e) for i in 1:total_outcomes(e)] end ################################################################## @@ -217,17 +192,12 @@ end ################################################################## # This method is called by `probabilities(x::Array_or_Dataset, est::ValueHistogram)` """ - fasthist(x::Vector_or_Dataset, ϵ::AbstractBinning) -Create an encoding for binning, then map `x` to bins, then call `fasthist!` on the bins. -Return the output counts, the bins, and the created encoder. + fasthist(x::Vector_or_Dataset, ϵ::RectangularBinEncoding) +Intermediate method that runs `fasthist!` in the encoded space +and returns the encoded space histogram (counts) and corresponding bins. """ -function fasthist(x, ϵ::AbstractBinning) - encoder = RectangularBinEncoding(x, ϵ) - hist, bins = fasthist(x, encoder) - return hist, bins, encoder -end function fasthist(x, encoder::RectangularBinEncoding) - bins = map(y -> encode_as_bin(y, encoder), x) + bins = map(y -> encode(y, encoder), x) hist = fasthist!(bins) return hist, bins end diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index 2eb85e83b..294c831a0 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -70,5 +70,5 @@ function probabilities_and_outcomes(x::Array_or_Dataset, est::ValueHistogram) return Probabilities(probs), vec(outcomes) end -outcome_space(x, est::ValueHistogram) = outcome_space(x, est.encoding) -total_outcomes(x, est::ValueHistogram) = total_outcomes(x, est.encoding) +outcome_space(x, est::ValueHistogram) = outcome_space(est.encoding) +total_outcomes(x, est::ValueHistogram) = total_outcomes(est.encoding) From c993460d4eba22719c9947864454cc37c9ab3dac Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 12:33:53 +0100 Subject: [PATCH 08/17] WIP fixing tests Also ported explicitly to StateSpaceSets --- Project.toml | 2 ++ src/Entropies.jl | 8 ++++---- .../histograms/rectangular_binning.jl | 2 +- .../histograms/value_histogram.jl | 12 ++++++++---- test/estimators/histograms.jl | 15 +++++++-------- 5 files changed, 22 insertions(+), 17 deletions(-) diff --git a/Project.toml b/Project.toml index 4c47991d1..1b585cf49 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Scratch = "6c6a2e73-6563-6170-7368-637461726353" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StateSpaceSets = "40b095a5-5852-4c12-98c7-d43bf788e795" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Wavelets = "29a6e085-ba6d-5f35-a997-948ac2efa89a" @@ -29,6 +30,7 @@ Neighborhood = "0.2.4" QuadGK = "2" Scratch = "1" SpecialFunctions = "0.10, 1.0, 2" +StateSpaceSets = "0.1.2, 1" StaticArrays = "0.12, 1.0" Wavelets = "0.9" julia = "1.5" diff --git a/src/Entropies.jl b/src/Entropies.jl index febe2688e..bb64bbecc 100644 --- a/src/Entropies.jl +++ b/src/Entropies.jl @@ -9,10 +9,10 @@ To install it, run `import Pkg; Pkg.add("Entropies")`. """ module Entropies -using DelayEmbeddings -using DelayEmbeddings: AbstractDataset, Dataset, dimension -export AbstractDataset, Dataset -export DelayEmbeddings +using StateSpaceSets +export Dataset, SVector +using DelayEmbeddings: embed, genembed + const Array_or_Dataset = Union{<:AbstractArray{<:Real}, <:AbstractDataset} const Vector_or_Dataset = Union{<:AbstractVector{<:Real}, <:AbstractDataset} diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 23b02a12b..1770fe71c 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -126,7 +126,7 @@ function RectangularBinEncoding(x, b::RectangularBinning; n_eps = 2) v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} edgelengths = SVector{D,T}(ϵ .* v) - histsize = round.(Int, (mini .- maxi) ./ edgelengths) + histsize = round.(Int, (maxi .- mini) ./ edgelengths) elseif ϵ isa Int || ϵ isa Vector{Int} edgeslengths_nonadjusted = @. (maxi - mini)/ϵ # Just taking nextfloat once here isn't enough for bins to cover data when using diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index 294c831a0..5d2a0d9ed 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -4,7 +4,8 @@ include("rectangular_binning.jl") include("fasthist.jl") """ - ValueHistogram(b::AbstractBinning) <: ProbabilitiesEstimator + ValueHistogram(x, b::AbstractBinning) <: ProbabilitiesEstimator + ValueHistogram(b::FixedRectangularBinning) <: ProbabilitiesEstimator A probability estimator based on binning the values of the data as dictated by the binning scheme `b` and formally computing their histogram, i.e., @@ -13,6 +14,9 @@ Available binnings are: - [`RectangularBinning`](@ref) - [`FixedRectangularBinning`](@ref) +Notice that if not using the fixed binning, `x` (the input data) must also be given +to the estimator, as it is not possible to deduce histogram size only from the binning. + The `ValueHistogram` estimator has a linearithmic time complexity (`n log(n)` for `n = length(x)`) and a linear space complexity (`l` for `l = dimension(x)`). This allows computation of probabilities (histograms) of high-dimensional @@ -20,7 +24,7 @@ datasets and with small box sizes `ε` without memory overflow and with maximum For performance reasons, the probabilities returned never contain 0s and are arbitrarily ordered. - ValueHistogram(ϵ::Union{Real,Vector}) + ValueHistogram(x, ϵ::Union{Real,Vector}) A convenience method that accepts same input as [`RectangularBinning`](@ref) and initializes this binning directly. @@ -58,11 +62,11 @@ const VisitationFrequency = ValueHistogram # The source code of `ValueHistogram` operates as rather simple calls to # the underlying encoding and the `fasthist` function and extensions. # See the `rectangular_binning.jl` file for more. -function probabilities(x::Array_or_Dataset, est::ValueHistogram) +function probabilities(x, est::ValueHistogram) Probabilities(fasthist(x, est.encoding)[1]) end -function probabilities_and_outcomes(x::Array_or_Dataset, est::ValueHistogram) +function probabilities_and_outcomes(x, est::ValueHistogram) probs, bins = fasthist(x, est.encoding) # bins are integers here unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: diff --git a/test/estimators/histograms.jl b/test/estimators/histograms.jl index 0a9cfe8b8..068e24102 100644 --- a/test/estimators/histograms.jl +++ b/test/estimators/histograms.jl @@ -1,5 +1,5 @@ using Entropies -using Entropies.DelayEmbeddings, Test +using Test using Random @testset "Rectangular binning" begin @@ -9,7 +9,7 @@ using Random push!(x, SVector(1, 1)) # All these binnings should give approximately same probabilities n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 - ε = nextfloat(0.1) # this guarantees that we get the same as the `n` above! + ε = nextfloat(0.1, 2) # this guarantees that we get the same as the `n` above! binnings = [ RectangularBinning(n), @@ -19,8 +19,8 @@ using Random ] for bin in binnings - @testset "ϵ = $(bin.ϵ)" begin - est = ValueHistogram(bin) + @testset "ϵ isa $(typeof(bin.ϵ))" begin + est = ValueHistogram(x, bin) p = probabilities(x, est) @test length(p) == 100 @test all(e -> 0.009 ≤ e ≤ 0.011, p) @@ -29,7 +29,7 @@ using Random @testset "Check rogue 1s" begin b = RectangularBinning(0.1) # no `nextfloat` here, so the rogue (1, 1) is in extra bin! - p = probabilities(x, ValueHistogram(b)) + p = probabilities(x, ValueHistogram(x, b)) @test length(p) == 100 + 1 @test p[end] ≈ 1/100_000 atol = 1e-5 end @@ -39,9 +39,8 @@ using Random push!(x, 0, 1) n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 ε = nextfloat(0.1) # this guarantees that we get the same as the `n` above! - binnings = RectangularBinning.((n, ε)) - for bin in binnings - p = probabilities(x, ValueHistogram(bin)) + for bin in (RectangularBinning(n), RectangularBinning(ε)) + p = probabilities(x, ValueHistogram(x, bin)) @test length(p) == 10 @test all(e -> 0.09 ≤ e ≤ 0.11, p) end From e69f3fb236b457e83fbd7a999d31e0ffcf6636d3 Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 12:40:46 +0100 Subject: [PATCH 09/17] use next float also in the estimation of histogram size --- src/probabilities_estimators/histograms/rectangular_binning.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 1770fe71c..be6ef1760 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -126,7 +126,7 @@ function RectangularBinEncoding(x, b::RectangularBinning; n_eps = 2) v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} edgelengths = SVector{D,T}(ϵ .* v) - histsize = round.(Int, (maxi .- mini) ./ edgelengths) + histsize = ceil.(Int, nextfloat.((maxi .- mini), n_eps) ./ edgelengths) elseif ϵ isa Int || ϵ isa Vector{Int} edgeslengths_nonadjusted = @. (maxi - mini)/ϵ # Just taking nextfloat once here isn't enough for bins to cover data when using From 39398fdad8e319e8db6ee52e5e4a2e67166118af Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 15:44:52 +0100 Subject: [PATCH 10/17] use encding -1 for points outside histogram --- Project.toml | 2 +- .../histograms/rectangular_binning.jl | 46 +++++++++++++++---- test/estimators/histograms.jl | 15 +----- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/Project.toml b/Project.toml index 1b585cf49..760a5dbbe 100644 --- a/Project.toml +++ b/Project.toml @@ -23,7 +23,7 @@ Wavelets = "29a6e085-ba6d-5f35-a997-948ac2efa89a" [compat] Combinatorics = "1" -DelayEmbeddings = "2.5" +DelayEmbeddings = "2.6" Distances = "0.9, 0.10" FFTW = "1" Neighborhood = "0.2.4" diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index be6ef1760..125dd382e 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -17,6 +17,8 @@ abstract type HistogramEncoding <: Encoding end Rectangular box partition of state space using the scheme `ϵ`, deducing the coordinates of the grid axis minima from the input data. +Generally it is preferred to use [`FixedRectangularBinning`](@ref) instead, +as it has a well defined outcome space without knowledge of input data. Binning instructions are deduced from the type of `ϵ` as follows: @@ -41,17 +43,20 @@ const ValidFixedBinInputs = Union{Number, NTuple} Rectangular box partition of state space where the extent of the grid is explicitly specified by `ϵmin` and `emax`, and along each dimension, the grid is subdivided into `N` -subintervals. +subintervals. Points falling outside the partition do not attribute to probabilities. Binning instructions are deduced from the types of `ϵmin`/`emax` as follows: -1. `E<:Float64` sets the grid range along all dimensions to to `[ϵmin, ϵmax]`. -2. `E::NTuple{D, Float64}` sets ranges along each dimension individually, i.e. +1. `E<:Real` sets the grid range along all dimensions to to `[ϵmin, ϵmax]`. +2. `E::NTuple{D, <:Real}` sets ranges along each dimension individually, i.e. `[ϵmin[i], ϵmax[i]]` is the range along the `i`-th dimension. If the grid spans the range `[r1, r2]` along a particular dimension, then this range is subdivided into `N` subintervals of equal length `nextfloat((r2 - r1)/N)`. Thus, for `D`-dimensional data, there are `N^D` boxes. + +Generally it is preferred to use the second method (`E::NTuple{D, <:Real}`) which +has a well defined outcome space irrespectively of input data. """ struct FixedRectangularBinning{E} <: AbstractBinning ϵmin::E @@ -61,7 +66,7 @@ struct FixedRectangularBinning{E} <: AbstractBinning function FixedRectangularBinning(ϵmin::E1, ϵmax::E2, N::Int) where {E1 <: ValidFixedBinInputs, E2 <: ValidFixedBinInputs} f_ϵmin = float.(ϵmin) f_ϵmax = float.(ϵmax) - return new{typeof(f_ϵmin)}(f_ϵmin, f_ϵmax, N::Int) + return new{typeof(f_ϵmin)}(f_ϵmin, f_ϵmax, N) end end @@ -93,9 +98,9 @@ end function Base.show(io::IO, x::RectangularBinEncoding) return print(io, "RectangularBinEncoding\n" * - " binning: $(x.binning) \n" * " box corners: $(x.mini)\n" * - " edgelengths: $(x.edgelengths)" + " edgelengths: $(x.edgelengths)\n" * + " histogram size: $(x.histsize)" ) end @@ -103,11 +108,24 @@ function encode(point, e::RectangularBinEncoding) (; mini, edgelengths) = e # Map a data point to its bin edge (plus one because indexing starts from 1) bin = floor.(Int, (point .- mini) ./ edgelengths) .+ 1 - return e.li[CartesianIndex(Tuple(bin))] + cartidx = CartesianIndex(Tuple(bin)) + # We have decided on the arbitrary convention that out of bound points + # will get the special symbol `-1`. Erroring doesn't make sense as it is expected + # that for fixed histograms there may be points outside of them. + if checkbounds(Bool, e.li, cartidx) + return @inbounds e.li[cartidx] + else + return -1 + end end -function decode(bin::Int, e::RectangularBinEncoding{B, V}) where {B, V} - cartesian = e.ci[bin] +function decode(bin::Int, e::RectangularBinEncoding{B, D, T}) where {B, D, T} + V = SVector{D,T} + if checkbounds(Bool, e.ci, bin) + @inbounds cartesian = e.ci[bin] + else + error("Cannot decode integer $(bin): out of bounds of underlying histogram.") + end (; mini, edgelengths) = e # Remove one because we want lowest value corner, and we get indices starting from 1 return (V(Tuple(cartesian)) .- 1) .* edgelengths .+ mini @@ -198,6 +216,16 @@ and returns the encoded space histogram (counts) and corresponding bins. """ function fasthist(x, encoder::RectangularBinEncoding) bins = map(y -> encode(y, encoder), x) + # We discard `-1`, as it encodes points outside the histogram limit + # (which should only happen for `Fixed` binnings) + discard_minus_ones!(bins) hist = fasthist!(bins) return hist, bins end + +function discard_minus_ones!(bins) + # It's (probably) faster to first check if elements are there...? + any(isequal(-1), bins) && return bins + idxs = findall(isequal(-1), bins) + deleteat!(bins, idxs) +end diff --git a/test/estimators/histograms.jl b/test/estimators/histograms.jl index 068e24102..fe92ceb93 100644 --- a/test/estimators/histograms.jl +++ b/test/estimators/histograms.jl @@ -50,17 +50,6 @@ using Random # constructing `RectangularBinEncoding`s. # The following tests ensure with *some* certainty that this does not occur. @testset "Rogue extra bins" begin - rng = MersenneTwister(1234) - xs1D = [rand(rng, 20) for i = 1:10000]; - xs2D = [rand(rng, 1000, 2) |> Dataset for i = 1:10000] # more points to fill all bins - est = ValueHistogram(RectangularBinning(10)) - ps1D = [probabilities(x, est) for x in xs1D]; - ps2D = [probabilities(x, est) for x in xs2D]; - n_rogue_extrabin_1D = count(length.(ps1D) .> est.binning.ϵ) - n_rogue_extrabin_2D = count(length.(ps2D) .> est.binning.ϵ^2) - @test n_rogue_extrabin_1D == 0 - @test n_rogue_extrabin_2D == 0 - # Concrete examples where a rogue extra bin has appeared. x1 = [0.5213236385155418, 0.03516318860292644, 0.5437726723245310, 0.52598710966469610, 0.34199879802511246, 0.6017129426606275, 0.6972844365031351, 0.89163995617220900, 0.39009862510518045, 0.06296038912844315, 0.9897176284081909, 0.7935001082966890, 0.890198448900077700, 0.11762640519877565, 0.7849413168095061, 0.13768932585886573, 0.50869900547793430, 0.18042178201388548, 0.28507312391861270, 0.96480406570924970] x2 = [0.4125754262679051, 0.52844411982339560, 0.4535277505543355, 0.25502420827802674, 0.77862522996085940, 0.6081939026664078, 0.2628674795466387, 0.18846258495465185, 0.93320375283233840, 0.40093871561247874, 0.8032730760974603, 0.3531608285217499, 0.018436525139752136, 0.55541857934068420, 0.9907521337888632, 0.15382361136212420, 0.01774321666660561, 0.67569337507728300, 0.06130971689608822, 0.31417161558476836] @@ -68,8 +57,8 @@ using Random b = RectangularBinning(N) rb1 = RectangularBinEncoding(x1, b, n_eps = 1) rb2 = RectangularBinEncoding(x1, b, n_eps = 2) - @test Entropies.encode_as_bin(maximum(x1), rb1) == 10 # shouldn't occur, but does when tolerance is too low - @test Entropies.encode_as_bin(maximum(x1), rb2) == 9 + @test Entropies.encode(maximum(x1), rb1) == 10 # shouldn't occur, but does when tolerance is too low + @test Entropies.encode(maximum(x1), rb2) == 9 rb1 = RectangularBinEncoding(x2, b, n_eps = 1) rb2 = RectangularBinEncoding(x2, b, n_eps = 2) From a3ec36cd55fa0e8faef4e2f3e6a3fbe1a16d5016 Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 16:47:38 +0100 Subject: [PATCH 11/17] declare encoding api --- docs/src/devdocs.md | 9 ++-- src/Entropies.jl | 6 ++- src/encoding/all_encodings.jl | 3 ++ src/encoding/outcomes.jl | 42 ------------------- src/encodings.jl | 31 ++++++++++++++ .../histograms/rectangular_binning.jl | 3 +- test/estimators/histograms.jl | 4 +- 7 files changed, 46 insertions(+), 52 deletions(-) create mode 100644 src/encoding/all_encodings.jl delete mode 100644 src/encoding/outcomes.jl create mode 100644 src/encodings.jl diff --git a/docs/src/devdocs.md b/docs/src/devdocs.md index 7da105655..d2a1569d3 100644 --- a/docs/src/devdocs.md +++ b/docs/src/devdocs.md @@ -7,10 +7,11 @@ Good practices in developing a code base apply in every Pull Request. The [Good ### Mandatory steps 1. Decide on the outcome space and how the estimator will map probabilities to outcomes. 2. Define your type and make it subtype `ProbabilitiesEstimator`. -3. Add a docstring to your type following the style of the docstrings of other estimators. -4. Implement dispatch for [`probabilities_and_outcomes`](@ref). -5. Implement dispatch for [`outcome_space`](@ref). -6. Add your type to the list in the docstring of [`ProbabilitiyEstimator`](@ref). +4. Add a docstring to your type following the style of the docstrings of other estimators. +5. If suitable, the estimator may be able to operate based on [`Encoding`]s. If so, it is preferred to implement an `Encoding` subtype and extend the methods [`encode`](@ref) and [`decode`](@ref). This will allow your probabilities estimator to be used with a larger span of entropy and complexity methods without additional effort. +6. Implement dispatch for [`probabilities_and_outcomes`](@ref) and your probabilities estimator type. +7. Implement dispatch for [`outcome_space`](@ref) and your probabilities estimator type. +8. Add your probabilities estimator type to the list in the docstring of [`ProbabilitiyEstimator`](@ref), and if you also made an encoding, add it to the [`Encoding`](@ref) docstring. ### Optional steps You may extend any of the following functions if there are potential performance benefits in doing so: diff --git a/src/Entropies.jl b/src/Entropies.jl index bb64bbecc..9d12ab4e3 100644 --- a/src/Entropies.jl +++ b/src/Entropies.jl @@ -16,12 +16,14 @@ using DelayEmbeddings: embed, genembed const Array_or_Dataset = Union{<:AbstractArray{<:Real}, <:AbstractDataset} const Vector_or_Dataset = Union{<:AbstractVector{<:Real}, <:AbstractDataset} +# Core API types and functions include("probabilities.jl") include("entropy.jl") -include("encoding/outcomes.jl") +include("encodings.jl") +# Library implementations (files include other files) include("probabilities_estimators/probabilities_estimators.jl") include("entropies/entropies.jl") - +include("encoding/all_encodings.jl") include("deprecations.jl") diff --git a/src/encoding/all_encodings.jl b/src/encoding/all_encodings.jl new file mode 100644 index 000000000..710880726 --- /dev/null +++ b/src/encoding/all_encodings.jl @@ -0,0 +1,3 @@ +include("utils.jl") +include("gaussian_cdf.jl") +include("ordinal_pattern.jl") diff --git a/src/encoding/outcomes.jl b/src/encoding/outcomes.jl deleted file mode 100644 index 807d32716..000000000 --- a/src/encoding/outcomes.jl +++ /dev/null @@ -1,42 +0,0 @@ -export Encoding - -""" - Encoding - -The supertype for all encoding schemes, i.e. ways of encoding input data onto some -discrete set of possibilities/[outcomes](@ref). Implemented encoding schemes are - -- [`OrdinalPatternEncoding`](@ref). -- [`GaussianCDFEncoding`](@ref). -- [`RectangularBinEncoding`](@ref). - -Used internally by the various [`ProbabilitiesEstimator`](@ref)s to map input data onto -outcome spaces, over which probabilities are computed. -""" -abstract type Encoding end - -# TODO: Outcome interface for encodings is not yet defined. -# When we actually define the encodings API, ALL encodings will return -# integers. - -#= -""" - outcomes(x, scheme::Encoding) → Vector{Int} - outcomes!(s, x, scheme::Encoding) → Vector{Int} - -Map each`xᵢ ∈ x` to a distinct outcome according to the encoding `scheme`. - -Optionally, write outcomes into the pre-allocated symbol vector `s` if the `scheme` -allows for it. For usage examples, see individual encoding scheme docstrings. - -See also: [`RectangularBinEncoding`](@ref), [`GaussianCDFEncoding`](@ref), -[`OrdinalPatternEncoding`](@ref). -""" -function outcomes(x::X, ::Encoding) where X - throw(ArgumentError("`outcomes` not defined for input data of type $(X).")) -end -=# - -include("utils.jl") -include("gaussian_cdf.jl") -include("ordinal_pattern.jl") diff --git a/src/encodings.jl b/src/encodings.jl new file mode 100644 index 000000000..adf7a85de --- /dev/null +++ b/src/encodings.jl @@ -0,0 +1,31 @@ +export Encoding, encode, decode + +""" + Encoding + +The supertype for all encoding schemes. Encodings **always encode elements of +input data into the positive integers**. The encoding API is defined by the +functions [`encode`](@ref) and [`decode`](@ref). +Some probability estimators utilize encodings internally. + +Current available encodings are: + +- [`OrdinalPatternEncoding`](@ref). +- [`GaussianCDFEncoding`](@ref). +- [`RectangularBinEncoding`](@ref). +""" +abstract type Encoding end + +""" + encode(χ, e::Encoding) -> i::Int +Encoding an element `χ ∈ x` of input data `x` (those given to [`probabilities`](@ref)) +using encoding `e`. +""" +function encode end + +""" + decode(i::Int, e::Encoding) -> ω +Decode an encoded element `i::Int` into the outcome it corresponds to `ω ∈ Ω`. +`Ω` is the [`outcome_space`](@ref) of a probabilities estimator that uses encoding `e`. +""" +function decode end \ No newline at end of file diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 125dd382e..7296bc2a7 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -213,6 +213,7 @@ end fasthist(x::Vector_or_Dataset, ϵ::RectangularBinEncoding) Intermediate method that runs `fasthist!` in the encoded space and returns the encoded space histogram (counts) and corresponding bins. +Also skips any instances of out-of-bound points for the histogram. """ function fasthist(x, encoder::RectangularBinEncoding) bins = map(y -> encode(y, encoder), x) @@ -224,8 +225,6 @@ function fasthist(x, encoder::RectangularBinEncoding) end function discard_minus_ones!(bins) - # It's (probably) faster to first check if elements are there...? - any(isequal(-1), bins) && return bins idxs = findall(isequal(-1), bins) deleteat!(bins, idxs) end diff --git a/test/estimators/histograms.jl b/test/estimators/histograms.jl index fe92ceb93..bd2c45bf1 100644 --- a/test/estimators/histograms.jl +++ b/test/estimators/histograms.jl @@ -57,8 +57,8 @@ using Random b = RectangularBinning(N) rb1 = RectangularBinEncoding(x1, b, n_eps = 1) rb2 = RectangularBinEncoding(x1, b, n_eps = 2) - @test Entropies.encode(maximum(x1), rb1) == 10 # shouldn't occur, but does when tolerance is too low - @test Entropies.encode(maximum(x1), rb2) == 9 + @test encode(maximum(x1), rb1) == -1 # shouldn't occur, but does when tolerance is too low + @test encode(maximum(x1), rb2) == 9 rb1 = RectangularBinEncoding(x2, b, n_eps = 1) rb2 = RectangularBinEncoding(x2, b, n_eps = 2) From 31c6a34ddc0197663b4429e96006ff8546c6d3c0 Mon Sep 17 00:00:00 2001 From: Datseris Date: Thu, 17 Nov 2022 16:56:16 +0100 Subject: [PATCH 12/17] fix all tests! --- test/estimators/histograms.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/estimators/histograms.jl b/test/estimators/histograms.jl index bd2c45bf1..a3ae4c554 100644 --- a/test/estimators/histograms.jl +++ b/test/estimators/histograms.jl @@ -58,12 +58,12 @@ using Random rb1 = RectangularBinEncoding(x1, b, n_eps = 1) rb2 = RectangularBinEncoding(x1, b, n_eps = 2) @test encode(maximum(x1), rb1) == -1 # shouldn't occur, but does when tolerance is too low - @test encode(maximum(x1), rb2) == 9 + @test encode(maximum(x1), rb2) == 10 rb1 = RectangularBinEncoding(x2, b, n_eps = 1) rb2 = RectangularBinEncoding(x2, b, n_eps = 2) - @test Entropies.encode_as_bin(maximum(x2), rb1) == 10 # shouldn't occur, but does when tolerance is too low - @test Entropies.encode_as_bin(maximum(x2), rb2) == 9 + @test encode(maximum(x2), rb1) == -1 # shouldn't occur, but does when tolerance is too low + @test encode(maximum(x2), rb2) == 10 end end From b3490f8dae04a71eb74f47530064bf317d127282 Mon Sep 17 00:00:00 2001 From: Kristian Haaga Date: Wed, 30 Nov 2022 15:49:18 +0100 Subject: [PATCH 13/17] Use StateSpaceSets for tests, and explicit imports --- test/Project.toml | 1 + test/entropies/nearest_neighbors_direct.jl | 2 ++ test/estimators/permutation.jl | 4 ++++ test/estimators/transfer_operator.jl | 2 ++ test/utils/encoding.jl | 3 ++- 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/test/Project.toml b/test/Project.toml index 1a744b4d1..bf1ae3a25 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,6 +4,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DynamicalSystemsBase = "6e36e845-645a-534a-86f2-f5d4aa5a06b4" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +StateSpaceSets = "40b095a5-5852-4c12-98c7-d43bf788e795" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/entropies/nearest_neighbors_direct.jl b/test/entropies/nearest_neighbors_direct.jl index 596f271c5..5a927f27c 100644 --- a/test/entropies/nearest_neighbors_direct.jl +++ b/test/entropies/nearest_neighbors_direct.jl @@ -1,4 +1,6 @@ using Test, StaticArrays, LinearAlgebra +using DelayEmbeddings: genembed +using StateSpaceSets: Dataset @testset "NN - Kraskov" begin m = 4 diff --git a/test/estimators/permutation.jl b/test/estimators/permutation.jl index b1f933855..0ed01c805 100644 --- a/test/estimators/permutation.jl +++ b/test/estimators/permutation.jl @@ -1,3 +1,7 @@ +using StateSpaceSets: Dataset +using DelayEmbeddings: genembed +using StaticArrays: SVector + @test SymbolicPermutation() isa SymbolicPermutation @test SymbolicPermutation(lt = Base.isless) isa SymbolicPermutation @test SymbolicPermutation(lt = Entropies.isless_rand) isa SymbolicPermutation diff --git a/test/estimators/transfer_operator.jl b/test/estimators/transfer_operator.jl index f3c6f1de8..841734bc9 100644 --- a/test/estimators/transfer_operator.jl +++ b/test/estimators/transfer_operator.jl @@ -1,3 +1,5 @@ +using StateSpaceSets: Dataset + @test TransferOperator(RectangularBinning(3)) isa TransferOperator D = Dataset(rand(100, 2)) diff --git a/test/utils/encoding.jl b/test/utils/encoding.jl index d5d39a0d4..466108be4 100644 --- a/test/utils/encoding.jl +++ b/test/utils/encoding.jl @@ -1,4 +1,5 @@ -using DelayEmbeddings: genembed, Dataset +using StateSpaceSets: Dataset +using DelayEmbeddings: genembed using StaticArrays: SVector using Entropies: encode_motif, decode_motif From bf28ae6656afd6ef767252920c7f93dcb3d6bc27 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Fri, 16 Dec 2022 20:05:50 +0200 Subject: [PATCH 14/17] delete weird unknown "DS_Store" files --- .DS_Store | Bin 6148 -> 0 bytes docs/.DS_Store | Bin 6148 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store delete mode 100644 docs/.DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 9ef012718a177fe72f49d979233573ba4a677278..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK!AiqG5S?wKO({YT3VK`cS~M++h?fxS4;aydN^MNhV9ZLB+CwSiu0Q0D_&v_- zZi`YqcoESVn0b@gnS^}_I~f2F!6rGp7()M;bExFg(olfiS+UEAbaqlj@&(xElSKwEuWyRtQp3#|a z=rJ5-smg9JHp`#oF){br&)q(q($FiX%JUXgegTdrNWLF!jxlN+C1N4Y0#8|u)~M2PZoAU5&G$Pf2q?! z_y*ZB1I)lj259y}X;S|``g;D?No<$_X5eo!Ae?U44RCL^w=V3GdaXpgLM5TR(%`%V j9o>pCms;^Usuqk(au9usr9q6K@Q;9|fekb8qYS(OFFj6; diff --git a/docs/.DS_Store b/docs/.DS_Store deleted file mode 100644 index eee7acbe18dc45cfdb9215b6eae1d5a21bf6bdac..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5T5OiO%b671-&hJE!bMC7B8Xd3mDOZN^MNhV9ZL>*h4Agt}o<^_&m<+ zZl&6K5>#{sX20FpnS}YW>|_8yw9}vtPyql3l`vPvVUEx~>5Sy8rGO~(GXj5@-20L8 zXMEZ4I~kyFR|FqY2qA*$@AvHAM`1Fk)m}s{Unnk=oRYKXTzEHX=ncAqG--GHXS6z2 zD)J}uu748thMmgtzKRFkFz)qrKp6DU<@_{^12t@`G!7CS>zNIw?36o|wb7{FYF6cD zqdBh1QN31cRpr)pYdkJHE9)D3N1fZ~E>@3*Nr6XB%aX+jJmF+2r3bGcM=HKXD@EiX zfj(SK9n>F5bhFv0%KFakR};1Ad`NOa1tT-S3@`)p8L$_qvp9d7t;_&3@Q)dw`$3`- zx)w8o`s%t27)Jsr|wCGyQ4B`%oFrkPhRM-|nm~ga9>*rd`44QBdw)qhD z&BAslLcblKFSR=e*C3C~05kBOfxKB(sQw>*-v7T3;uSN%4E!kuMB%_YXk$vYw$3C+ vwN|2DqLPqbX7DWpC%P1)FO}j|R4r(iG(mJNW(LuM!ao9<1|FD!Z)M;W+cSo3 From 9be1acacdddf669c450e322c42caeb5feda57499 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Fri, 16 Dec 2022 20:07:59 +0200 Subject: [PATCH 15/17] bring back lost value histogram test file --- .../estimators/value_histogram.jl | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 test/probabilities/estimators/value_histogram.jl diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl new file mode 100644 index 000000000..eea5ff5f1 --- /dev/null +++ b/test/probabilities/estimators/value_histogram.jl @@ -0,0 +1,69 @@ +using Entropies +using Test +using Random + +@testset "Rectangular binning" begin + + x = Dataset(rand(Random.MersenneTwister(1234), 100_000, 2)) + push!(x, SVector(0, 0)) # ensure both 0 and 1 have values in, exactly. + push!(x, SVector(1, 1)) + # All these binnings should give approximately same probabilities + n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 + ε = nextfloat(0.1, 2) # this guarantees that we get the same as the `n` above! + + binnings = [ + RectangularBinning(n), + RectangularBinning(ε), + RectangularBinning([n, n]), + RectangularBinning([ε, ε]) + ] + + for bin in binnings + @testset "ϵ isa $(typeof(bin.ϵ))" begin + est = ValueHistogram(x, bin) + p = probabilities(x, est) + @test length(p) == 100 + @test all(e -> 0.009 ≤ e ≤ 0.011, p) + end + end + + @testset "Check rogue 1s" begin + b = RectangularBinning(0.1) # no `nextfloat` here, so the rogue (1, 1) is in extra bin! + p = probabilities(x, ValueHistogram(x, b)) + @test length(p) == 100 + 1 + @test p[end] ≈ 1/100_000 atol = 1e-5 + end + + @testset "vector" begin + x = rand(Random.MersenneTwister(1234), 100_000) + push!(x, 0, 1) + n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 + ε = nextfloat(0.1) # this guarantees that we get the same as the `n` above! + for bin in (RectangularBinning(n), RectangularBinning(ε)) + p = probabilities(x, ValueHistogram(x, bin)) + @test length(p) == 10 + @test all(e -> 0.09 ≤ e ≤ 0.11, p) + end + end + + # An extra bin might appear due to roundoff error after using nextfloat when + # constructing `RectangularBinEncoding`s. + # The following tests ensure with *some* certainty that this does not occur. + @testset "Rogue extra bins" begin + # Concrete examples where a rogue extra bin has appeared. + x1 = [0.5213236385155418, 0.03516318860292644, 0.5437726723245310, 0.52598710966469610, 0.34199879802511246, 0.6017129426606275, 0.6972844365031351, 0.89163995617220900, 0.39009862510518045, 0.06296038912844315, 0.9897176284081909, 0.7935001082966890, 0.890198448900077700, 0.11762640519877565, 0.7849413168095061, 0.13768932585886573, 0.50869900547793430, 0.18042178201388548, 0.28507312391861270, 0.96480406570924970] + x2 = [0.4125754262679051, 0.52844411982339560, 0.4535277505543355, 0.25502420827802674, 0.77862522996085940, 0.6081939026664078, 0.2628674795466387, 0.18846258495465185, 0.93320375283233840, 0.40093871561247874, 0.8032730760974603, 0.3531608285217499, 0.018436525139752136, 0.55541857934068420, 0.9907521337888632, 0.15382361136212420, 0.01774321666660561, 0.67569337507728300, 0.06130971689608822, 0.31417161558476836] + N = 10 + b = RectangularBinning(N) + rb1 = RectangularBinEncoding(x1, b, n_eps = 1) + rb2 = RectangularBinEncoding(x1, b, n_eps = 2) + @test encode(maximum(x1), rb1) == -1 # shouldn't occur, but does when tolerance is too low + @test encode(maximum(x1), rb2) == 10 + + rb1 = RectangularBinEncoding(x2, b, n_eps = 1) + rb2 = RectangularBinEncoding(x2, b, n_eps = 2) + @test encode(maximum(x2), rb1) == -1 # shouldn't occur, but does when tolerance is too low + @test encode(maximum(x2), rb2) == 10 + end + +end From f1e0df28b058957dbf9301873252328417d37073 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Fri, 16 Dec 2022 20:33:17 +0200 Subject: [PATCH 16/17] update api to correct order for probs --- src/probabilities.jl | 1 + src/probabilities_estimators/histograms/value_histogram.jl | 4 ++-- test/probabilities/estimators/value_histogram.jl | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/probabilities.jl b/src/probabilities.jl index 0fb6dc36a..d615c996d 100644 --- a/src/probabilities.jl +++ b/src/probabilities.jl @@ -42,6 +42,7 @@ Base.IteratorSize(::Probabilities) = Base.HasLength() @inline Base.sum(::Probabilities{T}) where T = one(T) """ + ProbabilitiesEstimator The supertype for all probabilities estimators. In Entropies.jl, probability distributions are estimated from data by defining a set of diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index 5d2a0d9ed..c0de5ddbf 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -62,11 +62,11 @@ const VisitationFrequency = ValueHistogram # The source code of `ValueHistogram` operates as rather simple calls to # the underlying encoding and the `fasthist` function and extensions. # See the `rectangular_binning.jl` file for more. -function probabilities(x, est::ValueHistogram) +function probabilities(est::ValueHistogram, x) Probabilities(fasthist(x, est.encoding)[1]) end -function probabilities_and_outcomes(x, est::ValueHistogram) +function probabilities_and_outcomes(est::ValueHistogram, x) probs, bins = fasthist(x, est.encoding) # bins are integers here unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index eea5ff5f1..ad6891ff7 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -21,7 +21,7 @@ using Random for bin in binnings @testset "ϵ isa $(typeof(bin.ϵ))" begin est = ValueHistogram(x, bin) - p = probabilities(x, est) + p = probabilities(est, x) @test length(p) == 100 @test all(e -> 0.009 ≤ e ≤ 0.011, p) end @@ -29,7 +29,7 @@ using Random @testset "Check rogue 1s" begin b = RectangularBinning(0.1) # no `nextfloat` here, so the rogue (1, 1) is in extra bin! - p = probabilities(x, ValueHistogram(x, b)) + p = probabilities(ValueHistogram(x, b), x) @test length(p) == 100 + 1 @test p[end] ≈ 1/100_000 atol = 1e-5 end @@ -40,7 +40,7 @@ using Random n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 ε = nextfloat(0.1) # this guarantees that we get the same as the `n` above! for bin in (RectangularBinning(n), RectangularBinning(ε)) - p = probabilities(x, ValueHistogram(x, bin)) + p = probabilities(ValueHistogram(x, bin), x) @test length(p) == 10 @test all(e -> 0.09 ≤ e ≤ 0.11, p) end From 8a322a920aee325b04cebd3f87b4ce5982a3aec0 Mon Sep 17 00:00:00 2001 From: George Datseris Date: Fri, 16 Dec 2022 20:48:54 +0200 Subject: [PATCH 17/17] all tests are passing --- src/probabilities.jl | 4 ++-- .../histograms/value_histogram.jl | 2 +- test/probabilities/estimators/value_histogram.jl | 8 ++++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/probabilities.jl b/src/probabilities.jl index d615c996d..d72b13efd 100644 --- a/src/probabilities.jl +++ b/src/probabilities.jl @@ -42,7 +42,7 @@ Base.IteratorSize(::Probabilities) = Base.HasLength() @inline Base.sum(::Probabilities{T}) where T = one(T) """ - ProbabilitiesEstimator +ProbabilitiesEstimator The supertype for all probabilities estimators. In Entropies.jl, probability distributions are estimated from data by defining a set of @@ -204,5 +204,5 @@ Equivalent with `probabilities_and_outcomes(x, est)[2]`, but for some estimators it may be explicitly extended for better performance. """ function outcomes(x, est::ProbabilitiesEstimator) - return probabilities_and_outcomes(x, est)[2] + return probabilities_and_outcomes(est, x)[2] end diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index c0de5ddbf..4613e1ffa 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -70,7 +70,7 @@ function probabilities_and_outcomes(est::ValueHistogram, x) probs, bins = fasthist(x, est.encoding) # bins are integers here unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: - outcomes = map(b -> decode(b, encoder), bins) + outcomes = map(b -> decode(b, est.encoding), bins) return Probabilities(probs), vec(outcomes) end diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index ad6891ff7..058f6833a 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -24,6 +24,14 @@ using Random p = probabilities(est, x) @test length(p) == 100 @test all(e -> 0.009 ≤ e ≤ 0.011, p) + + p2, o = probabilities_and_outcomes(est, x) + @test p2 == p + @test o isa Vector{SVector{2, Float64}} + @test length(o) == length(p) + @test all(x -> x < 1, maximum(o)) + o2 = outcomes(x, est) + @test o2 == o end end