diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 9ef012718..000000000 Binary files a/.DS_Store and /dev/null differ diff --git a/Project.toml b/Project.toml index 4c47991d1..760a5dbbe 100644 --- a/Project.toml +++ b/Project.toml @@ -16,19 +16,21 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Scratch = "6c6a2e73-6563-6170-7368-637461726353" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StateSpaceSets = "40b095a5-5852-4c12-98c7-d43bf788e795" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Wavelets = "29a6e085-ba6d-5f35-a997-948ac2efa89a" [compat] Combinatorics = "1" -DelayEmbeddings = "2.5" +DelayEmbeddings = "2.6" Distances = "0.9, 0.10" FFTW = "1" Neighborhood = "0.2.4" QuadGK = "2" Scratch = "1" SpecialFunctions = "0.10, 1.0, 2" +StateSpaceSets = "0.1.2, 1" StaticArrays = "0.12, 1.0" Wavelets = "0.9" julia = "1.5" diff --git a/docs/.DS_Store b/docs/.DS_Store deleted file mode 100644 index eee7acbe1..000000000 Binary files a/docs/.DS_Store and /dev/null differ diff --git a/docs/src/devdocs.md b/docs/src/devdocs.md index 7da105655..d2a1569d3 100644 --- a/docs/src/devdocs.md +++ b/docs/src/devdocs.md @@ -7,10 +7,11 @@ Good practices in developing a code base apply in every Pull Request. The [Good ### Mandatory steps 1. Decide on the outcome space and how the estimator will map probabilities to outcomes. 2. Define your type and make it subtype `ProbabilitiesEstimator`. -3. Add a docstring to your type following the style of the docstrings of other estimators. -4. Implement dispatch for [`probabilities_and_outcomes`](@ref). -5. Implement dispatch for [`outcome_space`](@ref). -6. Add your type to the list in the docstring of [`ProbabilitiyEstimator`](@ref). +4. Add a docstring to your type following the style of the docstrings of other estimators. +5. If suitable, the estimator may be able to operate based on [`Encoding`]s. If so, it is preferred to implement an `Encoding` subtype and extend the methods [`encode`](@ref) and [`decode`](@ref). This will allow your probabilities estimator to be used with a larger span of entropy and complexity methods without additional effort. +6. Implement dispatch for [`probabilities_and_outcomes`](@ref) and your probabilities estimator type. +7. Implement dispatch for [`outcome_space`](@ref) and your probabilities estimator type. +8. Add your probabilities estimator type to the list in the docstring of [`ProbabilitiyEstimator`](@ref), and if you also made an encoding, add it to the [`Encoding`](@ref) docstring. ### Optional steps You may extend any of the following functions if there are potential performance benefits in doing so: diff --git a/src/Entropies.jl b/src/Entropies.jl index febe2688e..9d12ab4e3 100644 --- a/src/Entropies.jl +++ b/src/Entropies.jl @@ -9,19 +9,21 @@ To install it, run `import Pkg; Pkg.add("Entropies")`. """ module Entropies -using DelayEmbeddings -using DelayEmbeddings: AbstractDataset, Dataset, dimension -export AbstractDataset, Dataset -export DelayEmbeddings +using StateSpaceSets +export Dataset, SVector +using DelayEmbeddings: embed, genembed + const Array_or_Dataset = Union{<:AbstractArray{<:Real}, <:AbstractDataset} const Vector_or_Dataset = Union{<:AbstractVector{<:Real}, <:AbstractDataset} +# Core API types and functions include("probabilities.jl") include("entropy.jl") -include("encoding/outcomes.jl") +include("encodings.jl") +# Library implementations (files include other files) include("probabilities_estimators/probabilities_estimators.jl") include("entropies/entropies.jl") - +include("encoding/all_encodings.jl") include("deprecations.jl") diff --git a/src/encoding/all_encodings.jl b/src/encoding/all_encodings.jl new file mode 100644 index 000000000..710880726 --- /dev/null +++ b/src/encoding/all_encodings.jl @@ -0,0 +1,3 @@ +include("utils.jl") +include("gaussian_cdf.jl") +include("ordinal_pattern.jl") diff --git a/src/encoding/outcomes.jl b/src/encoding/outcomes.jl deleted file mode 100644 index 807d32716..000000000 --- a/src/encoding/outcomes.jl +++ /dev/null @@ -1,42 +0,0 @@ -export Encoding - -""" - Encoding - -The supertype for all encoding schemes, i.e. ways of encoding input data onto some -discrete set of possibilities/[outcomes](@ref). Implemented encoding schemes are - -- [`OrdinalPatternEncoding`](@ref). -- [`GaussianCDFEncoding`](@ref). -- [`RectangularBinEncoding`](@ref). - -Used internally by the various [`ProbabilitiesEstimator`](@ref)s to map input data onto -outcome spaces, over which probabilities are computed. -""" -abstract type Encoding end - -# TODO: Outcome interface for encodings is not yet defined. -# When we actually define the encodings API, ALL encodings will return -# integers. - -#= -""" - outcomes(x, scheme::Encoding) → Vector{Int} - outcomes!(s, x, scheme::Encoding) → Vector{Int} - -Map each`xᵢ ∈ x` to a distinct outcome according to the encoding `scheme`. - -Optionally, write outcomes into the pre-allocated symbol vector `s` if the `scheme` -allows for it. For usage examples, see individual encoding scheme docstrings. - -See also: [`RectangularBinEncoding`](@ref), [`GaussianCDFEncoding`](@ref), -[`OrdinalPatternEncoding`](@ref). -""" -function outcomes(x::X, ::Encoding) where X - throw(ArgumentError("`outcomes` not defined for input data of type $(X).")) -end -=# - -include("utils.jl") -include("gaussian_cdf.jl") -include("ordinal_pattern.jl") diff --git a/src/encodings.jl b/src/encodings.jl new file mode 100644 index 000000000..adf7a85de --- /dev/null +++ b/src/encodings.jl @@ -0,0 +1,31 @@ +export Encoding, encode, decode + +""" + Encoding + +The supertype for all encoding schemes. Encodings **always encode elements of +input data into the positive integers**. The encoding API is defined by the +functions [`encode`](@ref) and [`decode`](@ref). +Some probability estimators utilize encodings internally. + +Current available encodings are: + +- [`OrdinalPatternEncoding`](@ref). +- [`GaussianCDFEncoding`](@ref). +- [`RectangularBinEncoding`](@ref). +""" +abstract type Encoding end + +""" + encode(χ, e::Encoding) -> i::Int +Encoding an element `χ ∈ x` of input data `x` (those given to [`probabilities`](@ref)) +using encoding `e`. +""" +function encode end + +""" + decode(i::Int, e::Encoding) -> ω +Decode an encoded element `i::Int` into the outcome it corresponds to `ω ∈ Ω`. +`Ω` is the [`outcome_space`](@ref) of a probabilities estimator that uses encoding `e`. +""" +function decode end \ No newline at end of file diff --git a/src/probabilities.jl b/src/probabilities.jl index 0fb6dc36a..d72b13efd 100644 --- a/src/probabilities.jl +++ b/src/probabilities.jl @@ -42,6 +42,7 @@ Base.IteratorSize(::Probabilities) = Base.HasLength() @inline Base.sum(::Probabilities{T}) where T = one(T) """ +ProbabilitiesEstimator The supertype for all probabilities estimators. In Entropies.jl, probability distributions are estimated from data by defining a set of @@ -203,5 +204,5 @@ Equivalent with `probabilities_and_outcomes(x, est)[2]`, but for some estimators it may be explicitly extended for better performance. """ function outcomes(x, est::ProbabilitiesEstimator) - return probabilities_and_outcomes(x, est)[2] + return probabilities_and_outcomes(est, x)[2] end diff --git a/src/probabilities_estimators/histograms/rectangular_binning.jl b/src/probabilities_estimators/histograms/rectangular_binning.jl index 09c322cfa..1de4e7bf1 100644 --- a/src/probabilities_estimators/histograms/rectangular_binning.jl +++ b/src/probabilities_estimators/histograms/rectangular_binning.jl @@ -1,19 +1,24 @@ export RectangularBinning, FixedRectangularBinning export RectangularBinEncoding +abstract type AbstractBinning end +abstract type HistogramEncoding <: Encoding end + ################################################################## # Structs and docstrings ################################################################## # Notice that the binning types are intermediate structs that are NOT retained # in the source code. Their only purpose is instructions of how to create a # `RectangularBinEncoder`. All actual source code functionality of `ValueHistogram` -# is implemented based on `RectangularBinEncoder` and +# is implemented based on `RectangularBinEncoder`. """ RectangularBinning(ϵ) <: AbstractBinning Rectangular box partition of state space using the scheme `ϵ`, deducing the coordinates of the grid axis minima from the input data. +Generally it is preferred to use [`FixedRectangularBinning`](@ref) instead, +as it has a well defined outcome space without knowledge of input data. Binning instructions are deduced from the type of `ϵ` as follows: @@ -38,17 +43,20 @@ const ValidFixedBinInputs = Union{Number, NTuple} Rectangular box partition of state space where the extent of the grid is explicitly specified by `ϵmin` and `emax`, and along each dimension, the grid is subdivided into `N` -subintervals. +subintervals. Points falling outside the partition do not attribute to probabilities. Binning instructions are deduced from the types of `ϵmin`/`emax` as follows: -1. `E<:Float64` sets the grid range along all dimensions to to `[ϵmin, ϵmax]`. -2. `E::NTuple{D, Float64}` sets ranges along each dimension individually, i.e. +1. `E<:Real` sets the grid range along all dimensions to to `[ϵmin, ϵmax]`. +2. `E::NTuple{D, <:Real}` sets ranges along each dimension individually, i.e. `[ϵmin[i], ϵmax[i]]` is the range along the `i`-th dimension. If the grid spans the range `[r1, r2]` along a particular dimension, then this range is subdivided into `N` subintervals of equal length `nextfloat((r2 - r1)/N)`. Thus, for `D`-dimensional data, there are `N^D` boxes. + +Generally it is preferred to use the second method (`E::NTuple{D, <:Real}`) which +has a well defined outcome space irrespectively of input data. """ struct FixedRectangularBinning{E} <: AbstractBinning ϵmin::E @@ -58,7 +66,7 @@ struct FixedRectangularBinning{E} <: AbstractBinning function FixedRectangularBinning(ϵmin::E1, ϵmax::E2, N::Int) where {E1 <: ValidFixedBinInputs, E2 <: ValidFixedBinInputs} f_ϵmin = float.(ϵmin) f_ϵmax = float.(ϵmax) - return new{typeof(f_ϵmin)}(f_ϵmin, f_ϵmax, N::Int) + return new{typeof(f_ϵmin)}(f_ϵmin, f_ϵmax, N) end end @@ -79,160 +87,122 @@ information as `ϵmin/max` is already an `NTuple`. See also: [`RectangularBinning`](@ref), [`FixedRectangularBinning`](@ref). """ -struct RectangularBinEncoding{B, V, E} <: Encoding +struct RectangularBinEncoding{B, D, T, C, L} <: HistogramEncoding binning::B # either RectangularBinning or FixedRectangularBinning - mini::V # fields are either static vectors or numbers - edgelengths::E + mini::SVector{D,T} # fields are either static vectors or numbers + edgelengths::SVector{D,T} + histsize::SVector{D,Int} + ci::C # cartesian indices + li::L # linear indices end function Base.show(io::IO, x::RectangularBinEncoding) return print(io, "RectangularBinEncoding\n" * - " binning: $(x.binning) \n" * " box corners: $(x.mini)\n" * - " edgelengths: $(x.edgelengths)" + " edgelengths: $(x.edgelengths)\n" * + " histogram size: $(x.histsize)" ) end -function encode_as_bin(point, b::RectangularBinEncoding) - (; mini, edgelengths) = b - # Map a data point to its bin edge - return floor.(Int, (point .- mini) ./ edgelengths) +function encode(point, e::RectangularBinEncoding) + (; mini, edgelengths) = e + # Map a data point to its bin edge (plus one because indexing starts from 1) + bin = floor.(Int, (point .- mini) ./ edgelengths) .+ 1 + cartidx = CartesianIndex(Tuple(bin)) + # We have decided on the arbitrary convention that out of bound points + # will get the special symbol `-1`. Erroring doesn't make sense as it is expected + # that for fixed histograms there may be points outside of them. + if checkbounds(Bool, e.li, cartidx) + return @inbounds e.li[cartidx] + else + return -1 + end end -function decode_from_bin(bin, b::RectangularBinEncoding{B, V}) where {B, V} - (; mini, edgelengths) = b +function decode(bin::Int, e::RectangularBinEncoding{B, D, T}) where {B, D, T} + V = SVector{D,T} + if checkbounds(Bool, e.ci, bin) + @inbounds cartesian = e.ci[bin] + else + error("Cannot decode integer $(bin): out of bounds of underlying histogram.") + end + (; mini, edgelengths) = e # Remove one because we want lowest value corner, and we get indices starting from 1 - return (V(Tuple(bin)) .- 1) .* edgelengths .+ mini -end -function decode_from_bin(bin, b::RectangularBinEncoding{B, T}) where {B, T<:Real} - (; mini, edgelengths) = b - return (T(Tuple(bin)[1]) - 1)*edgelengths + mini + return (V(Tuple(cartesian)) .- 1) .* edgelengths .+ mini end ################################################################## -# Encoding bins using a *floating* (i.e. controlled by data) grid +# Initialization of encodings ################################################################## -function RectangularBinEncoding(x::AbstractDataset{D,T}, b::RectangularBinning; - n_eps = 2) where {D, T} +# Data-controlled grid +function RectangularBinEncoding(x, b::RectangularBinning; n_eps = 2) # This function always returns static vectors and is type stable + D = dimension(x) + T = eltype(x) ϵ = b.ϵ mini, maxi = minmaxima(x) v = ones(SVector{D,T}) if ϵ isa Float64 || ϵ isa AbstractVector{<:AbstractFloat} edgelengths = SVector{D,T}(ϵ .* v) + histsize = ceil.(Int, nextfloat.((maxi .- mini), n_eps) ./ edgelengths) elseif ϵ isa Int || ϵ isa Vector{Int} edgeslengths_nonadjusted = @. (maxi - mini)/ϵ # Just taking nextfloat once here isn't enough for bins to cover data when using # `encode_as_bin` later, because subtraction and division leads to loss # of precision. We need a slightly bigger number, so apply nextfloat twice. edgelengths = SVector{D,T}(nextfloat.(edgeslengths_nonadjusted, n_eps)) + if ϵ isa Vector{Int} + histsize = SVector{D, Int}(ϵ) + else + histsize = SVector{D, Int}(fill(ϵ, D)) + end else error("Invalid ϵ for binning of a dataset") end - - RectangularBinEncoding(b, mini, edgelengths) -end - -function RectangularBinEncoding(x::AbstractVector{<:Real}, b::RectangularBinning; n_eps = 2) - # This function always returns numbers and is type stable - ϵ = b.ϵ - mini, maxi = extrema(x) - if ϵ isa AbstractFloat - edgelength = ϵ - elseif ϵ isa Int - edgeslength_nonadjusted = (maxi - mini)/ϵ - # Round-off occurs when encoding bins. Applying `nextfloat` twice seems to still - # ensure that bins cover data. See comment above. - edgelength = nextfloat(edgeslength_nonadjusted, n_eps) - else - error("Invalid ϵ for binning of a vector") - end - - RectangularBinEncoding(b, mini, edgelength) -end - -################################################################## -# Encoding bins using a fixed (user-specified) grid -################################################################## -function RectangularBinEncoding(::AbstractVector{<:Real}, - b::FixedRectangularBinning{E}; n_eps = 2) where E - - # This function always returns numbers and is type stable - ϵmin, ϵmax = b.ϵmin, b.ϵmax - mini = ϵmin - if ϵmin isa AbstractFloat && ϵmax isa AbstractFloat - edgelength_nonadjusted = (ϵmax - ϵmin) / b.N - edgelength = nextfloat(edgelength_nonadjusted, n_eps) - else - error("Invalid ϵmin or ϵmax for binning of a vector") - end - - RectangularBinEncoding(b, mini, edgelength) + # Cartesian indices of the underlying histogram + ci = CartesianIndices(Tuple(histsize)) + li = LinearIndices(ci) + RectangularBinEncoding(b, mini, edgelengths, histsize, ci, li) end -function RectangularBinEncoding(::AbstractDataset{D, T}, - b::FixedRectangularBinning{E}, n_eps = 2) where {D, T, E} +# fixed grid +function RectangularBinEncoding(x, b::FixedRectangularBinning{E}; n_eps = 2) where {E} + D = dimension(x) + T = eltype(x) + D ≠ length(E) && error("Dimension of data and fixed rectangular binning don't match!") # This function always returns static vectors and is type stable ϵmin, ϵmax = b.ϵmin, b.ϵmax - if E <: Float64 - mini = SVector{D, Float64}(repeat([ϵmin], D)) - maxi = SVector{D, Float64}(repeat([ϵmax], D)) + if E <: Real + mini = SVector{D, T}(repeat([ϵmin], D)) + maxi = SVector{D, T}(repeat([ϵmax], D)) elseif E <: NTuple{D} - mini = SVector{D, Float64}(ϵmin) - maxi = SVector{D, Float64}(ϵmax) + mini = SVector{D, T}(ϵmin) + maxi = SVector{D, T}(ϵmax) else error("Invalid ϵmin or ϵmax for binning of a dataset") end - edgelengths_nonadjusted = @. (maxi .- mini) / b.N edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) - - RectangularBinEncoding(b, mini, edgelengths) + histsize = SVector{D,Int}(fill(b.N, D)) + ci = CartesianIndices(Tuple(histsize)) + li = LinearIndices(ci) + RectangularBinEncoding(b, mini, edgelengths, histsize, ci, li) end + # This version exists if the given `ϵ`s are already tuples. # Then, the dataset doesn't need to be provided. -function RectangularBinEncoding(b::FixedRectangularBinning{<:NTuple}, n_eps = 2) - ϵmin, ϵmax = b.ϵmin, b.ϵmax - D = length(ϵmin) - mini = SVector{D, Float64}(ϵmin) - maxi = SVector{D, Float64}(ϵmax) - edgelengths_nonadjusted = @. (maxi .- mini) / b.N - edgelengths = nextfloat.(edgelengths_nonadjusted, n_eps) - RectangularBinEncoding(b, mini, edgelengths) +function RectangularBinEncoding(b::FixedRectangularBinning{<:NTuple{D,T}}; n_eps = 2) where {D,T} + return RectangularBinEncoding(Dataset{D,T}(), b; n_eps) end ################################################################## -# Outcomes / total outcomes / extension of functions +# Outcomes / total outcomes ################################################################## -# When the grid is fixed by the user, we can always deduce the total number of bins, -# even just from the binning itself - symbolization info not needed. -total_outcomes(x, bin::AbstractBinning) = total_outcomes(RectangularBinEncoding(x, bin)) -function total_outcomes(e::RectangularBinEncoding) - if e.binning isa RectangularBinning - error("Not possible to _uniquely_ define for `RectangularBinning`.") - end - D = length(e.mini) - return e.binning.N^D -end +total_outcomes(e::RectangularBinEncoding) = prod(e.histsize) -# This function does not need `x`; all info about binning are in the encoding -outcome_space(x, bin::AbstractBinning) = outcome_space(RectangularBinEncoding(x, bin)) function outcome_space(e::RectangularBinEncoding) - if e.binning isa RectangularBinning - error("Not possible to _uniquely_ define for `RectangularBinning`.") - end - # We can be smart here. All possible bins are exactly the same thing - # as the Cartesian Indices of an array, mapped into "data" units - dims = _array_dims_from_fixed_binning(e) - bins = CartesianIndices(dims) - outcomes = map(b -> decode_from_bin(b, e), bins) - return vec(outcomes) -end - -function _array_dims_from_fixed_binning(e) - N = e.binning.N - D = length(e.mini) - return ntuple(i -> N, D) + # this is super simple :P could be optimized but its not a frequent operation + return [decode(i, e) for i in 1:total_outcomes(e)] end ################################################################## @@ -240,13 +210,21 @@ end ################################################################## # This method is called by `probabilities(est::ValueHistogram, x::Array_or_Dataset)` """ - fasthist(x::Vector_or_Dataset, ϵ::AbstractBinning) -Create an encoding for binning, then map `x` to bins, then call `fasthist!` on the bins. -Return the output probabilities, the bins, and the created encoder. + fasthist(x::Vector_or_Dataset, ϵ::RectangularBinEncoding) +Intermediate method that runs `fasthist!` in the encoded space +and returns the encoded space histogram (counts) and corresponding bins. +Also skips any instances of out-of-bound points for the histogram. """ -function fasthist(x::Vector_or_Dataset, ϵ::AbstractBinning) - encoder = RectangularBinEncoding(x, ϵ) - bins = map(y -> encode_as_bin(y, encoder), x) +function fasthist(x, encoder::RectangularBinEncoding) + bins = map(y -> encode(y, encoder), x) + # We discard `-1`, as it encodes points outside the histogram limit + # (which should only happen for `Fixed` binnings) + discard_minus_ones!(bins) hist = fasthist!(bins) - return Probabilities(hist), bins, encoder + return hist, bins +end + +function discard_minus_ones!(bins) + idxs = findall(isequal(-1), bins) + deleteat!(bins, idxs) end diff --git a/src/probabilities_estimators/histograms/value_histogram.jl b/src/probabilities_estimators/histograms/value_histogram.jl index 8ae57a87c..4613e1ffa 100644 --- a/src/probabilities_estimators/histograms/value_histogram.jl +++ b/src/probabilities_estimators/histograms/value_histogram.jl @@ -1,18 +1,11 @@ -export ValueHistogram, VisitationFrequency, AbstractBinning - -""" - AbstractBinning - -The supertype of all binning schemes. -""" -abstract type AbstractBinning end - +export ValueHistogram, VisitationFrequency # We need binning to be defined first to add it as a field to a struct include("rectangular_binning.jl") include("fasthist.jl") """ - ValueHistogram(b::AbstractBinning) <: ProbabilitiesEstimator + ValueHistogram(x, b::AbstractBinning) <: ProbabilitiesEstimator + ValueHistogram(b::FixedRectangularBinning) <: ProbabilitiesEstimator A probability estimator based on binning the values of the data as dictated by the binning scheme `b` and formally computing their histogram, i.e., @@ -21,6 +14,9 @@ Available binnings are: - [`RectangularBinning`](@ref) - [`FixedRectangularBinning`](@ref) +Notice that if not using the fixed binning, `x` (the input data) must also be given +to the estimator, as it is not possible to deduce histogram size only from the binning. + The `ValueHistogram` estimator has a linearithmic time complexity (`n log(n)` for `n = length(x)`) and a linear space complexity (`l` for `l = dimension(x)`). This allows computation of probabilities (histograms) of high-dimensional @@ -28,24 +24,33 @@ datasets and with small box sizes `ε` without memory overflow and with maximum For performance reasons, the probabilities returned never contain 0s and are arbitrarily ordered. - ValueHistogram(ϵ::Union{Real,Vector}) + ValueHistogram(x, ϵ::Union{Real,Vector}) A convenience method that accepts same input as [`RectangularBinning`](@ref) and initializes this binning directly. ## Outcomes -The outcome space for `ValueHistogram` is the set of unique bins constructed +The outcome space for `ValueHistogram` is the unique bins constructed from `b`. Each bin is identified by its left (lowest-value) corner. The bins are in data units, not integer (cartesian indices units), and are returned as `SVector`s. See also: [`RectangularBinning`](@ref). """ -struct ValueHistogram{B<:AbstractBinning} <: ProbabilitiesEstimator - binning::B +struct ValueHistogram{H<:HistogramEncoding} <: ProbabilitiesEstimator + encoding::H end ValueHistogram(ϵ::Union{Real,Vector}) = ValueHistogram(RectangularBinning(ϵ)) +function ValueHistogram(x, b::AbstractBinning) + encoding = RectangularBinEncoding(x, b) + return ValueHistogram(encoding) +end +function ValueHistogram(b::FixedRectangularBinning) + encoding = RectangularBinEncoding(b) + return ValueHistogram(encoding) +end + """ VisitationFrequency @@ -54,24 +59,20 @@ An alias for [`ValueHistogram`](@ref). """ const VisitationFrequency = ValueHistogram -# For organizational outcomes we extend methods here. However, their -# source code in truth is in the binnings file using the bin encoding - -# This method is only valid for rectangular binnings, as `fasthist` -# is only valid for rectangular binnings. For more binnings, it needs to be extended. -function probabilities(est::ValueHistogram, x::Array_or_Dataset) - # and the `fasthist` actually just makes an encoding, - # this function is in `rectangular_binning.jl` - fasthist(x, est.binning)[1] +# The source code of `ValueHistogram` operates as rather simple calls to +# the underlying encoding and the `fasthist` function and extensions. +# See the `rectangular_binning.jl` file for more. +function probabilities(est::ValueHistogram, x) + Probabilities(fasthist(x, est.encoding)[1]) end -function probabilities_and_outcomes(est::ValueHistogram, x::Array_or_Dataset) - probs, bins, encoder = fasthist(x, est.binning) +function probabilities_and_outcomes(est::ValueHistogram, x) + probs, bins = fasthist(x, est.encoding) # bins are integers here unique!(bins) # `bins` is already sorted from `fasthist!` # Here we transfor the cartesian coordinate based bins into data unit bins: - outcomes = map(b -> decode_from_bin(b, encoder), bins) - return probs, outcomes + outcomes = map(b -> decode(b, est.encoding), bins) + return Probabilities(probs), vec(outcomes) end -outcome_space(x, est::ValueHistogram) = outcome_space(x, est.binning) -total_outcomes(x, est::ValueHistogram) = total_outcomes(x, est.binning) +outcome_space(x, est::ValueHistogram) = outcome_space(est.encoding) +total_outcomes(x, est::ValueHistogram) = total_outcomes(est.encoding) diff --git a/test/Project.toml b/test/Project.toml index 1a744b4d1..bf1ae3a25 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,6 +4,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" DynamicalSystemsBase = "6e36e845-645a-534a-86f2-f5d4aa5a06b4" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +StateSpaceSets = "40b095a5-5852-4c12-98c7-d43bf788e795" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/probabilities/estimators/permutation.jl b/test/probabilities/estimators/permutation.jl index 1cd76aac1..2393581d4 100644 --- a/test/probabilities/estimators/permutation.jl +++ b/test/probabilities/estimators/permutation.jl @@ -1,3 +1,7 @@ +using StateSpaceSets: Dataset +using DelayEmbeddings: genembed +using StaticArrays: SVector + @test SymbolicPermutation() isa SymbolicPermutation @test SymbolicPermutation(lt = Base.isless) isa SymbolicPermutation @test SymbolicPermutation(lt = Entropies.isless_rand) isa SymbolicPermutation diff --git a/test/probabilities/estimators/transfer_operator.jl b/test/probabilities/estimators/transfer_operator.jl index e4c0bac00..a5429495c 100644 --- a/test/probabilities/estimators/transfer_operator.jl +++ b/test/probabilities/estimators/transfer_operator.jl @@ -1,3 +1,5 @@ +using StateSpaceSets: Dataset + @test TransferOperator(RectangularBinning(3)) isa TransferOperator D = Dataset(rand(100, 2)) diff --git a/test/probabilities/estimators/value_histogram.jl b/test/probabilities/estimators/value_histogram.jl index 051c11232..058f6833a 100644 --- a/test/probabilities/estimators/value_histogram.jl +++ b/test/probabilities/estimators/value_histogram.jl @@ -1,81 +1,77 @@ -using Entropies -using Entropies.DelayEmbeddings, Test -using Random - -@testset "Rectangular binning" begin - - x = Dataset(rand(Random.MersenneTwister(1234), 100_000, 2)) - push!(x, SVector(0, 0)) # ensure both 0 and 1 have values in, exactly. - push!(x, SVector(1, 1)) - # All these binnings should give approximately same probabilities - n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 - ε = nextfloat(0.1) # this guarantees that we get the same as the `n` above! - - binnings = [ - RectangularBinning(n), - RectangularBinning(ε), - RectangularBinning([n, n]), - RectangularBinning([ε, ε]) - ] - - for bin in binnings - @testset "ϵ = $(bin.ϵ)" begin - est = ValueHistogram(bin) - p = probabilities(est, x) - @test length(p) == 100 - @test all(e -> 0.009 ≤ e ≤ 0.011, p) - end - end - - @testset "Check rogue 1s" begin - b = RectangularBinning(0.1) # no `nextfloat` here, so the rogue (1, 1) is in extra bin! - p = probabilities(ValueHistogram(b), x) - @test length(p) == 100 + 1 - @test p[end] ≈ 1/100_000 atol = 1e-5 - end - - @testset "vector" begin - x = rand(Random.MersenneTwister(1234), 100_000) - push!(x, 0, 1) - n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 - ε = nextfloat(0.1) # this guarantees that we get the same as the `n` above! - binnings = RectangularBinning.((n, ε)) - for bin in binnings - p = probabilities(ValueHistogram(bin), x) - @test length(p) == 10 - @test all(e -> 0.09 ≤ e ≤ 0.11, p) - end - end - - # An extra bin might appear due to roundoff error after using nextfloat when - # constructing `RectangularBinEncoding`s. - # The following tests ensure with *some* certainty that this does not occur. - @testset "Rogue extra bins" begin - rng = MersenneTwister(1234) - xs1D = [rand(rng, 20) for i = 1:10000]; - xs2D = [rand(rng, 1000, 2) |> Dataset for i = 1:10000] # more points to fill all bins - est = ValueHistogram(RectangularBinning(10)) - ps1D = [probabilities(est, x) for x in xs1D]; - ps2D = [probabilities(est, x) for x in xs2D]; - n_rogue_extrabin_1D = count(length.(ps1D) .> est.binning.ϵ) - n_rogue_extrabin_2D = count(length.(ps2D) .> est.binning.ϵ^2) - @test n_rogue_extrabin_1D == 0 - @test n_rogue_extrabin_2D == 0 - - # Concrete examples where a rogue extra bin has appeared. - x1 = [0.5213236385155418, 0.03516318860292644, 0.5437726723245310, 0.52598710966469610, 0.34199879802511246, 0.6017129426606275, 0.6972844365031351, 0.89163995617220900, 0.39009862510518045, 0.06296038912844315, 0.9897176284081909, 0.7935001082966890, 0.890198448900077700, 0.11762640519877565, 0.7849413168095061, 0.13768932585886573, 0.50869900547793430, 0.18042178201388548, 0.28507312391861270, 0.96480406570924970] - x2 = [0.4125754262679051, 0.52844411982339560, 0.4535277505543355, 0.25502420827802674, 0.77862522996085940, 0.6081939026664078, 0.2628674795466387, 0.18846258495465185, 0.93320375283233840, 0.40093871561247874, 0.8032730760974603, 0.3531608285217499, 0.018436525139752136, 0.55541857934068420, 0.9907521337888632, 0.15382361136212420, 0.01774321666660561, 0.67569337507728300, 0.06130971689608822, 0.31417161558476836] - N = 10 - b = RectangularBinning(N) - rb1 = RectangularBinEncoding(x1, b, n_eps = 1) - rb2 = RectangularBinEncoding(x1, b, n_eps = 2) - @test Entropies.encode_as_bin(maximum(x1), rb1) == 10 # shouldn't occur, but does when tolerance is too low - @test Entropies.encode_as_bin(maximum(x1), rb2) == 9 - - rb1 = RectangularBinEncoding(x2, b, n_eps = 1) - rb2 = RectangularBinEncoding(x2, b, n_eps = 2) - @test Entropies.encode_as_bin(maximum(x2), rb1) == 10 # shouldn't occur, but does when tolerance is too low - @test Entropies.encode_as_bin(maximum(x2), rb2) == 9 - end - -end +using Entropies +using Test +using Random + +@testset "Rectangular binning" begin + + x = Dataset(rand(Random.MersenneTwister(1234), 100_000, 2)) + push!(x, SVector(0, 0)) # ensure both 0 and 1 have values in, exactly. + push!(x, SVector(1, 1)) + # All these binnings should give approximately same probabilities + n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 + ε = nextfloat(0.1, 2) # this guarantees that we get the same as the `n` above! + + binnings = [ + RectangularBinning(n), + RectangularBinning(ε), + RectangularBinning([n, n]), + RectangularBinning([ε, ε]) + ] + + for bin in binnings + @testset "ϵ isa $(typeof(bin.ϵ))" begin + est = ValueHistogram(x, bin) + p = probabilities(est, x) + @test length(p) == 100 + @test all(e -> 0.009 ≤ e ≤ 0.011, p) + + p2, o = probabilities_and_outcomes(est, x) + @test p2 == p + @test o isa Vector{SVector{2, Float64}} + @test length(o) == length(p) + @test all(x -> x < 1, maximum(o)) + o2 = outcomes(x, est) + @test o2 == o + end + end + + @testset "Check rogue 1s" begin + b = RectangularBinning(0.1) # no `nextfloat` here, so the rogue (1, 1) is in extra bin! + p = probabilities(ValueHistogram(x, b), x) + @test length(p) == 100 + 1 + @test p[end] ≈ 1/100_000 atol = 1e-5 + end + + @testset "vector" begin + x = rand(Random.MersenneTwister(1234), 100_000) + push!(x, 0, 1) + n = 10 # boxes cover 0 - 1 in steps of slightly more than 0.1 + ε = nextfloat(0.1) # this guarantees that we get the same as the `n` above! + for bin in (RectangularBinning(n), RectangularBinning(ε)) + p = probabilities(ValueHistogram(x, bin), x) + @test length(p) == 10 + @test all(e -> 0.09 ≤ e ≤ 0.11, p) + end + end + + # An extra bin might appear due to roundoff error after using nextfloat when + # constructing `RectangularBinEncoding`s. + # The following tests ensure with *some* certainty that this does not occur. + @testset "Rogue extra bins" begin + # Concrete examples where a rogue extra bin has appeared. + x1 = [0.5213236385155418, 0.03516318860292644, 0.5437726723245310, 0.52598710966469610, 0.34199879802511246, 0.6017129426606275, 0.6972844365031351, 0.89163995617220900, 0.39009862510518045, 0.06296038912844315, 0.9897176284081909, 0.7935001082966890, 0.890198448900077700, 0.11762640519877565, 0.7849413168095061, 0.13768932585886573, 0.50869900547793430, 0.18042178201388548, 0.28507312391861270, 0.96480406570924970] + x2 = [0.4125754262679051, 0.52844411982339560, 0.4535277505543355, 0.25502420827802674, 0.77862522996085940, 0.6081939026664078, 0.2628674795466387, 0.18846258495465185, 0.93320375283233840, 0.40093871561247874, 0.8032730760974603, 0.3531608285217499, 0.018436525139752136, 0.55541857934068420, 0.9907521337888632, 0.15382361136212420, 0.01774321666660561, 0.67569337507728300, 0.06130971689608822, 0.31417161558476836] + N = 10 + b = RectangularBinning(N) + rb1 = RectangularBinEncoding(x1, b, n_eps = 1) + rb2 = RectangularBinEncoding(x1, b, n_eps = 2) + @test encode(maximum(x1), rb1) == -1 # shouldn't occur, but does when tolerance is too low + @test encode(maximum(x1), rb2) == 10 + + rb1 = RectangularBinEncoding(x2, b, n_eps = 1) + rb2 = RectangularBinEncoding(x2, b, n_eps = 2) + @test encode(maximum(x2), rb1) == -1 # shouldn't occur, but does when tolerance is too low + @test encode(maximum(x2), rb2) == 10 + end + +end diff --git a/test/utils/encoding.jl b/test/utils/encoding.jl index d5d39a0d4..466108be4 100644 --- a/test/utils/encoding.jl +++ b/test/utils/encoding.jl @@ -1,4 +1,5 @@ -using DelayEmbeddings: genembed, Dataset +using StateSpaceSets: Dataset +using DelayEmbeddings: genembed using StaticArrays: SVector using Entropies: encode_motif, decode_motif