diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml
index 547b8cd..c77fd7f 100644
--- a/.github/workflows/python-build.yml
+++ b/.github/workflows/python-build.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ 3.7, 3.8, 3.9, "3.10" ]
+        python-version: [ 3.8, 3.9, "3.10", "3.11", "3.12" ]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 8df739e..4118803 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -10,10 +10,10 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python 3.8
+    - name: Set up Python 3.10
       uses: actions/setup-python@v1
       with:
-        python-version: '3.8'
+        python-version: '3.10'
     - name: Install build dependencies
       run: |
         python -m pip install --upgrade pip
diff --git a/CITATION.cff b/CITATION.cff
index f376365..c2369e5 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -7,6 +7,6 @@ authors:
 - family-names: "Opolka"
   given-names: "Markus"
 title: "Pandas Association Measures"
-version: 0.2.6
-date-released: 2022-10-15
+version: 0.3.0
+date-released: 2024-06-27
 url: "https://github.com/fau-klue/pandas-association-measures"
\ No newline at end of file
diff --git a/Pipfile b/Pipfile
index 64afb22..3267331 100644
--- a/Pipfile
+++ b/Pipfile
@@ -4,14 +4,15 @@ url = "https://pypi.org/simple"
 verify_ssl = true
 
 [dev-packages]
-pytest = "==7.0.1"
-pylint = "==2.13.9"
-pytest-cov = "==3.0.0"
-twine = "==3.7.1"
-setuptools = "==59.6.0"
-cython = "==0.29.30"
+pytest = "==7.4.0"
+pylint = "==2.17.5"
+pytest-cov = "==4.1.0"
+twine = "==4.0.2"
+setuptools = "==68.0.0"
+cython = "==3.0.0"
 
 [packages]
-wheel = ">=0.37.1"
-pandas = ">=1.1.5"
-scipy = ">=1.5.4"
+wheel = ">=0.43.0,<0.44"
+pandas = ">=2.0,<3.0"
+numpy = ">=1.24,<2.0"
+scipy = ">=1.10.0"
diff --git a/README.md b/README.md
index d5d7773..1d6e22a 100644
--- a/README.md
+++ b/README.md
@@ -228,6 +228,29 @@ particularly        1.059386
 arrived             3.879126
 ```
 
+## Topographic Maps
+
+**New since version 0.3**: You can use `association_measures.grids.topography` to create a dataframe for visualising association measures in terms of topographic maps. It yields a logarithmically scaled grid from `N1` to `N2` with values of all association measures at reasonable sampling points for all combinations of `f1` and `f2`.
+```python3
+>>> from association_measures.grids import topography
+>>> topography(N1=10e6, N2=10e6)
+      O11 O12 O21 O22 R1 R2 C1 C2 N E11 ... dice log_ratio conservative_log_ratio mutual_information local_mutual_information ipm ipm_reference ipm_expected clr_normal log_ratio_hardie
+index ...
+0 0 10000000.0 0 10000000.0 10000000.0 10000000.0 0 20000000.0 20000000.0 0.0 ... 0.000000 0.000000 0.000000 inf NaN 0.0 0.0 0.00 0.000000 0.000000
+1 0 10000000.0 1 9999999.0 10000000.0 10000000.0 1 19999999.0 20000000.0 0.5 ... 0.000000 -9.967226 0.000000 -2.698970 0.000000 0.0 0.1 0.05 0.000000 -9.965784
+2 0 10000000.0 2 9999998.0 10000000.0 10000000.0 2 19999998.0 20000000.0 1.0 ... 0.000000 -10.966505 0.000000 -3.000000 0.000000 0.0 0.2 0.10 0.000000 -10.965784
+3 0 10000000.0 3 9999997.0 10000000.0 10000000.0 3 19999997.0 20000000.0 1.5 ... 0.000000 -11.551228 0.000000 -3.176091 -0.000000 0.0 0.3 0.15 0.000000 -11.550747
+4 0 10000000.0 4 9999996.0 10000000.0 10000000.0 4 19999996.0 20000000.0 2.0 ... 0.000000 -11.966145 0.000000 -3.301030 -0.000000 0.0 0.4 0.20 0.000000 -11.965784
+... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
+39995 10000000 0.0 7205937 2794063.0 10000000.0 10000000.0 17205937 2794063.0 20000000.0 8602968.5 ... 0.735134 0.472742 0.468813 0.065352 653516.672773 1000000.0 720593.7 860296.85 0.471159 0.472742
+39996 10000000 0.0 7821100 2178900.0 10000000.0 10000000.0 17821100 2178900.0 20000000.0 8910550.0 ... 0.718879 0.354557 0.350718 0.050095 500954.884892 1000000.0 782110.0 891055.00 0.353215 0.354557
+39997 10000000 0.0 8488779 1511221.0 10000000.0 10000000.0 18488779 1511221.0 20000000.0 9244389.5 ... 0.702031 0.236371 0.232619 0.034122 341217.643897 1000000.0 848877.9 924438.95 0.235298 0.236371
+39998 10000000 0.0 9213457 786543.0 10000000.0 10000000.0 19213457 786543.0 20000000.0 9606728.5 ... 0.684616 0.118186 0.114514 0.017424 174244.829132 1000000.0 921345.7 960672.85 0.117443 0.118186
+39999 10000000 0.0 10000000 0.0 10000000.0 10000000.0 20000000 0.0 20000000.0 10000000.0 ... 0.666667 0.000000 0.000000 0.000000 0.000000 1000000.0 1000000.0 1000000.00 0.000000 0.000000
+
+[40000 rows x 29 columns]
+```
+
 # Development
 
 The package is tested using pylint and pytest.
diff --git a/association_measures/grids.py b/association_measures/grids.py
new file mode 100644
index 0000000..bb49ab2
--- /dev/null
+++ b/association_measures/grids.py
@@ -0,0 +1,65 @@
+from itertools import product
+
+from numpy import exp, linspace, log
+from pandas import DataFrame
+
+from .measures import score
+
+
+def expand_grid(dictionary):
+    """Create a grid of all value combinations of all keys of the dictionary
+
+    """
+
+    return DataFrame([row for row in product(*dictionary.values())],
+                     columns=dictionary.keys())
+
+
+def log_seq(to=10e6, length=200, exact=50):
+    """Create a logarithmically scaled sequence
+
+    """
+
+    if length <= exact:
+        raise ValueError()
+
+    length = length - exact
+
+    return list(range(exact + 1)) + [int(exp(s)) for s in sorted([x for x in linspace(log(exact), log(to), length)])]
+
+
+def log_grid(N1=10e6, N2=10e6, length1=200, length2=200, exact1=50, exact2=50):
+    """Create a logarithmically-scaled grid
+
+    """
+    return expand_grid({
+        'f1': log_seq(N1, length1, exact=exact1),
+        'f2': log_seq(N2, length2, exact=exact2)
+    }).drop_duplicates().reset_index(drop=True)
+
+
+def topography(N1=10e6, N2=10e6, length=200, length1=None, length2=None, exact=50, exact1=None, exact2=None):
+    """Create a logarithmically scaled grid and calculate scores
+
+    """
+
+    exact1 = exact if exact1 is None else exact1
+    exact2 = exact if exact2 is None else exact2
+    length1 = length if length1 is None else length1
+    length2 = length if length2 is None else length2
+
+    # support
+    g = log_grid(N1=N1, N2=N2, length1=length1, length2=length2, exact1=exact1, exact2=exact2)
+
+    # add scores
+    scores = score(g, N1=N1, N2=N2)
+    # .. add alternative for CLR
+    scores['clr_normal'] = score(
+        g, N1=N1, N2=N2, boundary='normal', measures=['conservative_log_ratio']
+    )['conservative_log_ratio']
+    # .. add alternative for log-ratio
+    scores['log_ratio_hardie'] = score(
+        g, N1=N1, N2=N2, discounting='Hardie2014', measures=['log_ratio']
+    )['log_ratio']
+
+    return scores
diff --git a/association_measures/measures.py b/association_measures/measures.py
index 5beff2d..1392c32 100644
--- a/association_measures/measures.py
+++ b/association_measures/measures.py
@@ -3,15 +3,15 @@
 
 """
 
+from warnings import warn
+
 import numpy as np
 from pandas import concat, merge
-from scipy.stats import norm, beta
-from warnings import warn
+from scipy.stats import beta, norm
 
 from .binomial import choose
 from .frequencies import expected_frequencies, observed_frequencies
 
-
 CHOOSE = np.vectorize(choose)
 
 
@@ -20,7 +20,6 @@ def list_measures():
 
     :return: dictionary of measures
     :rtype: dict
-
     """
 
     return {
@@ -144,8 +143,7 @@ def calculate_measures(df, measures=None, freq=False, per_million=True, digits=N
 ###############################
 
 def z_score(df, **kwargs):
-    """
-    Calculate z-score
+    """Calculate z-score
 
     :param DataFrame df: DataFrame with columns O11 and E11
     :return: z-score
@@ -158,8 +156,7 @@ def z_score(df, **kwargs):
 
 
 def t_score(df, disc=.001, **kwargs):
-    """
-    Calculate t-score
+    """Calculate t-score
 
     :param DataFrame df: pd.DataFrame with columns O11 and E11
     :param float disc: discounting (or smoothing) parameter for O11 == 0
@@ -174,8 +171,7 @@ def t_score(df, disc=.001, **kwargs):
 
 
 def log_likelihood(df, signed=True, **kwargs):
-    """
-    Calculate log-likelihood
+    """Calculate log-likelihood
 
     :param DataFrame df: pd.DataFrame with columns O11..O22, E11..E22
     :param bool signed: return negative values for rows with O11 < E11?
@@ -204,8 +200,7 @@ def log_likelihood(df, signed=True, **kwargs):
 
 
 def simple_ll(df, signed=True, **kwargs):
-    """
-    Calculate simple log-likelihood
+    """Calculate simple log-likelihood
 
     :param DataFrame df: pd.DataFrame with columns O11, E11
     :param bool signed: return negative values for rows with O11 < E11?
@@ -213,8 +208,7 @@ def simple_ll(df, signed=True, **kwargs):
     :rtype: pd.Series
     """
 
-    # NB: discounting will not have any effect:
-    # term will be multiplied by original Oij = 0
+    # NB: discounting will not have any effect: term will be multiplied by original Oij = 0
     O11_disc = df['O11'].where(df['O11'] != 0, 1)
 
     log_term = df['O11'] * np.log(O11_disc / df['E11'])
@@ -260,8 +254,7 @@ def liddell(df, **kwargs):
 
 
 def dice(df, **kwargs):
-    """
-    Calculate Dice coefficient
+    """Calculate Dice coefficient
 
     :param DataFrame df: pd.DataFrame with columns O11, O12, O21
     :return: dice
@@ -301,8 +294,7 @@ def log_ratio(df, disc=.5, discounting='Walter1975', **kwargs):
 #######################
 
 def hypergeometric_likelihood(df, **kwargs):
-    """
-    Calculate hypergeometric-likelihood
+    """Calculate hypergeometric-likelihood
 
     :param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22
     :return: hypergeometric-likelihood
@@ -324,8 +316,7 @@ def hypergeometric_likelihood(df, **kwargs):
 
 
 def binomial_likelihood(df, **kwargs):
-    """
-    Calculate binomial-likelihood
+    """Calculate binomial-likelihood
 
     :param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22, E11, N
     :return: binomial-likelihood
@@ -350,13 +341,12 @@ def binomial_likelihood(df, **kwargs):
 # CONSERVATIVE ESTIMATES #
 ##########################
 
-def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
+def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='poisson',
                            correct='Bonferroni', vocab=None, one_sided=False,
                            **kwargs):
-    """
-    Calculate conservative log-ratio, i.e. the binary logarithm of the
+    """Calculate conservative log-ratio, i.e. the binary logarithm of the
     lower bound of the confidence interval of relative risk at the
-    (Bonferroni-corrected) confidence level.
+    (Bonferroni-corrected) significance level.
 
     :param DataFrame df: pd.DataFrame with columns O11, O12, O21, O22
     :param float disc: discounting (or smoothing) parameter for O11 == 0 and O21 == 0
@@ -368,7 +358,6 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
 
     :return: conservative log-ratio
     :rtype: pd.Series
-
     """
 
     # correction of alpha for two-sided tests
@@ -394,15 +383,12 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
 
     # Poisson approximation (Evert 2022)
     if boundary == 'poisson':
 
-        # only calculate where_lower
         lower = beta.ppf(alpha, df['O11'], df['O21'] + 1)
         lower_boundary = np.log2((df['R2'] / df['R1']) * lower / (1 - lower)).clip(lower=0)
 
-        # only calculate where_upper
        upper = beta.ppf(1 - alpha, df['O11'] + 1, df['O21'])
         upper_boundary = np.log2((df['R2'] / df['R1']) * upper / (1 - upper)).clip(upper=0)
 
-        # combine, set to 0 where (df['O11'] == 0) & (df['O12'] == 0)
         clrr = lower_boundary.where(
             (df['O11'] / df['R1']) >= (df['O21'] / df['R2']),
@@ -434,8 +420,7 @@ def conservative_log_ratio(df, disc=.5, alpha=.001, boundary='normal',
 ######################
 
 def mutual_information(df, disc=.001, **kwargs):
-    """
-    Calculate Mutual Information
+    """Calculate Mutual Information
 
     :param DataFrame df: pd.DataFrame with columns O11 and E11
     :param float disc: discounting (or smoothing) parameter for O11 == 0
@@ -450,16 +435,14 @@ def mutual_information(df, disc=.001, **kwargs):
 
 
 def local_mutual_information(df, **kwargs):
-    """
-    Calculate Local Mutual Information
+    """Calculate Local Mutual Information
 
     :param DataFrame df: pd.DataFrame with columns O11 and E11
-    :return: mutual information
+    :return: local mutual information
     :rtype: pd.Series
     """
 
-    # NB: discounting will not have any effect:
-    # term will be multiplied by original Oij = 0
+    # NB: discounting will not have any effect: term will be multiplied by original Oij = 0
     O11_disc = df['O11'].where(df['O11'] != 0, 1)
 
     am = df['O11'] * np.log10(O11_disc / df['E11'])
diff --git a/association_measures/version.py b/association_measures/version.py
index c55c3a9..bd4c933 100644
--- a/association_measures/version.py
+++ b/association_measures/version.py
@@ -2,5 +2,5 @@
 Association measures are mathematical formulae that interpret
 cooccurrence frequency data.
 """
-VERSION = (0, 2, 7)
+VERSION = (0, 3, 0)
 __version__ = '.'.join(map(str, VERSION))
diff --git a/performance.py b/performance.py
index 23d9ebc..2cd6be1 100644
--- a/performance.py
+++ b/performance.py
@@ -82,7 +82,7 @@
     # conservative estimates
     {
         'name': 'conservative_log_ratio',
-        'code': 'am.conservative_log_ratio(df)'
+        'code': 'am.conservative_log_ratio(df, boundary="normal")'
     },
     {
         'name': 'conservative_log_ratio_poisson',
diff --git a/setup.py b/setup.py
index 0305884..d3e449c 100644
--- a/setup.py
+++ b/setup.py
@@ -2,9 +2,7 @@
 import os
 import sys
 
-from setuptools import find_packages, Command
-from distutils.core import setup
-from distutils.extension import Extension
+from setuptools import find_packages, setup, Command, Extension
 
 # Package meta-data.
 NAME = 'association-measures'
@@ -13,7 +11,7 @@
 EMAIL = 'philipp.heinrich@fau.de'
 AUTHOR = 'Philipp Heinrich & Markus Opolka'
 
-REQUIRES_PYTHON = '>=3.6'
+REQUIRES_PYTHON = '>=3.8'
 REQUIRED = [
     'wheel',
     'pandas',
@@ -99,10 +97,11 @@ def run(self):
         'License :: OSI Approved :: MIT License',
         'Programming Language :: Python',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
+        'Programming Language :: Python :: 3.12',
         'Programming Language :: Cython',
     ],
 )
diff --git a/tests/test_frequencies.py b/tests/test_frequencies.py
index 7c1a25b..43725c1 100644
--- a/tests/test_frequencies.py
+++ b/tests/test_frequencies.py
@@ -30,15 +30,15 @@ def test_expected_frequencies(fixed_dataframe):
 
     df = fq.expected_frequencies(fixed_dataframe)
 
-    assert df['E11'][0] == 1.0
+    assert df['E11'].iloc[0] == 1.0
 
 
 def test_expected_frequencies_observed(fixed_dataframe):
 
     df = fq.expected_frequencies(fixed_dataframe, observed=True)
 
-    assert df['O11'][0] == 10
-    assert df['E11'][0] == 1.0
+    assert df['O11'].iloc[0] == 10
+    assert df['E11'].iloc[0] == 1.0
 
 
 def test_ucs(ucs_dataframe):
@@ -53,18 +53,18 @@ def test_ucs(ucs_dataframe):
 
     # check observed frequencies
     obs = fq.observed_frequencies(df)
-    assert(obs['O11'].equals(df['O11']))
-    assert(obs['O12'].equals(df['O12']))
-    assert(obs['O21'].equals(df['O21']))
-    assert(obs['O22'].equals(df['O22']))
+    assert obs['O11'].equals(df['O11'])
+    assert obs['O12'].equals(df['O12'])
+    assert obs['O21'].equals(df['O21'])
+    assert obs['O22'].equals(df['O22'])
 
     # check marginals
     R1 = df['O11'] + df['O12']
     R2 = df['O21'] + df['O22']
     C1 = df['O11'] + df['O21']
     C2 = df['O12'] + df['O22']
-    assert((R1 + R2).equals(df['N']))
-    assert((C1 + C2).equals(df['N']))
+    assert (R1 + R2).equals(df['N'])
+    assert (C1 + C2).equals(df['N'])
 
     # get expected frequencies
     df['E11'] = R1 * C1 / df['N']
@@ -74,7 +74,7 @@ def test_ucs(ucs_dataframe):
 
     # check expected frequencies
     exp = fq.expected_frequencies(df)
-    assert(exp['E11'].equals(df['E11']))
-    assert(exp['E12'].equals(df['E12']))
-    assert(exp['E21'].equals(df['E21']))
-    assert(exp['E22'].equals(df['E22']))
+    assert exp['E11'].equals(df['E11'])
+    assert exp['E12'].equals(df['E12'])
+    assert exp['E21'].equals(df['E21'])
+    assert exp['E22'].equals(df['E22'])
diff --git a/tests/test_grids.py b/tests/test_grids.py
new file mode 100644
index 0000000..04c17b2
--- /dev/null
+++ b/tests/test_grids.py
@@ -0,0 +1,6 @@
+from association_measures.grids import topography
+from pandas import DataFrame
+
+
+def test_map():
+    assert isinstance(topography(), DataFrame)
diff --git a/tests/test_measures.py b/tests/test_measures.py
index 81666ef..5688377 100644
--- a/tests/test_measures.py
+++ b/tests/test_measures.py
@@ -109,7 +109,7 @@ def test_dice_invalid(invalid_dataframe):
 def test_dice_zero(zero_dataframe):
     df = zero_dataframe
     df_ams = am.score(df, ['dice'])
-    df_ams['dice'][0] == 0.16831229174945742
+    assert df_ams['dice'].iloc[0] == 0.168312
 
 
 ##########
@@ -145,7 +145,7 @@ def test_t_score_invalid(invalid_dataframe):
 def test_t_score_zero(zero_dataframe):
     df = zero_dataframe
     df_ams = am.score(df, ['t_score'], disc=.5)
-    df_ams['t_score'][0] == 15.532438
+    assert df_ams['t_score'].iloc[0] == 15.532438
 
 
 ##########
@@ -173,7 +173,7 @@ def test_z_score_nan(invalid_dataframe):
 def test_z_score_zero(zero_dataframe):
     df = zero_dataframe
     df_ams = am.score(df, ['z_score'])
-    df_ams['z_score'].iloc[0] == 16.675431
+    assert df_ams['z_score'].iloc[0] == 16.675431
 
 
 #################
@@ -295,7 +295,7 @@ def test_binomial_likelihood_brown_overflow(brown_dataframe):
 def test_binomial_likelihood_zero(zero_dataframe):
     df = fq.expected_frequencies(zero_dataframe, observed=True)
     ams = am.binomial_likelihood(df)
-    assert isnan(ams[0])
+    assert isnan(ams.iloc[0])
 
 
 #############
@@ -307,7 +307,7 @@ def test_log_ratio(fixed_dataframe):
     df = fixed_dataframe
     df_ams = am.score(df, ['log_ratio'], disc=.5, discounting='Hardie2014')
 
-    assert df_ams['log_ratio'][0] == 7.491853
+    assert df_ams['log_ratio'].iloc[0] == 7.491853
 
 
 @pytest.mark.log_ratio
@@ -325,7 +325,7 @@ def test_log_ratio_zero(zero_dataframe):
     df = zero_dataframe
     df_ams = am.score(df, ['log_ratio'], disc=.5, discounting='Hardie2014')
 
-    assert df_ams['log_ratio'][0] == 12.03645
+    assert df_ams['log_ratio'].iloc[0] == 12.03645
 
 
 ##########################
@@ -370,7 +370,7 @@ def test_conservative_log_ratio_one_sided(fixed_dataframe):
 
     df = fq.expected_frequencies(fixed_dataframe, observed=True)
     df_ams = am.score(df, ['conservative_log_ratio'], boundary='normal')
-    df_am = am.conservative_log_ratio(df, one_sided=True)
+    df_am = am.conservative_log_ratio(df, one_sided=True, boundary='normal')
     df_am.name = 'clr_one_sided'
     df_ams = df_ams.join(df_am)
     assert (abs(df_ams['conservative_log_ratio']) <= abs(df_ams['clr_one_sided'])).all()
@@ -408,7 +408,7 @@ def test_liddell(fixed_dataframe):
     df = fixed_dataframe
     df_ams = am.score(df, ['liddell'])
 
-    assert df_ams['liddell'][0] == 1
+    assert df_ams['liddell'].iloc[0] == 1
 
 
 @pytest.mark.liddell
@@ -416,7 +416,7 @@ def test_liddell_zero(zero_dataframe):
     df = fq.expected_frequencies(zero_dataframe, observed=True)
     df_ams = am.score(df, ['liddell'])
 
-    assert df_ams['liddell'][0] == 0.143858
+    assert df_ams['liddell'].iloc[0] == 0.143858
 
 
 ########
@@ -516,4 +516,4 @@ def test_calculate_measures(zero_dataframe):
     df = zero_dataframe
     with pytest.deprecated_call():
         df_ams = am.calculate_measures(df, ['dice'])
-        df_ams['dice'][0] == 0.16831229174945742
+        df_ams['dice'].iloc[0] == 0.16831229174945742
diff --git a/tests/test_readme.py b/tests/test_readme.py
index 51d69f8..9062af5 100644
--- a/tests/test_readme.py
+++ b/tests/test_readme.py
@@ -1,5 +1,6 @@
 import association_measures.frequencies as fq
 import association_measures.measures as am
+from association_measures.grids import topography
 
 
 def test_frequencies(ucs_dataframe):
@@ -56,3 +57,8 @@ def test_measures(ucs_dataframe):
 
     df_ams = am.score(df, measures=['log_likelihood'], signed=False, freq=False)
     print(df_ams.head())
+
+
+def test_topography():
+
+    print(topography())
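Usage note (illustrative sketch, not part of the diff): the snippet below drives the new `association_measures.grids.topography()` API on a coarser grid than the README example, so it runs quickly. The parameter names (`N1`, `N2`, `length`, `exact`) and the column names (`O11`, `O21`, `log_ratio`, `conservative_log_ratio`, `clr_normal`) are taken from the added code and output above; the commented-out matplotlib part is an assumption and not a dependency of this package.

```python
from association_measures.grids import topography

# coarser grid than the defaults (length=200, exact=50), so it computes faster
grid = topography(N1=1e6, N2=1e6, length=100, exact=25)

# O11/O21 hold the sampled frequencies f1/f2; inspect a few of the returned measures
print(grid[['O11', 'O21', 'log_ratio', 'conservative_log_ratio', 'clr_normal']].describe())

# optional visualisation (assumes matplotlib is installed):
# import matplotlib.pyplot as plt
# sc = plt.scatter(grid['O11'], grid['O21'], c=grid['log_ratio'], s=2)
# plt.xscale('symlog'); plt.yscale('symlog')
# plt.xlabel('f1 (O11)'); plt.ylabel('f2 (O21)')
# plt.colorbar(sc, label='log_ratio')
# plt.show()
```

The `clr_normal` and `log_ratio_hardie` columns are only added by `topography()` on top of `score()`; elsewhere in the diff the default `boundary` of `conservative_log_ratio` changes from `'normal'` to `'poisson'`, which is why `performance.py` and the one-sided test now pass `boundary='normal'` explicitly.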