Skip to content

Commit

Permalink
Merge pull request #272 from micdavis/graph_continuous_distribution_a…
Browse files Browse the repository at this point in the history
…ttribute_properties_reformat

Graph Profile Structure Reformatting
  • Loading branch information
ssharpe42 authored Sep 29, 2022
2 parents cb8678d + 07675ee commit 0961953
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 47 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,12 @@ pip install -e .
Test/Lint Dependencies

```bash
$ pip install pandas pytest pytest-cov flake8
pip install -r requirements-test.txt
```

To run tests:
```bash
$ make test_local
make test_local
```

### Referencing this library
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
numpy>=1.22.0rc1
scikit-learn==1.0.2
scipy==1.8.0
dataprofiler==0.7.10
dataprofiler==0.8.0
14 changes: 7 additions & 7 deletions synthetic_data/graph_synthetic_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,22 +67,22 @@ def synthesize(self):
def sample_continuous(self, attribute, num_sample=1):
    """Sample from the best-fit continuous distribution of an attribute.

    The distribution name and its fitted parameters are read from
    ``self._continuous_distributions[attribute]``; the parameters live under
    ``["properties"]["best_fit_properties"]`` and are ordered as
    ``[*shape_parameters, loc, scale]``.

    Args:
        attribute: key into ``self._continuous_distributions``.
        num_sample: number of values to draw (default 1).

    Returns:
        The result of ``rvs(size=num_sample)`` on the frozen scipy
        distribution.

    Raises:
        ValueError: if the stored distribution name is not supported.
    """
    profile = self._continuous_distributions[attribute]
    name = profile["name"]
    best_fit_properties = profile["properties"]["best_fit_properties"]

    # Supported distributions mapped to (scipy factory, number of leading
    # shape parameters that precede loc and scale).
    supported = {
        "norm": (st.norm, 0),
        "logistic": (st.logistic, 0),
        "lognorm": (st.lognorm, 1),
        "expon": (st.expon, 0),
        "uniform": (st.uniform, 0),
        "gamma": (st.gamma, 1),
    }
    if name not in supported:
        raise ValueError(
            "Unsupported continuous distribution '{}' for attribute '{}'".format(
                name, attribute
            )
        )

    factory, num_shapes = supported[name]
    # Shape parameters are passed positionally because their keyword differs
    # per distribution (lognorm uses `s`, gamma uses `a`).
    shapes = [best_fit_properties[i] for i in range(num_shapes)]
    distribution = factory(
        *shapes,
        loc=best_fit_properties[num_shapes],
        scale=best_fit_properties[num_shapes + 1]
    )
    return distribution.rvs(size=num_sample)
Expand Down
49 changes: 13 additions & 36 deletions tests/test_graph_synthetic.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
from __future__ import print_function
from cmath import exp

import matplotlib.pyplot as plt
import os
import unittest
import random

import networkx as nx
import numpy as np
import scipy.stats as st

Expand All @@ -31,8 +28,14 @@ def setUpClass(cls):
"pop": None,
"edge_weight": {
"name": "norm",
"properties": [2, 0.5, 0.5, 1, 2, 2]
},
"properties": {
"best_fit_properties": [2, 0.5, 0.5],
"mean": [1., 4.2, 1],
"variance": [8.5, 305.7, 0],
"skew": [4.9, 82.4, 0.5],
"kurtosis": [7.2, 117436.2, 0.6],
}
},
},
categorical_distribution={
"pop": {
Expand Down Expand Up @@ -63,49 +66,23 @@ def test_sample_categorical(self):
np.random.seed(1)
attribute = self.synthetic_graph._categorical_attributes[0]
self.assertEqual(4, self.synthetic_graph.sample_categorical(attribute))
def test_categorical_histogram(self):
    """Check that repeated categorical sampling reproduces the recorded bin counts.

    Draws 2000 samples for the first categorical attribute after seeding
    NumPy's global RNG, bins them, and compares the counts with the stored
    expected values. No figures are drawn, so the test can run headless.
    """
    np.random.seed(2)
    attribute = self.synthetic_graph._categorical_attributes[0]
    # One draw per call, in order, so the seeded sequence of calls is the
    # same as the one that produced the recorded counts.
    data = [
        self.synthetic_graph.sample_categorical(attribute) for _ in range(2000)
    ]

    bin_edges = [1.0, 1.75, 2.5, 3.25, 4.25, 5.25, 6.25, 7.25, 8]
    hist, _ = np.histogram(data, bins=bin_edges, density=False)
    self.assertEqual(list(hist), [24, 45, 374, 379, 201, 81, 72, 824])

def test_continuous_properties(self):
    """Check that continuous samples agree with the profiled best-fit parameters.

    Draws 2000 samples for the first continuous attribute and compares the
    sample size, mean and standard deviation with the ``best_fit_properties``
    stored in the expected profile.
    """
    np.random.seed(5)
    attribute = self.synthetic_graph._continuous_attributes[0]
    num_sample = 2000
    data = np.asarray(
        self.synthetic_graph.sample_continuous(attribute, num_sample)
    )
    properties = self.expected_profile["continuous_distribution"][attribute]["properties"]
    best_fit_properties = properties["best_fit_properties"]

    self.assertEqual(num_sample, data.size)
    # NOTE(review): assumes the fixture attribute is a "norm" distribution, so
    # best_fit_properties[0] / [1] are its mean / standard deviation -- confirm
    # against setUpClass if the fixture changes.
    self.assertAlmostEqual(best_fit_properties[0], np.mean(data), delta=0.1)
    self.assertAlmostEqual(best_fit_properties[1], np.std(data), delta=0.1)

if __name__ == "__main__":
unittest.main()
4 changes: 3 additions & 1 deletion tests/test_marginal_dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
)
from scipy import stats

np.random.seed(0)

def test_marginal_dist_detection():

np.random.seed(0)
data = datasets.load_iris(as_frame=True).frame

profile_options = dp.ProfilerOptions()
Expand Down Expand Up @@ -51,6 +51,7 @@ def test_marginal_dist_detection():

def test_discrete_dist_detection():

np.random.seed(0)
data = {
"randint": stats.randint.rvs(0, 5, size=1000),
"randint_nonzero_min": stats.randint.rvs(2, 7, size=1000),
Expand Down Expand Up @@ -79,6 +80,7 @@ def test_discrete_dist_detection():

def test_continuous_dist_detection():

np.random.seed(0)
data = {
"uniform": stats.uniform.rvs(size=1000),
"normal": stats.norm.rvs(size=1000),
Expand Down

0 comments on commit 0961953

Please sign in to comment.