Merge pull request #272 from micdavis/graph_continuous_distribution_attribute_properties_reformat

Graph Profile Structure Reformatting
capitalone · Sep 29, 2022 · 0961953 · 0961953
2 parents cb8678d + 07675ee
commit 0961953
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -42,12 +42,12 @@ pip install -e .
 Test/Lint Dependencies
 
 ```bash
-$ pip install pandas pytest pytest-cov flake8
+pip install -r requirements-test.txt
 ```
 
 To run tests:
 ```bash
-$ make test_local
+make test_local
 ```
 
 ### Referencing this library

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
 numpy>=1.22.0rc1
 scikit-learn==1.0.2
 scipy==1.8.0
-dataprofiler==0.7.10
+dataprofiler==0.8.0
diff --git a/synthetic_data/graph_synthetic_data.py b/synthetic_data/graph_synthetic_data.py
@@ -67,22 +67,22 @@ def synthesize(self):
     def sample_continuous(self, attribute, num_sample=1):
         """ Sample continuous distributions. """
         name = self._continuous_distributions[attribute]["name"]
-        properties = self._continuous_distributions[attribute]["properties"]
+        best_fit_properties = self._continuous_distributions[attribute]["properties"]["best_fit_properties"]
         distribution = None
         sample = 0
 
         if name == "norm":
-            distribution = st.norm(loc=properties[0], scale=properties[1])
+            distribution = st.norm(loc=best_fit_properties[0], scale=best_fit_properties[1])
         if name == "logistic":
-            distribution = st.logistic(loc=properties[0], scale=properties[1])
+            distribution = st.logistic(loc=best_fit_properties[0], scale=best_fit_properties[1])
         if name == "lognorm":
-            distribution = st.lognorm(properties[0], loc=properties[1], scale=properties[2])
+            distribution = st.lognorm(s=best_fit_properties[0], loc=best_fit_properties[1], scale=best_fit_properties[2])  # lognorm's shape keyword is `s`, not `a`
         if name == "expon":
-            distribution = st.expon(loc=properties[0], scale=properties[1])
+            distribution = st.expon(loc=best_fit_properties[0], scale=best_fit_properties[1])
         if name == "uniform":
-            distribution = st.uniform(loc=properties[0], scale=properties[1])
+            distribution = st.uniform(loc=best_fit_properties[0], scale=best_fit_properties[1])
         if name == "gamma":
-            distribution = st.gamma(properties[0], loc=properties[1], scale=properties[2])
+            distribution = st.gamma(a=best_fit_properties[0], loc=best_fit_properties[1], scale=best_fit_properties[2])
 
         sample = distribution.rvs(size=num_sample)
         return sample

diff --git a/tests/test_graph_synthetic.py b/tests/test_graph_synthetic.py
@@ -1,12 +1,9 @@
 from __future__ import print_function
-from cmath import exp
 
-import matplotlib.pyplot as plt
 import os
 import unittest
 import random
 
-import networkx as nx
 import numpy as np
 import scipy.stats as st
 
@@ -31,8 +28,14 @@ def setUpClass(cls):
                 "pop": None,
                 "edge_weight": {
                     "name": "norm",
-                    "properties": [2, 0.5, 0.5, 1, 2, 2]
-                    },
+                    "properties": {
+                        "best_fit_properties": [2, 0.5, 0.5],
+                        "mean": [1., 4.2, 1],
+                        "variance": [8.5, 305.7, 0],
+                        "skew": [4.9, 82.4, 0.5],
+                        "kurtosis": [7.2, 117436.2, 0.6],
+                    }
+                },
             },
             categorical_distribution={
                 "pop": {
@@ -63,49 +66,23 @@ def test_sample_categorical(self):
         np.random.seed(1)
         attribute = self.synthetic_graph._categorical_attributes[0]
         self.assertEqual(4, self.synthetic_graph.sample_categorical(attribute))
-    
-    def test_plot_sample_categorical(self):
+
+    def test_categorical_histogram(self):
         np.random.seed(2)
         attribute = self.synthetic_graph._categorical_attributes[0]
         data = []
         for n in range(0, 2000):
             data.append(self.synthetic_graph.sample_categorical(attribute))
-        
+
         hist, edges = np.histogram(data, bins=[1.0, 1.75, 2.5, 3.25, 4.25, 5.25, 6.25, 7.25, 8], density=False)
         self.assertEqual(list(hist), [24, 45, 374, 379, 201, 81, 72, 824])
 
-        # plots
-        expected_hist = self.expected_profile["categorical_distribution"]["pop"]["bin_counts"]
-        hist = hist/np.max(hist)
-        expected_hist = expected_hist/np.max(expected_hist)
-
-        num_bin = 8
-        bin_lims = np.linspace(0,1,num_bin+1)
-        bin_centers = 0.5*(bin_lims[:-1]+bin_lims[1:])
-        bin_widths = bin_lims[1:]-bin_lims[:-1]
-
-        fig, (ax1,ax2) = plt.subplots(nrows = 1, ncols = 2)
-        ax1.bar(bin_centers, hist, width = bin_widths, align = 'center')
-        ax2.bar(bin_centers, expected_hist, width = bin_widths, align = 'center', alpha = 0.5)
-        ax1.set_title('amplitude-normalized expected distribution')
-        ax2.set_title('amplitude-normalized computed distribution')
-        plt.show()
-
-    def test_plot_sample_continuous(self):
+    def test_continuous_properties(self):
         np.random.seed(5)
         attribute = self.synthetic_graph._continuous_attributes[0]
         data = self.synthetic_graph.sample_continuous(attribute, 2000)
         properties = self.expected_profile["continuous_distribution"][attribute]["properties"]
-        distribution_continuous_test = st.norm(loc=properties[0], scale=properties[1])
-
-        # plot
-        fig, ax1 = plt.subplots()
-        ax1.hist(list(data), bins=100)
-        pts = np.linspace(-3, 4)
-        ax2 = ax1.twinx()
-        ax2.set_ylim(0, 1)
-        ax2.plot(pts, distribution_continuous_test.pdf(pts), color='red')
-        plt.show()
+        self.assertAlmostEqual(np.mean(data), properties["best_fit_properties"][0], delta=0.1)  # sample mean should track the fitted loc
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_marginal_dist.py b/tests/test_marginal_dist.py
@@ -9,10 +9,10 @@
 )
 from scipy import stats
 
-np.random.seed(0)
 
 def test_marginal_dist_detection():
 
+    np.random.seed(0)
     data = datasets.load_iris(as_frame=True).frame
 
     profile_options = dp.ProfilerOptions()
@@ -51,6 +51,7 @@ def test_marginal_dist_detection():
 
 def test_discrete_dist_detection():
 
+    np.random.seed(0)
     data = {
         "randint": stats.randint.rvs(0, 5, size=1000),
         "randint_nonzero_min": stats.randint.rvs(2, 7, size=1000), 
@@ -79,6 +80,7 @@ def test_discrete_dist_detection():
 
 def test_continuous_dist_detection():
 
+    np.random.seed(0)
     data = {
         "uniform": stats.uniform.rvs(size=1000),
         "normal": stats.norm.rvs(size=1000),