From ecf79005d2a9b1f5f54111373e63bd247853836c Mon Sep 17 00:00:00 2001
From: Dato
Date: Wed, 13 Apr 2016 09:53:57 +0300
Subject: [PATCH 1/4] Added how-to find mode value in SArray

---
Review notes (free text in this section is ignored by `git am`):
- With single_mode=True, mode_sa() returns the value of whichever row
  argmax() selects from the grouped counts. The "Returns 2" example below
  presumably relies on the groupby output order - TODO confirm.

 README.md      |  1 +
 sarray_mode.py | 27 +++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)
 create mode 100644 sarray_mode.py

diff --git a/README.md b/README.md
index 2f4cc27..ed359d7 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ Tabular Data Transformation
 * [Parse a datetime column into its components (year, month, etc.)](split_datetime_column.py)
 * [Convert a column of datetime strings into UNIX timestamps](convert_column_to_timestamp.py)
 * [Expand an SFrame column of type list/dict into multiple columns](sframe_unpack.py)
+* [Find the mode of an SArray](sarray_mode.py)
 
 Graph Data Transformation
 ---------------------------
diff --git a/sarray_mode.py b/sarray_mode.py
new file mode 100644
index 0000000..d1450f6
--- /dev/null
+++ b/sarray_mode.py
@@ -0,0 +1,27 @@
+import graphlab as gl
+
+def mode_sa(sa, single_mode=True):
+    """Return a mode of sa, or all modes if there are several.
+    
+    single_mode: whether to return a single mode or an SArray of all modes (default: True)."""
+    
+    sf = gl.SFrame({"value": sa})
+    sf2 = sf.groupby("value", {"count": gl.aggregate.COUNT()})
+    max_count_index = sf2["count"].argmax()
+    
+    if single_mode:
+        return sf2[max_count_index]["value"]
+    
+    else:
+        max_count = sf2[max_count_index]["count"]
+        return sf2[sf2["count"] == max_count]["value"]
+
+# Create an SArray with two modes (most-common elements: 2 and 3)
+# sa = gl.SArray([1, 2, 2, 3, 3])
+
+# Find one of the modes
+# print mode_sa(sa) # Returns 2
+
+# Find all modes
+# print mode_sa(sa, single_mode=False) # Returns SArray with [2, 3]
+

From f2c0ac3b9472c91d70089809d0b2b4377ffa2a2d Mon Sep 17 00:00:00 2001
From: Guy Rapaport
Date: Thu, 14 Apr 2016 09:00:05 +0300
Subject: [PATCH 2/4] make this runnable, although without printouts

---
 sarray_mode.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sarray_mode.py b/sarray_mode.py
index d1450f6..79c243b 100644
--- a/sarray_mode.py
+++ b/sarray_mode.py
@@ -16,12 +16,16 @@ def mode_sa(sa, single_mode=True):
         max_count = sf2[max_count_index]["count"]
         return sf2[sf2["count"] == max_count]["value"]
+
 
 # Create an SArray with two modes (most-common elements: 2 and 3)
-# sa = gl.SArray([1, 2, 2, 3, 3])
+sa = gl.SArray([1, 2, 2, 3, 3])
 
 # Find one of the modes
-# print mode_sa(sa) # Returns 2
+single_mode = mode_sa(sa) # returns 2
 
 # Find all modes
-# print mode_sa(sa, single_mode=False) # Returns SArray with [2, 3]
-
+all_modes = mode_sa(sa, single_mode=False)
+# Returns
+# dtype: int
+# Rows: 2
+# [2, 3]

From 1b5992ddee6ea7384dc686173da6ed93f417829c Mon Sep 17 00:00:00 2001
From: Guy Rapaport
Date: Thu, 14 Apr 2016 09:10:10 +0300
Subject: [PATCH 3/4] Added sketch examples per @znation 's comment

---
Review notes (free text in this section is ignored by `git am`):
- sketch_modes() evaluates max(frequent_items_sketch.itervalues()) inside the
  comprehension's condition, i.e. once per item; the maximum could be
  computed once before the comprehension.
- The docstring promises "an SArray of all modes", but with
  single_mode=False the function returns a plain Python list.
- iteritems()/itervalues() are Python 2-only iterator methods (this assumes
  frequent_items() returns a dict - TODO confirm).

 sarray_mode.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/sarray_mode.py b/sarray_mode.py
index 79c243b..a4aba5d 100644
--- a/sarray_mode.py
+++ b/sarray_mode.py
@@ -29,3 +29,22 @@ def mode_sa(sa, single_mode=True):
 # dtype: int
 # Rows: 2
 # [2, 3]
+
+
+# A faster (albeit maybe less accurate) way to find the mode value is using sa.sketch_summary().frequent_items() .
+# There are two caveats to this approach:
+# 1. won't work for very low-frequency mode values, and
+# 2. won't necessarily give the correct result if there are multiple likely candidates.
+
+def sketch_modes(sa, single_mode=True):
+    """Fast (albeit less accurate) way to find the mode value(s) of SArray sa.
+    
+    single_mode: whether to return a single mode or an SArray of all modes (default: True)."""
+    
+    frequent_items_sketch = sa.sketch_summary().frequent_items()
+    modes_sketch = [k for (k, v) in frequent_items_sketch.iteritems()
+                    if v == max(frequent_items_sketch.itervalues())]
+    return modes_sketch[0] if single_mode else modes_sketch
+
+sketch_modes(sa) # returns 2
+sketch_modes(sa, single_mode=False) # returns [2, 3]

From d32decada762880bb1e570f7075e6121b9f56ab4 Mon Sep 17 00:00:00 2001
From: Guy Rapaport
Date: Thu, 14 Apr 2016 09:17:08 +0300
Subject: [PATCH 4/4] Add warning about empty SArrays

---
 sarray_mode.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/sarray_mode.py b/sarray_mode.py
index a4aba5d..e0df485 100644
--- a/sarray_mode.py
+++ b/sarray_mode.py
@@ -4,7 +4,10 @@ def mode_sa(sa, single_mode=True):
     """Return a mode of sa, or all modes if there are several.
     
     single_mode: whether to return a single mode or an SArray of all modes (default: True)."""
-    
+
+    if len(sa) == 0:
+        raise ValueError("Can't find mode(s) in empty SArray")
+
     sf = gl.SFrame({"value": sa})
     sf2 = sf.groupby("value", {"count": gl.aggregate.COUNT()})
     max_count_index = sf2["count"].argmax()
@@ -36,15 +39,31 @@ def mode_sa(sa, single_mode=True):
 # 1. won't work for very low-frequency mode values, and
 # 2. won't necessarily give the correct result if there are multiple likely candidates.
 
-def sketch_modes(sa, single_mode=True):
+def sketch_mode_sa(sa, single_mode=True):
     """Fast (albeit less accurate) way to find the mode value(s) of SArray sa.
     
     single_mode: whether to return a single mode or an SArray of all modes (default: True)."""
-    
+
+    if len(sa) == 0:
+        raise ValueError("Can't find mode(s) in empty SArray")
+
     frequent_items_sketch = sa.sketch_summary().frequent_items()
     modes_sketch = [k for (k, v) in frequent_items_sketch.iteritems()
                     if v == max(frequent_items_sketch.itervalues())]
     return modes_sketch[0] if single_mode else modes_sketch
 
-sketch_modes(sa) # returns 2
-sketch_modes(sa, single_mode=False) # returns [2, 3]
+sketch_mode_sa(sa) # returns 2
+sketch_mode_sa(sa, single_mode=False) # returns [2, 3]
+
+
+# Both approaches should handle empty SArrays.
+# The implementations above will simply raise a ValueError if `sa` is empty.
+try:
+    mode_sa(gl.SArray([]))
+except ValueError:
+    pass
+
+try:
+    sketch_mode_sa(gl.SArray([]))
+except ValueError:
+    pass