Merge pull request #6 from dataiku/enhancement/1.0.1

Minor enhancements
dataiku · Oct 9, 2020 · 3f4a71b · 3f4a71b
2 parents e1c0b17 + 2e6858d
commit 3f4a71b
Show file tree

Hide file tree

Showing 19 changed files with 330 additions and 114 deletions.
diff --git a/.gitignore b/.gitignore
@@ -52,6 +52,7 @@ htmlcov/
 .cache
 nosetests.xml
 coverage.xml
+unit.xml
 *.cover
 *.py,cover
 .hypothesis/
@@ -91,14 +92,14 @@ ipython_config.py
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
-# .python-version
+.python-version
 
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 #   install all needed dependencies.
-#Pipfile.lock
+Pipfile.lock
 
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/
@@ -139,3 +140,92 @@ dmypy.json
 
 # pytype static type analyzer
 .pytype/
+
+# mac stuff
+.DS_Store
+
+# History files
+.Rhistory
+.Rapp.history
+
+# Session Data files
+.RData
+
+# User-specific files
+.Ruserdata
+
+# Example code in package build process
+*-Ex.R
+
+# Output files from R CMD build
+/*.tar.gz
+
+# Output files from R CMD check
+/*.Rcheck/
+
+# RStudio files
+.Rproj.user/
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
+.httr-oauth
+
+# knitr and R markdown default cache directories
+*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# R Environment Variables
+.Renviron
+
+# pkgdown site
+docs/
+
+# translation temp files
+po/*~
+
+# Compiled class file
+*.class
+
+# Log file
+*.log
+
+# BlueJ files
+*.ctxt
+
+# Mobile Tools for Java (J2ME)
+.mtj.tmp/
+
+# Package Files #
+*.jar
+*.war
+*.nar
+*.ear
+*.zip
+*.tar.gz
+*.rar
+
+# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
+hs_err_pid*
+
+.gradle
+**/build/
+!src/**/build/
+
+# Ignore Gradle GUI config
+gradle-app.setting
+
+# Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
+!gradle-wrapper.jar
+
+# Cache of project
+.gradletasknamecache
+
+# # Work around https://youtrack.jetbrains.com/issue/IDEA-116898
+# gradle/wrapper/gradle-wrapper.properties
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
@@ -14,22 +14,22 @@ appearance, race, religion, or sexual identity and orientation.
 Examples of behavior that contributes to creating a positive environment
 include:
 
-* Using welcoming and inclusive language
-* Being respectful of differing viewpoints and experiences
-* Gracefully accepting constructive criticism
-* Focusing on what is best for the community
-* Showing empathy towards other community members
+- Using welcoming and inclusive language
+- Being respectful of differing viewpoints and experiences
+- Gracefully accepting constructive criticism
+- Focusing on what is best for the community
+- Showing empathy towards other community members
 
 Examples of unacceptable behavior by participants include:
 
-* The use of sexualized language or imagery and unwelcome sexual attention or
- advances
-* Trolling, insulting/derogatory comments, and personal or political attacks
-* Public or private harassment
-* Publishing others' private information, such as a physical or electronic
- address, without explicit permission
-* Other conduct which could reasonably be considered inappropriate in a
- professional setting
+- The use of sexualized language or imagery and unwelcome sexual attention or
+  advances
+- Trolling, insulting/derogatory comments, and personal or political attacks
+- Public or private harassment
+- Publishing others' private information, such as a physical or electronic
+  address, without explicit permission
+- Other conduct which could reasonably be considered inappropriate in a
+  professional setting
 
 ## Our Responsibilities
 

diff --git a/LICENSE b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [yyyy] [name of copyright owner]
+   Copyright 2020 Dataiku
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

diff --git a/Makefile b/Makefile
@@ -1,7 +1,4 @@
-# Public variable to be set by the user in the Makefile
-TARGET_DSS_VERSION=8.0
-
-# evaluate additional variable
+# Makefile variables set automatically
 plugin_id=`cat plugin.json | python -c "import sys, json; print(str(json.load(sys.stdin)['id']).replace('/',''))"`
 plugin_version=`cat plugin.json | python -c "import sys, json; print(str(json.load(sys.stdin)['version']).replace('/',''))"`
 archive_file_name="dss-plugin-${plugin_id}-${plugin_version}.zip"
@@ -22,14 +19,19 @@ plugin:
 
 unit-tests:
 	@echo "[START] Running unit tests..."
+# Fail fast when the local python3 is not one of the interpreters accepted by
+# the plugin code-env. The inline Python prints the literal string "True" or
+# "False", so the result must be compared as a string: `[ ! False ]` tests a
+# non-empty string and would never trigger the exit.
+# NOTE(review): `cut -c 1,2` keeps only the first two digits, so Python 3.10+
+# is read as "31" - confirm before adding such interpreters to desc.json.
+	@( \
+		PYTHON_VERSION=`python3 -V 2>&1 | sed 's/[^0-9]*//g' | cut -c 1,2`; \
+		PYTHON_VERSION_IS_CORRECT=`cat code-env/python/desc.json | python3 -c "import sys, json; print(str($$PYTHON_VERSION) in [x[-2:] for x in json.load(sys.stdin)['acceptedPythonInterpreters']]);"`; \
+		if [ "$$PYTHON_VERSION_IS_CORRECT" != "True" ]; then echo "Python version $$PYTHON_VERSION is not in acceptedPythonInterpreters"; exit 1; fi; \
+	)
 	@( \
 		python3 -m venv env/; \
 		source env/bin/activate; \
 		pip3 install --upgrade pip; \
 		pip install --no-cache-dir -r tests/python/requirements.txt; \
 		pip install --no-cache-dir -r code-env/python/spec/requirements.txt; \
 		export PYTHONPATH="$(PYTHONPATH):$(PWD)/python-lib"; \
-		pytest -o junit_family=xunit2 --junitxml=unit.xml tests/python/unit || true; \
+                pytest -o junit_family=xunit2 --junitxml=unit.xml tests/python/unit || true; \
 		deactivate; \
 	)
 	@echo "[SUCCESS] Running unit tests: Done!"

diff --git a/README.md b/README.md
@@ -1,9 +1,11 @@
 # Amazon Comprehend Medical Plugin
+
 ![GitHub release (latest by date)](https://img.shields.io/github/v/release/dataiku/dss-plugin-amazon-comprehend-nlp-medical) ![Build status](https://img.shields.io/badge/build-passing-brightgreen) ![Support level](https://img.shields.io/badge/support-Tier%202-yellowgreen)
 
 This Dataiku DSS plugin provides several recipes to call the [Amazon Comprehend Medical APIs](https://aws.amazon.com/comprehend/medical/).
 
 Documentation: https://www.dataiku.com/product/plugins/amazon-comprehend-nlp-medical/
 
-### Licence
-This plugin is distributed under the Apache License version 2.0
+### License
+
+This plugin is distributed under the Apache License version 2.0.
diff --git a/code-env/python/spec/requirements.txt b/code-env/python/spec/requirements.txt
@@ -1,5 +1,5 @@
-boto3==1.13.13
-tqdm==4.46.0
+boto3==1.15.14
+tqdm==4.50.1
 ratelimit==2.2.1
 retry==0.9.2
-more_itertools==8.4.0
+more-itertools==8.5.0
diff --git a/custom-recipes/amazon-comprehend-nlp-medical-entity-recognition/recipe.json b/custom-recipes/amazon-comprehend-nlp-medical-entity-recognition/recipe.json
@@ -68,23 +68,10 @@
             "parameterSetId": "api-configuration",
             "mandatory": true
         },
-        {
-            "name": "separator_advanced",
-            "label": "Advanced",
-            "type": "SEPARATOR"
-        },
-        {
-            "name": "expert",
-            "label": "Expert mode",
-            "type": "BOOLEAN",
-            "defaultValue": false
-        },
         {
             "name": "entity_types",
             "label": "Entity types",
             "type": "MULTISELECT",
-            "visibilityCondition": "model.expert == true",
-            "description": "List of medical entity types to extract",
             "mandatory": true,
             "selectChoices": [
                 {
@@ -120,11 +107,22 @@
                 "TIME_EXPRESSION"
             ]
         },
+        {
+            "name": "separator_advanced",
+            "label": "Advanced",
+            "type": "SEPARATOR"
+        },
+        {
+            "name": "expert",
+            "label": "Expert mode",
+            "type": "BOOLEAN",
+            "defaultValue": false
+        },
         {
             "name": "minimum_score",
             "label": "Minimum score",
             "description": "Minimum confidence score (from 0 to 1) for the medical entity to be recognized",
-            "visibilityCondition": "model.expert == true",
+            "visibilityCondition": "model.expert",
             "type": "DOUBLE",
             "mandatory": true,
             "defaultValue": 0,
@@ -135,7 +133,7 @@
             "name": "error_handling",
             "label": "Error handling",
             "type": "SELECT",
-            "visibilityCondition": "model.expert == true",
+            "visibilityCondition": "model.expert",
             "selectChoices": [
                 {
                     "value": "FAIL",

diff --git a/custom-recipes/amazon-comprehend-nlp-medical-entity-recognition/recipe.py b/custom-recipes/amazon-comprehend-nlp-medical-entity-recognition/recipe.py
@@ -8,7 +8,8 @@
 import dataiku
 from dataiku.customrecipe import get_recipe_config, get_input_names_for_role, get_output_names_for_role
 
-from plugin_io_utils import ErrorHandlingEnum, validate_column_input, set_column_description
+from plugin_io_utils import ErrorHandlingEnum, validate_column_input
+from dku_io_utils import set_column_description
 from amazon_comprehend_medical_api_formatting import MedicalEntityTypeEnum, MedicalEntityAPIFormatter
 from amazon_comprehend_medical_api_client import API_EXCEPTIONS, get_client
 from api_parallelizer import api_parallelizer
@@ -38,7 +39,7 @@
 output_dataset_name = get_output_names_for_role("output_dataset")[0]
 output_dataset = dataiku.Dataset(output_dataset_name)
 
-input_df = input_dataset.get_dataframe()
+input_df = input_dataset.get_dataframe(infer_with_pandas=False)
 client = get_client(api_configuration_preset)
 column_prefix = "medical_entity_api"
 

diff --git a/custom-recipes/amazon-comprehend-nlp-medical-protected-health-information/recipe.json b/custom-recipes/amazon-comprehend-nlp-medical-protected-health-information/recipe.json
@@ -83,7 +83,7 @@
             "name": "minimum_score",
             "label": "Minimum score",
             "description": "Minimum confidence score (from 0 to 1) for the PHI to be extracted",
-            "visibilityCondition": "model.expert == true",
+            "visibilityCondition": "model.expert",
             "type": "DOUBLE",
             "mandatory": true,
             "defaultValue": 0,
@@ -94,7 +94,7 @@
             "name": "error_handling",
             "label": "Error handling",
             "type": "SELECT",
-            "visibilityCondition": "model.expert == true",
+            "visibilityCondition": "model.expert",
             "selectChoices": [
                 {
                     "value": "FAIL",

diff --git a/custom-recipes/amazon-comprehend-nlp-medical-protected-health-information/recipe.py b/custom-recipes/amazon-comprehend-nlp-medical-protected-health-information/recipe.py
@@ -8,7 +8,8 @@
 import dataiku
 from dataiku.customrecipe import get_recipe_config, get_input_names_for_role, get_output_names_for_role
 
-from plugin_io_utils import ErrorHandlingEnum, validate_column_input, set_column_description
+from plugin_io_utils import ErrorHandlingEnum, validate_column_input
+from dku_io_utils import set_column_description
 from amazon_comprehend_medical_api_formatting import MedicalPhiAPIFormatter
 from amazon_comprehend_medical_api_client import API_EXCEPTIONS, get_client
 from api_parallelizer import api_parallelizer
@@ -37,7 +38,7 @@
 output_dataset_name = get_output_names_for_role("output_dataset")[0]
 output_dataset = dataiku.Dataset(output_dataset_name)
 
-input_df = input_dataset.get_dataframe()
+input_df = input_dataset.get_dataframe(infer_with_pandas=False)
 client = get_client(api_configuration_preset)
 column_prefix = "medical_phi_api"
 

diff --git a/parameter-sets/api-configuration/parameter-set.json b/parameter-sets/api-configuration/parameter-set.json
@@ -75,4 +75,4 @@
             "maxI": 100
         }
     ]
-}
+}
diff --git a/plugin.json b/plugin.json
@@ -1,6 +1,6 @@
 {
     "id": "amazon-comprehend-nlp-medical",
-    "version": "1.0.0",
+    "version": "1.0.1",
     "meta": {
         "label": "Amazon Comprehend Medical",
         "category": "Natural Language Processing",

diff --git a/python-lib/amazon_comprehend_medical_api_client.py b/python-lib/amazon_comprehend_medical_api_client.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+"""Module with utility functions to call the Amazon Comprehend Medical API"""
+
 import logging
 
 import boto3

diff --git a/python-lib/amazon_comprehend_medical_api_formatting.py b/python-lib/amazon_comprehend_medical_api_formatting.py
@@ -1,4 +1,6 @@
 # -*- coding: utf-8 -*-
+"""Module with classes to format results from the Amazon Comprehend Medical API"""
+
 import logging
 from typing import AnyStr, Dict, List
 from enum import Enum
@@ -174,14 +176,15 @@ def format_row(self, row: Dict) -> Dict:
         if len(discarded_entities) != 0:
             logging.info("Discarding {} entities below the minimum score threshold".format(len(discarded_entities)))
         for entity_enum in MedicalEntityTypeEnum:
-            entity_type_column = generate_unique(
-                "entity_type_" + str(entity_enum.value).lower() + "_text", row.keys(), self.column_prefix,
-            )
-            row[entity_type_column] = [
-                e.get("Text", "")
-                for e in entities
-                if e.get("Category", "") == entity_enum.name and float(e.get("Score", 0)) >= self.minimum_score
-            ]
-            if len(row[entity_type_column]) == 0:
-                row[entity_type_column] = ""
+            if entity_enum in self.entity_types:
+                entity_type_column = generate_unique(
+                    "entity_type_" + str(entity_enum.value).lower() + "_text", row.keys(), self.column_prefix,
+                )
+                row[entity_type_column] = [
+                    e.get("Text", "")
+                    for e in entities
+                    if e.get("Category", "") == entity_enum.name and float(e.get("Score", 0)) >= self.minimum_score
+                ]
+                if len(row[entity_type_column]) == 0:
+                    row[entity_type_column] = ""
         return row
-Original file line number
+Diff line change
@@ Expand Up / @@ -75,4 +75,4 @@ @@
                 "maxI": 100
             }
         ]
-    }
+    }