From 38386e6aed8a58b3dfb523f373264c173c4213ac Mon Sep 17 00:00:00 2001 From: Ayush Sharma Date: Fri, 6 Sep 2024 17:43:28 +0530 Subject: [PATCH 1/4] Added feature store and pyspark notebooks Feature store notebook is giving error due to pydantic library. Pyspark processing notebook is working fine and I have updated the code according to sagemaker_core SDK --- sagemaker-core/sm-feature_store_sm_core.ipynb | 1276 +++++++++++++++++ sagemaker-core/spark-processing-sm-core.ipynb | 1059 ++++++++++++++ 2 files changed, 2335 insertions(+) create mode 100644 sagemaker-core/sm-feature_store_sm_core.ipynb create mode 100644 sagemaker-core/spark-processing-sm-core.ipynb diff --git a/sagemaker-core/sm-feature_store_sm_core.ipynb b/sagemaker-core/sm-feature_store_sm_core.ipynb new file mode 100644 index 0000000000..376a72bf8f --- /dev/null +++ b/sagemaker-core/sm-feature_store_sm_core.ipynb @@ -0,0 +1,1276 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Amazon SageMaker Feature Store: Introduction to Feature Store" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "This notebook's CI test result for us-west-2 is as follows. CI test results in other regions can be found at the end of the notebook. \n", + "\n", + "![This us-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-2/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates how to get started with Feature Store, create feature groups, and ingest data into them. These feature groups are stored in your Feature Store.\n", + "\n", + "Feature groups are resources that contain metadata for all data stored in your Feature Store. A feature group is a logical grouping of features, defined in the feature store to describe records. A feature group’s definition is composed of a list of feature definitions, a record identifier name, and configurations for its online and offline store. \n", + "\n", + "### Overview\n", + "1. Set up\n", + "2. Creating a feature group\n", + "3. Ingest data into a feature group\n", + "\n", + "### Prerequisites\n", + "This notebook uses sagemaker_core SDK and `Python 3 (Data Science)` kernel. This notebook works with Studio, Jupyter, and JupyterLab. 
\n", + "\n", + "#### Library dependencies:\n", + "* `sagemaker_core`\n", + "* `numpy`\n", + "* `pandas`\n", + "\n", + "#### Role requirements:\n", + "**IMPORTANT**: You must attach the following policies to your execution role:\n", + "* `AmazonS3FullAccess`\n", + "* `AmazonSageMakerFeatureStoreAccess`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![policy](images/feature-store-policy.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing installation: pydantic 2.9.0\n", + "Uninstalling pydantic-2.9.0:\n", + " Successfully uninstalled pydantic-2.9.0\n", + "Found existing installation: sagemaker-core 1.0.3\n", + "Uninstalling sagemaker-core-1.0.3:\n", + " Successfully uninstalled sagemaker-core-1.0.3\n", + "\u001b[33mWARNING: Skipping sagemaker-core as it is not installed.\u001b[0m\u001b[33m\n", + "\u001b[0mCollecting sagemaker-core\n", + " Using cached sagemaker_core-1.0.3-py3-none-any.whl.metadata (4.9 kB)\n", + "Requirement already satisfied: boto3<2.0.0,>=1.34.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (1.35.8)\n", + "Collecting pydantic<3.0.0,>=1.7.0 (from sagemaker-core)\n", + " Using cached pydantic-2.9.0-py3-none-any.whl.metadata (146 kB)\n", + "Requirement already satisfied: PyYAML<7.0,>=6.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (6.0.1)\n", + "Requirement already satisfied: jsonschema<5.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.17.3)\n", + "Requirement already satisfied: platformdirs<5.0.0,>=4.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.2.2)\n", + "Requirement already satisfied: rich<14.0.0,>=13.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (13.7.1)\n", + "Requirement already satisfied: mock<5.0,>4.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.0.3)\n", + "Requirement already satisfied: importlib-metadata<7.0,>=1.4.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (6.10.0)\n", + "Requirement already satisfied: botocore<1.36.0,>=1.35.8 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (1.35.8)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (0.10.2)\n", + "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.10/site-packages (from importlib-metadata<7.0,>=1.4.0->sagemaker-core) (3.19.2)\n", + "Requirement already satisfied: attrs>=17.4.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema<5.0.0->sagemaker-core) (23.2.0)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema<5.0.0->sagemaker-core) (0.20.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.23.2 in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (2.23.2)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /opt/conda/lib/python3.10/site-packages (from 
pydantic<3.0.0,>=1.7.0->sagemaker-core) (4.12.2)\n", + "Requirement already satisfied: tzdata in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (2024.1)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/conda/lib/python3.10/site-packages (from rich<14.0.0,>=13.0.0->sagemaker-core) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich<14.0.0,>=13.0.0->sagemaker-core) (2.18.0)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.10/site-packages (from botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (2.9.0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /opt/conda/lib/python3.10/site-packages (from botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (1.26.19)\n", + "Requirement already satisfied: mdurl~=0.1 in /opt/conda/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.0.0->sagemaker-core) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (1.16.0)\n", + "Using cached sagemaker_core-1.0.3-py3-none-any.whl (377 kB)\n", + "Using cached pydantic-2.9.0-py3-none-any.whl (434 kB)\n", + "Installing collected packages: pydantic, sagemaker-core\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "gluonts 0.13.7 requires pydantic~=1.7, but you have pydantic 2.9.0 which is incompatible.\n", + "langchain-community 0.2.11 requires langchain<0.3.0,>=0.2.12, but you have langchain 0.2.5 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed pydantic-2.9.0 sagemaker-core-1.0.3\n" + ] + } + ], + "source": [ + "!pip uninstall pydantic sagemaker-core -y\n", + "!pip install pip --upgrade --quiet\n", + "!pip uninstall sagemaker-core -y\n", + "!pip install sagemaker-core --upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip uninstall pydantic sagemaker-core -y\n", + "!pip install --upgrade pip -q\n", + "!pip install pydantic==2.7.0\n", + "!pip install sagemaker-core" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "pydantic==2.9.0\n", + "pydantic_core==2.23.2\n" + ] + } + ], + "source": [ + "!pip freeze | grep pydantic" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import io\n", + "from sagemaker_core.helper.session_helper import get_execution_role, Session\n", + "sagemaker_session = Session()\n", + "region = \"us-east-1\"\n", + "REGION_NAME = region if region else SM_SESSION._region_name\n", + "role = get_execution_role()\n", + "s3_bucket_name = sagemaker_session.default_bucket()\n", + "prefix = \"sagemaker-featurestore-introduction\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Set up" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Inspect your data\n", + "In this notebook example we 
ingest synthetic data. We read from `./data/feature_store_introduction_customer.csv` and `./data/feature_store_introduction_orders.csv`." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "customer_data = pd.read_csv(\"data/feature_store_introduction_customer.csv\")\n", + "orders_data = pd.read_csv(\"data/feature_store_introduction_orders.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idcity_codestate_codecountry_code
05732911492
11093822402
28284003312
3124013452
\n", + "
" + ], + "text/plain": [ + " customer_id city_code state_code country_code\n", + "0 573291 1 49 2\n", + "1 109382 2 40 2\n", + "2 828400 3 31 2\n", + "3 124013 4 5 2" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customer_data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idorder_idorder_statusstore_id
057329141321303
110938257240201
282840019420431
312401367821213
\n", + "
" + ], + "text/plain": [ + " customer_id order_id order_status store_id\n", + "0 573291 4132 1 303\n", + "1 109382 5724 0 201\n", + "2 828400 1942 0 431\n", + "3 124013 6782 1 213" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "orders_data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below is an illustration on the steps the data goes through before it is ingested into a Feature Store. In this notebook, we illustrate the use-case where you have data from multiple sources and want to store them independently in a feature store. Our example considers data from a data warehouse (customer data), and data from a real-time streaming service (order data). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a feature group\n", + "\n", + "We first start by creating feature group names for customer_data and orders_data. Following this, we create two Feature Groups, one for `customer_data` and another for `orders_data`" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "from time import gmtime, strftime, sleep\n", + "\n", + "customers_feature_group_name = \"customers-feature-group-\" + strftime(\"%d-%H-%M-%S\", gmtime())\n", + "orders_feature_group_name = \"orders-feature-group-\" + strftime(\"%d-%H-%M-%S\", gmtime())" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'customers-feature-group-06-11-54-09'" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customers_feature_group_name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Instantiate a FeatureGroup object for customers_data and orders_data. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from sagemaker_core.shapes import FeatureDefinition\n", + "\n", + "CustomerFeatureDefinitions =[FeatureDefinition(feature_name='customer_id',feature_type='Integral'),\n", + " FeatureDefinition(feature_name='city_code',feature_type='Integral'),\n", + " FeatureDefinition(feature_name='state_code',feature_type='Integral'),\n", + " FeatureDefinition(feature_name='country_code',feature_type='Integral'),\n", + " FeatureDefinition(feature_name='EventTime',feature_type='Fractional')]\n", + "\n", + "OrderFeatureDefinitions = [FeatureDefinition(feature_name='customer_id',feature_type='Integral'),\n", + " FeatureDefinition(feature_name='order_id', feature_type='Integral'),\n", + " FeatureDefinition(feature_name='order_status', feature_type='Integral'),\n", + " FeatureDefinition(feature_name='store_id', feature_type='Integral'),\n", + " FeatureDefinition(feature_name='EventTime', feature_type='Fractional')]" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'customers-feature-group-06-11-54-09'" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customers_feature_group_name" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "current_time_sec = int(round(time.time()))\n", + "\n", + "record_identifier_feature_name = \"customer_id\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Append `EventTime` feature to your data frame. This parameter is required, and time stamps each data point." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "customer_data[\"EventTime\"] = pd.Series([current_time_sec] * len(customer_data), dtype=\"float64\")\n", + "orders_data[\"EventTime\"] = pd.Series([current_time_sec] * len(orders_data), dtype=\"float64\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load feature definitions to your feature group. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we call create to create two feature groups, customers_feature_group and orders_feature_group respectively" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:10                                                                                   \n",
+       "                                                                                                  \n",
+       "    7 record_identifier_feature_name= record_identifier_feature_name,                         \n",
+       "    8 event_time_feature_name=\"EventTime\",                                                    \n",
+       "    9 role_arn=role,                                                                          \n",
+       " 10 online_store_config = OnlineStoreConfig(enable_store_config=True),                      \n",
+       "   11 feature_definitions = CustomerFeatureDefinitions                                        \n",
+       "   12 #offline_store_config = OfflineStoreConfig(s3_storage_config = S3StorageConfig(s3_ur    \n",
+       "   13 )                                                                                           \n",
+       "                                                                                                  \n",
+       " /opt/conda/lib/python3.10/site-packages/pydantic/main.py:175 in __init__                         \n",
+       "                                                                                                  \n",
+       "    172 │   │   \"\"\"                                                                               \n",
+       "    173 │   │   # `__tracebackhide__` tells pytest and some other tools to omit this function fr  \n",
+       "    174 │   │   __tracebackhide__ = True                                                          \n",
+       "  175 │   │   self.__pydantic_validator__.validate_python(data, self_instance=self)             \n",
+       "    176                                                                                       \n",
+       "    177 # The following line sets a flag that we use to determine when `__init__` gets overr  \n",
+       "    178 __init__.__pydantic_base_init__ = True  # pyright: ignore[reportFunctionMemberAccess  \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "ValidationError: 1 validation error for OnlineStoreConfig\n",
+       "enable_store_config\n",
+       "  Extra inputs are not permitted [type=extra_forbidden, input_value=True, input_type=bool]\n",
+       "    For further information visit https://errors.pydantic.dev/2.7/v/extra_forbidden\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m10\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 7 \u001b[0m\u001b[2m│ \u001b[0mrecord_identifier_feature_name= record_identifier_feature_name, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 8 \u001b[0m\u001b[2m│ \u001b[0mevent_time_feature_name=\u001b[33m\"\u001b[0m\u001b[33mEventTime\u001b[0m\u001b[33m\"\u001b[0m, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 9 \u001b[0m\u001b[2m│ \u001b[0mrole_arn=role, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m10 \u001b[2m│ \u001b[0monline_store_config = OnlineStoreConfig(enable_store_config=\u001b[94mTrue\u001b[0m), \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m11 \u001b[0m\u001b[2m│ \u001b[0mfeature_definitions = CustomerFeatureDefinitions \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m12 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m#offline_store_config = OfflineStoreConfig(s3_storage_config = S3StorageConfig(s3_ur\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m13 \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/opt/conda/lib/python3.10/site-packages/pydantic/\u001b[0m\u001b[1;33mmain.py\u001b[0m:\u001b[94m175\u001b[0m in \u001b[92m__init__\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 172 \u001b[0m\u001b[2;33m│ │ \u001b[0m\u001b[33m\"\"\"\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 173 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# `__tracebackhide__` tells pytest and some other tools to omit this function fr\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 174 \u001b[0m\u001b[2m│ │ \u001b[0m__tracebackhide__ = \u001b[94mTrue\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 175 \u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.__pydantic_validator__.validate_python(data, self_instance=\u001b[96mself\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 176 \u001b[0m\u001b[2m│ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 177 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# The following line sets a flag that we use to determine when `__init__` gets overr\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 178 \u001b[0m\u001b[2m│ \u001b[0m\u001b[92m__init__\u001b[0m.__pydantic_base_init__ = \u001b[94mTrue\u001b[0m \u001b[2m# pyright: ignore[reportFunctionMemberAccess\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + 
"\u001b[1;91mValidationError: \u001b[0m\u001b[1;36m1\u001b[0m validation error for OnlineStoreConfig\n", + "enable_store_config\n", + " Extra inputs are not permitted \u001b[1m[\u001b[0m\u001b[38;2;215;175;0mtype\u001b[0m=\u001b[38;2;225;0;225mextra_forbidden\u001b[0m, \u001b[38;2;215;175;0minput_value\u001b[0m=\u001b[3;38;2;0;135;0mTrue\u001b[0m, \u001b[38;2;215;175;0minput_type\u001b[0m=\u001b[38;2;225;0;225mbool\u001b[0m\u001b[1m]\u001b[0m\n", + " For further information visit \u001b[4;38;2;0;105;255mhttps://errors.pydantic.dev/2.7/v/extra_forbidden\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sagemaker_core.shapes import OnlineStoreConfig,OfflineStoreConfig,S3StorageConfig\n", + "from sagemaker_core.resources import FeatureGroup\n", + "\n", + "FeatureGroup.create(\n", + " feature_group_name= customers_feature_group_name,\n", + " #s3_uri=f\"s3://{s3_bucket_name}/{prefix}\",\n", + " record_identifier_feature_name= record_identifier_feature_name,\n", + " event_time_feature_name=\"EventTime\",\n", + " role_arn=role,\n", + " online_store_config = OnlineStoreConfig(enable_store_config=True),\n", + " feature_definitions = CustomerFeatureDefinitions\n", + " #offline_store_config = OfflineStoreConfig(s3_storage_config = S3StorageConfig(s3_uri = f\"s3://{s3_bucket_name}/{prefix}\"))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:7                                                                                    \n",
+       "                                                                                                  \n",
+       "    4 record_identifier_feature_name= record_identifier_feature_name,                         \n",
+       "    5 event_time_feature_name=\"EventTime\",                                                    \n",
+       "    6 role_arn=role,                                                                          \n",
+       "  7 online_store_config= OnlineStoreConfig(enable_store_config=True),                       \n",
+       "    8 feature_definitions = OrderFeatureDefinitions,                                          \n",
+       "    9 offline_store_config =  OfflineStoreConfig(s3_storage_config = S3StorageConfig(s3_ur    \n",
+       "   10 )                                                                                           \n",
+       "                                                                                                  \n",
+       " /opt/conda/lib/python3.10/site-packages/pydantic/main.py:175 in __init__                         \n",
+       "                                                                                                  \n",
+       "    172 │   │   \"\"\"                                                                               \n",
+       "    173 │   │   # `__tracebackhide__` tells pytest and some other tools to omit this function fr  \n",
+       "    174 │   │   __tracebackhide__ = True                                                          \n",
+       "  175 │   │   self.__pydantic_validator__.validate_python(data, self_instance=self)             \n",
+       "    176                                                                                       \n",
+       "    177 # The following line sets a flag that we use to determine when `__init__` gets overr  \n",
+       "    178 __init__.__pydantic_base_init__ = True  # pyright: ignore[reportFunctionMemberAccess  \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "ValidationError: 1 validation error for OnlineStoreConfig\n",
+       "enable_store_config\n",
+       "  Extra inputs are not permitted [type=extra_forbidden, input_value=True, input_type=bool]\n",
+       "    For further information visit https://errors.pydantic.dev/2.7/v/extra_forbidden\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m7\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 4 \u001b[0m\u001b[2m│ \u001b[0mrecord_identifier_feature_name= record_identifier_feature_name, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 5 \u001b[0m\u001b[2m│ \u001b[0mevent_time_feature_name=\u001b[33m\"\u001b[0m\u001b[33mEventTime\u001b[0m\u001b[33m\"\u001b[0m, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 6 \u001b[0m\u001b[2m│ \u001b[0mrole_arn=role, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 7 \u001b[2m│ \u001b[0monline_store_config= OnlineStoreConfig(enable_store_config=\u001b[94mTrue\u001b[0m), \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 8 \u001b[0m\u001b[2m│ \u001b[0mfeature_definitions = OrderFeatureDefinitions, \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 9 \u001b[0m\u001b[2m│ \u001b[0moffline_store_config = OfflineStoreConfig(s3_storage_config = S3StorageConfig(s3_ur \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m10 \u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/opt/conda/lib/python3.10/site-packages/pydantic/\u001b[0m\u001b[1;33mmain.py\u001b[0m:\u001b[94m175\u001b[0m in \u001b[92m__init__\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 172 \u001b[0m\u001b[2;33m│ │ \u001b[0m\u001b[33m\"\"\"\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 173 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[2m# `__tracebackhide__` tells pytest and some other tools to omit this function fr\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 174 \u001b[0m\u001b[2m│ │ \u001b[0m__tracebackhide__ = \u001b[94mTrue\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 175 \u001b[2m│ │ \u001b[0m\u001b[96mself\u001b[0m.__pydantic_validator__.validate_python(data, self_instance=\u001b[96mself\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 176 \u001b[0m\u001b[2m│ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 177 \u001b[0m\u001b[2m│ \u001b[0m\u001b[2m# The following line sets a flag that we use to determine when `__init__` gets overr\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 178 \u001b[0m\u001b[2m│ \u001b[0m\u001b[92m__init__\u001b[0m.__pydantic_base_init__ = \u001b[94mTrue\u001b[0m \u001b[2m# pyright: ignore[reportFunctionMemberAccess\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mValidationError: 
\u001b[0m\u001b[1;36m1\u001b[0m validation error for OnlineStoreConfig\n", + "enable_store_config\n", + " Extra inputs are not permitted \u001b[1m[\u001b[0m\u001b[38;2;215;175;0mtype\u001b[0m=\u001b[38;2;225;0;225mextra_forbidden\u001b[0m, \u001b[38;2;215;175;0minput_value\u001b[0m=\u001b[3;38;2;0;135;0mTrue\u001b[0m, \u001b[38;2;215;175;0minput_type\u001b[0m=\u001b[38;2;225;0;225mbool\u001b[0m\u001b[1m]\u001b[0m\n", + " For further information visit \u001b[4;38;2;0;105;255mhttps://errors.pydantic.dev/2.7/v/extra_forbidden\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sagemaker_core.resources import FeatureGroup\n", + "FeatureGroup.create(\n", + " feature_group_name= orders_feature_group_name,\n", + " record_identifier_feature_name= record_identifier_feature_name,\n", + " event_time_feature_name=\"EventTime\",\n", + " role_arn=role,\n", + " online_store_config= OnlineStoreConfig(enable_store_config=True),\n", + " feature_definitions = OrderFeatureDefinitions,\n", + " offline_store_config = OfflineStoreConfig(s3_storage_config = S3StorageConfig(s3_uri = f\"s3://{s3_bucket_name}/{prefix}\"))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To confirm that your FeatureGroup has been created we use `DescribeFeatureGroup` and `ListFeatureGroups` APIs to display the created FeatureGroup." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customers_feature_group.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "orders_feature_group.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sagemaker_session.boto_session.client(\n", + " \"sagemaker\", region_name=region\n", + ").list_feature_groups() # We use the boto client to list FeatureGroups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def check_feature_group_status(feature_group):\n", + " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", + " while status == \"Creating\":\n", + " print(\"Waiting for Feature Group to be Created\")\n", + " time.sleep(5)\n", + " status = feature_group.describe().get(\"FeatureGroupStatus\")\n", + " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n", + "\n", + "\n", + "check_feature_group_status(customers_feature_group)\n", + "check_feature_group_status(orders_feature_group)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add metadata to a feature\n", + "\n", + "We can add searchable metadata fields to FeatureGroup features by using the `UpdateFeatureMetadata` API. The currently supported metadata fields are `description` and `parameters`." + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker_core.resources import FeatureMetadata\n", + "from sagemaker_core.shapes import FeatureParameter\n", + "\n", + "customers_feature_group.update_feature_metadata(\n", + " feature_name=\"customer_id\",\n", + " description=\"The ID of a customer. 
It is also used in orders_feature_group.\",\n", + " parameter_additions=[FeatureParameter(\"idType\", \"primaryKey\")],\n", + ")\n", + "\n", + "customers_feature_metadata = FeatureMetadata(feature_group_name = customers_feature_group_name, feature_name = \"customer_id\",\n", + " description= 'The ID of a customer. It is also used in orders_feature_group.',\n", + " parameters = [FeatureParameter(key=\"idType\", value=\"primaryKey\")]\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To view feature metadata, we can use `get` method to display that feature." + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "customers_feature_metadata_3= FeatureMetadata.get(feature_group_name = customers_feature_group_name, feature_name = \"customer_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
[09/03/24 17:07:08] INFO     INFO:sagemaker_core.main.resources:Updating feature_metadata         resources.py:9635\n",
+       "                             resource.                                                                             \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/03/24 17:07:08]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Updating feature_metadata \u001b]8;id=883813;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=983103;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#9635\u001b\\\u001b[2m9635\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "FeatureMetadata(feature_group_name='customers-feature-group-03-16-59-32', feature_name='customer_id', feature_group_arn='arn:aws:sagemaker:us-east-1:774297356213:feature-group/customers-feature-group-03-16-59-32', feature_type='Integral', creation_time=datetime.datetime(2024, 9, 3, 17, 0, 51, 605000, tzinfo=tzlocal()), last_modified_time=datetime.datetime(2024, 9, 3, 17, 7, 8, 194000, tzinfo=tzlocal()), description='This is a test 3', parameters=[FeatureParameter(key='idType', value='primaryKey')])" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "customers_feature_metadata_3.update(description=\"This is a test 3\",parameter_additions=[FeatureParameter(key=\"idType\", value=\"primaryKey\")])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Feature metadata fields are searchable. We use `search` API to find features with metadata that matches some search criteria." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Results': [{'FeatureMetadata': {'FeatureGroupArn': 'arn:aws:sagemaker:us-east-1:774297356213:feature-group/customers-feature-group-30-03-11-20',\n", + " 'FeatureGroupName': 'customers-feature-group-30-03-11-20',\n", + " 'FeatureName': 'customer_id',\n", + " 'FeatureType': 'Integral',\n", + " 'CreationTime': datetime.datetime(2024, 8, 30, 3, 11, 22, tzinfo=tzlocal()),\n", + " 'LastModifiedTime': datetime.datetime(2024, 8, 30, 3, 11, 56, tzinfo=tzlocal()),\n", + " 'Description': 'The ID of a customer. It is also used in orders_feature_group.',\n", + " 'Parameters': [{'Key': 'idType', 'Value': 'primaryKey'}]}}],\n", + " 'ResponseMetadata': {'RequestId': '2bdda8bf-fdaa-4218-9884-e96d747e936a',\n", + " 'HTTPStatusCode': 200,\n", + " 'HTTPHeaders': {'x-amzn-requestid': '2bdda8bf-fdaa-4218-9884-e96d747e936a',\n", + " 'content-type': 'application/x-amz-json-1.1',\n", + " 'content-length': '486',\n", + " 'date': 'Fri, 30 Aug 2024 03:40:40 GMT'},\n", + " 'RetryAttempts': 0}}" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sagemaker_session.boto_session.client(\"sagemaker\", region_name=region).search(\n", + " Resource=\"FeatureMetadata\",\n", + " SearchExpression={\n", + " \"Filters\": [\n", + " {\n", + " \"Name\": \"FeatureGroupName\",\n", + " \"Operator\": \"Contains\",\n", + " \"Value\": \"customers-feature-group-\",\n", + " },\n", + " {\"Name\": \"Parameters.idType\", \"Operator\": \"Equals\", \"Value\": \"primaryKey\"},\n", + " ]\n", + " },\n", + ") # We use the boto client to search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ingest data into a feature group\n", + "\n", + "We can put data into the FeatureGroup by using the `PutRecord` API. 
It will take < 1 minute to ingest data." + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮\n",
+       " in <module>:1                                                                                    \n",
+       "                                                                                                  \n",
+       " 1 customers_feature_group.ingest(data_frame=customer_data, max_workers=3, wait=True)           \n",
+       "   2                                                                                              \n",
+       "                                                                                                  \n",
+       " /opt/conda/lib/python3.10/site-packages/pydantic/main.py:828 in __getattr__                      \n",
+       "                                                                                                  \n",
+       "    825 │   │   │   │   │   │   return super().__getattribute__(item)  # Raises AttributeError i  \n",
+       "    826 │   │   │   │   │   else:                                                                 \n",
+       "    827 │   │   │   │   │   │   # this is the current error                                       \n",
+       "  828 │   │   │   │   │   │   raise AttributeError(f'{type(self).__name__!r} object has no att  \n",
+       "    829 │   │                                                                                     \n",
+       "    830 │   │   def __setattr__(self, name: str, value: Any) -> None:                             \n",
+       "    831 │   │   │   if name in self.__class_vars__:                                               \n",
+       "╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
+       "AttributeError: 'FeatureGroup' object has no attribute 'ingest'\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[38;2;255;0;0m╭─\u001b[0m\u001b[38;2;255;0;0m──────────────────────────────\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[1;38;2;255;0;0mTraceback \u001b[0m\u001b[1;2;38;2;255;0;0m(most recent call last)\u001b[0m\u001b[38;2;255;0;0m \u001b[0m\u001b[38;2;255;0;0m───────────────────────────────\u001b[0m\u001b[38;2;255;0;0m─╮\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m in \u001b[92m\u001b[0m:\u001b[94m1\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m1 customers_feature_group.ingest(data_frame=customer_data, max_workers=\u001b[94m3\u001b[0m, wait=\u001b[94mTrue\u001b[0m) \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m2 \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2;33m/opt/conda/lib/python3.10/site-packages/pydantic/\u001b[0m\u001b[1;33mmain.py\u001b[0m:\u001b[94m828\u001b[0m in \u001b[92m__getattr__\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 825 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mreturn\u001b[0m \u001b[96msuper\u001b[0m().\u001b[92m__getattribute__\u001b[0m(item) \u001b[2m# Raises AttributeError i\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 826 \u001b[0m\u001b[2m│ │ │ │ │ \u001b[0m\u001b[94melse\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 827 \u001b[0m\u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[2m# this is the current error\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[31m❱ \u001b[0m 828 \u001b[2m│ │ │ │ │ │ \u001b[0m\u001b[94mraise\u001b[0m \u001b[96mAttributeError\u001b[0m(\u001b[33mf\u001b[0m\u001b[33m'\u001b[0m\u001b[33m{\u001b[0m\u001b[96mtype\u001b[0m(\u001b[96mself\u001b[0m).\u001b[91m__name__\u001b[0m\u001b[33m!r}\u001b[0m\u001b[33m object has no att\u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 829 \u001b[0m\u001b[2m│ │ \u001b[0m \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 830 \u001b[0m\u001b[2m│ │ \u001b[0m\u001b[94mdef\u001b[0m \u001b[92m__setattr__\u001b[0m(\u001b[96mself\u001b[0m, name: \u001b[96mstr\u001b[0m, value: Any) -> \u001b[94mNone\u001b[0m: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m│\u001b[0m \u001b[2m 831 \u001b[0m\u001b[2m│ │ │ \u001b[0m\u001b[94mif\u001b[0m name \u001b[95min\u001b[0m \u001b[96mself\u001b[0m.__class_vars__: \u001b[38;2;255;0;0m│\u001b[0m\n", + "\u001b[38;2;255;0;0m╰──────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n", + "\u001b[1;91mAttributeError: \u001b[0m\u001b[38;2;0;135;0m'FeatureGroup'\u001b[0m object has no attribute \u001b[38;2;0;135;0m'ingest'\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "customers_feature_group.ingest(data_frame=customer_data, max_workers=3, wait=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "orders_feature_group.ingest(data_frame=orders_data, max_workers=3, wait=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Using an arbitrary customer record ID, 573291 we use `get_record` to 
check that the data has been ingested into the feature group." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customer_id = 573291\n", + "sample_record = sagemaker_session.boto_session.client(\n", + " \"sagemaker-featurestore-runtime\", region_name=region\n", + ").get_record(\n", + " FeatureGroupName=customers_feature_group_name, RecordIdentifierValueAsString=str(customer_id)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sample_record" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use `batch_get_record` to check that all data has been ingested into two feature groups by providing customer IDs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_records = sagemaker_session.boto_session.client(\n", + " \"sagemaker-featurestore-runtime\", region_name=region\n", + ").batch_get_record(\n", + " Identifiers=[\n", + " {\n", + " \"FeatureGroupName\": customers_feature_group_name,\n", + " \"RecordIdentifiersValueAsString\": [\"573291\", \"109382\", \"828400\", \"124013\"],\n", + " },\n", + " {\n", + " \"FeatureGroupName\": orders_feature_group_name,\n", + " \"RecordIdentifiersValueAsString\": [\"573291\", \"109382\", \"828400\", \"124013\"],\n", + " },\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_records" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add features to a feature group\n", + "\n", + "If we want to update a FeatureGroup that has done the data ingestion, we can use the `UpdateFeatureGroup` API and then re-ingest data by using the updated dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customers_feature_group.update(\n", + " feature_additions=[{'feature_name': 'Email', 'feature_type': 'String'},{'feature_name': 'name', 'feature_type': 'String'}]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Verify the FeatureGroup has been updated successfully or not." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def check_last_update_status(feature_group):\n", + " last_update_status = feature_group.get(\"LastUpdateStatus\")[\"Status\"]\n", + " while last_update_status == \"InProgress\":\n", + " print(\"Waiting for FeatureGroup to be updated\")\n", + " time.sleep(5)\n", + " last_update_status = feature_group.get(\"LastUpdateStatus\")\n", + " if last_update_status == \"Successful\":\n", + " print(f\"FeatureGroup {feature_group.name} successfully updated.\")\n", + " else:\n", + " print(\n", + " f\"FeatureGroup {feature_group.name} updated failed. The LastUpdateStatus is\"\n", + " + str(last_update_status)\n", + " )\n", + "\n", + "\n", + "check_last_update_status(customers_feature_group)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect the new dataset." 
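Before loading the updated file, one note on `check_last_update_status` above: it indexes the dictionary returned by the classic SDK's `describe()`, which is not what a sagemaker_core resource returns. The same check can be written against the boto3 `describe_feature_group` response instead; a sketch, assuming the update call above succeeded:

```python
# Sketch only: polls DescribeFeatureGroup through boto3 until the update finishes.
import time

sm_client = sagemaker_session.boto_session.client("sagemaker", region_name=region)

def wait_for_feature_group_update(feature_group_name):
    while True:
        response = sm_client.describe_feature_group(FeatureGroupName=feature_group_name)
        status = response.get("LastUpdateStatus", {}).get("Status")
        if status != "InProgress":
            print(f"{feature_group_name} LastUpdateStatus: {status}")
            return status
        print("Waiting for FeatureGroup to be updated")
        time.sleep(5)

wait_for_feature_group_update(customers_feature_group_name)
```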
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customer_data_updated = pd.read_csv(\"data/feature_store_introduction_customer_updated.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customer_data_updated.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Append `EventTime` feature to your data frame again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customer_data_updated[\"EventTime\"] = pd.Series(\n", + " [current_time_sec] * len(customer_data), dtype=\"float64\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ingest the new dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## need to see how to ingest data in new SDK\n", + "\n", + "customers_feature_group.ingest(data_frame=customer_data_updated, max_workers=3, wait=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `batch_get_record` again to check that all updated data has been ingested into `customers_feature_group` by providing customer IDs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "updated_customers_records = sagemaker_session.boto_session.client(\n", + " \"sagemaker-featurestore-runtime\", region_name=region\n", + ").batch_get_record(\n", + " Identifiers=[\n", + " {\n", + " \"FeatureGroupName\": customers_feature_group_name,\n", + " \"RecordIdentifiersValueAsString\": [\"573291\", \"109382\", \"828400\", \"124013\"],\n", + " }\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "updated_customers_records" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clean up\n", + "Here we remove the Feature Groups we created. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customers_feature_group.delete()\n", + "orders_feature_group.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Next steps" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook you learned how to quickly get started with Feature Store and now know how to create feature groups, and ingest data into them.\n", + "\n", + "For an advanced example on how to use Feature Store for a Fraud Detection use-case, see [Fraud Detection with Feature Store](https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-featurestore/sagemaker_featurestore_fraud_detection_python_sdk.html).\n", + "\n", + "For detailed information about Feature Store, see the [Developer Guide](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Programmers note" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook we used a variety of different API calls. Most of them are accessible through the Python SDK, however some only exist within `boto3`. You can invoke the Python SDK API calls directly on your Feature Store objects, whereas to invoke API calls that exist within `boto3`, you must first access a boto client through your boto and sagemaker sessions: e.g. 
`sagemaker_session.boto_session.client()`.\n", + "\n", + "Below we list API calls used in this notebook that exist within the Python SDK and ones that exist in `boto3` for your reference. \n", + "\n", + "#### Python SDK API Calls\n", + "* `describe()`\n", + "* `ingest()`\n", + "* `delete()`\n", + "* `create()`\n", + "* `load_feature_definitions()`\n", + "* `update()`\n", + "* `update_feature_metadata()`\n", + "* `describe_feature_metadata()`\n", + "\n", + "#### Boto3 API Calls\n", + "* `list_feature_groups()`\n", + "* `get_record()`\n", + "* `batch_get_record()`\n", + "* `search()`\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notebook CI Test Results\n", + "\n", + "This notebook was tested in multiple regions. The test results are as follows, except for us-west-2 which is shown at the top of the notebook.\n", + "\n", + "![This us-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This us-east-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-east-2/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This us-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/us-west-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This ca-central-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ca-central-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This sa-east-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/sa-east-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This eu-west-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This eu-west-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-2/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This eu-west-3 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-west-3/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This eu-central-1 badge failed to load. 
Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-central-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This eu-north-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/eu-north-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This ap-southeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This ap-southeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-southeast-2/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This ap-northeast-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This ap-northeast-2 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-northeast-2/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n", + "\n", + "![This ap-south-1 badge failed to load. Check your device's internet connectivity, otherwise the service is currently unavailable](https://prod.us-west-2.tcx-beacon.docs.aws.dev/sagemaker-nb/ap-south-1/prepare_data|sm-feature_store_introduction|sm-feature_store_introduction.ipynb)\n" + ] + } + ], + "metadata": { + "instance_type": "ml.t3.medium", + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/sagemaker-core/spark-processing-sm-core.ipynb b/sagemaker-core/spark-processing-sm-core.ipynb new file mode 100644 index 0000000000..27b39ab143 --- /dev/null +++ b/sagemaker-core/spark-processing-sm-core.ipynb @@ -0,0 +1,1059 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Distributed Data Processing using Apache Spark and SageMaker Processing\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Apache Spark is a unified analytics engine for large-scale data processing. The Spark framework is often used within the context of machine learning workflows to run data transformation or feature engineering workloads at scale. Amazon SageMaker provides a set of prebuilt Docker images that include Apache Spark and other dependencies needed to run distributed data processing jobs on Amazon SageMaker. 
This example notebook demonstrates how to use the prebuilt Spark images on SageMaker Processing using the SageMaker Core Python SDK." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Runtime\n", + "\n", + "This notebook takes approximately 22 minutes to run." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Contents\n", + "\n", + "1. [Setup](#Setup)\n", + "1. [Example 1: Running a basic PySpark application](#Example-1:-Running-a-basic-PySpark-application)\n", + "1. [Example 2: Specify additional Python and jar file dependencies](#Example-2:-Specify-additional-Python-and-jar-file-dependencies)\n", + "1. [Example 3: Run a Java/Scala Spark application](#Example-3:-Run-a-Java/Scala-Spark-application)\n", + "1. [Example 4: Specifying additional Spark configuration](#Example-4:-Specifying-additional-Spark-configuration)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Install the latest SageMaker Core Python SDK" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing installation: sagemaker-core 1.0.2\n", + "Uninstalling sagemaker-core-1.0.2:\n", + " Successfully uninstalled sagemaker-core-1.0.2\n", + "Collecting sagemaker-core\n", + " Downloading sagemaker_core-1.0.3-py3-none-any.whl.metadata (4.9 kB)\n", + "Requirement already satisfied: boto3<2.0.0,>=1.34.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (1.35.8)\n", + "Requirement already satisfied: pydantic<3.0.0,>=1.7.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (2.7.0)\n", + "Requirement already satisfied: PyYAML<7.0,>=6.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (6.0.1)\n", + "Requirement already satisfied: jsonschema<5.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.17.3)\n", + "Requirement already satisfied: platformdirs<5.0.0,>=4.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.2.2)\n", + "Requirement already satisfied: rich<14.0.0,>=13.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (13.7.1)\n", + "Requirement already satisfied: mock<5.0,>4.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.0.3)\n", + "Requirement already satisfied: importlib-metadata<7.0,>=1.4.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (6.10.0)\n", + "Requirement already satisfied: botocore<1.36.0,>=1.35.8 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (1.35.8)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (1.0.1)\n", + "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (0.10.2)\n", + "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.10/site-packages (from importlib-metadata<7.0,>=1.4.0->sagemaker-core) (3.19.2)\n", + "Requirement already satisfied: attrs>=17.4.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema<5.0.0->sagemaker-core) (23.2.0)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema<5.0.0->sagemaker-core) (0.20.0)\n", + 
"Requirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.18.1 in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (2.18.1)\n", + "Requirement already satisfied: typing-extensions>=4.6.1 in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (4.12.2)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/conda/lib/python3.10/site-packages (from rich<14.0.0,>=13.0.0->sagemaker-core) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich<14.0.0,>=13.0.0->sagemaker-core) (2.18.0)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.10/site-packages (from botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (2.9.0)\n", + "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /opt/conda/lib/python3.10/site-packages (from botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (1.26.19)\n", + "Requirement already satisfied: mdurl~=0.1 in /opt/conda/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.0.0->sagemaker-core) (0.1.2)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (1.16.0)\n", + "Downloading sagemaker_core-1.0.3-py3-none-any.whl (377 kB)\n", + "Installing collected packages: sagemaker-core\n", + "Successfully installed sagemaker-core-1.0.3\n" + ] + } + ], + "source": [ + "!pip install pip --upgrade --quiet\n", + "!pip uninstall sagemaker-core -y\n", + "!pip install sagemaker-core --upgrade" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Restart your notebook kernel after upgrading the SDK*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1: Running a basic PySpark application\n", + "\n", + "The first example is a basic Spark MLlib data processing script. This script will take a raw data set and do some transformations on it such as string indexing and one hot encoding.\n", + "\n", + "### Setup S3 bucket locations and roles\n", + "\n", + "First, setup some locations in the default SageMaker bucket to store the raw input datasets and the Spark job output. Here, you'll also define the role that will be used to run all SageMaker Processing jobs." + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "from time import gmtime, strftime\n", + "from sagemaker_core.helper.session_helper import get_execution_role, Session\n", + "\n", + "sagemaker_logger = logging.getLogger(\"sagemaker\")\n", + "sagemaker_logger.setLevel(logging.INFO)\n", + "sagemaker_logger.addHandler(logging.StreamHandler())\n", + "\n", + "sagemaker_session = Session()\n", + "region = \"us-east-1\"\n", + "REGION_NAME = region if region else SM_SESSION._region_name\n", + "role = get_execution_role()\n", + "s3_bucket_name = sagemaker_session.default_bucket()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, you'll download the example dataset from a SageMaker staging bucket." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "# Fetch the dataset from the SageMaker bucket\n", + "import boto3\n", + "\n", + "s3 = boto3.client(\"s3\")\n", + "s3.download_file(\n", + " f\"sagemaker-sample-files\", \"datasets/tabular/uci_abalone/abalone.csv\", \"./data/abalone.csv\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write the PySpark script\n", + "\n", + "The source for a preprocessing script is in the cell below. The cell uses the `%%writefile` directive to save this file locally. This script does some basic feature engineering on a raw input dataset. In this example, the dataset is the [Abalone Data Set](https://archive.ics.uci.edu/ml/datasets/abalone) and the code below performs string indexing, one hot encoding, vector assembly, and combines them into a pipeline to perform these transformations in order. The script then does an 80-20 split to produce training and validation datasets as output." + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting ./code/preprocess.py\n" + ] + } + ], + "source": [ + "%%writefile ./code/preprocess.py\n", + "from __future__ import print_function\n", + "from __future__ import unicode_literals\n", + "\n", + "import argparse\n", + "import csv\n", + "import os\n", + "import shutil\n", + "import sys\n", + "import time\n", + "\n", + "import pyspark\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.feature import (\n", + " OneHotEncoder,\n", + " StringIndexer,\n", + " VectorAssembler,\n", + " VectorIndexer,\n", + ")\n", + "from pyspark.sql.functions import *\n", + "from pyspark.sql.types import (\n", + " DoubleType,\n", + " StringType,\n", + " StructField,\n", + " StructType,\n", + ")\n", + "\n", + "\n", + "def csv_line(data):\n", + " r = \",\".join(str(d) for d in data[1])\n", + " return str(data[0]) + \",\" + r\n", + "\n", + "\n", + "def main():\n", + " parser = argparse.ArgumentParser(description=\"app inputs and outputs\")\n", + " parser.add_argument(\"--s3_input_bucket\", type=str, help=\"s3 input bucket\")\n", + " parser.add_argument(\"--s3_input_key_prefix\", type=str, help=\"s3 input key prefix\")\n", + " parser.add_argument(\"--s3_output_bucket\", type=str, help=\"s3 output bucket\")\n", + " parser.add_argument(\"--s3_output_key_prefix\", type=str, help=\"s3 output key prefix\")\n", + " args = parser.parse_args()\n", + "\n", + " spark = SparkSession.builder.appName(\"PySparkApp\").getOrCreate()\n", + "\n", + " # This is needed to save RDDs which is the only way to write nested Dataframes into CSV format\n", + " spark.sparkContext._jsc.hadoopConfiguration().set(\n", + " \"mapred.output.committer.class\", \"org.apache.hadoop.mapred.FileOutputCommitter\"\n", + " )\n", + "\n", + " # Defining the schema corresponding to the input data. 
The input data does not contain the headers\n", + " schema = StructType(\n", + " [\n", + " StructField(\"sex\", StringType(), True),\n", + " StructField(\"length\", DoubleType(), True),\n", + " StructField(\"diameter\", DoubleType(), True),\n", + " StructField(\"height\", DoubleType(), True),\n", + " StructField(\"whole_weight\", DoubleType(), True),\n", + " StructField(\"shucked_weight\", DoubleType(), True),\n", + " StructField(\"viscera_weight\", DoubleType(), True),\n", + " StructField(\"shell_weight\", DoubleType(), True),\n", + " StructField(\"rings\", DoubleType(), True),\n", + " ]\n", + " )\n", + "\n", + " # Downloading the data from S3 into a Dataframe\n", + " total_df = spark.read.csv(\n", + " (\"s3://\" + os.path.join(args.s3_input_bucket, args.s3_input_key_prefix, \"abalone.csv\")),\n", + " header=False,\n", + " schema=schema,\n", + " )\n", + "\n", + " # StringIndexer on the sex column which has categorical value\n", + " sex_indexer = StringIndexer(inputCol=\"sex\", outputCol=\"indexed_sex\")\n", + "\n", + " # one-hot-encoding is being performed on the string-indexed sex column (indexed_sex)\n", + " sex_encoder = OneHotEncoder(inputCol=\"indexed_sex\", outputCol=\"sex_vec\")\n", + "\n", + " # vector-assembler will bring all the features to a 1D vector for us to save easily into CSV format\n", + " assembler = VectorAssembler(\n", + " inputCols=[\n", + " \"sex_vec\",\n", + " \"length\",\n", + " \"diameter\",\n", + " \"height\",\n", + " \"whole_weight\",\n", + " \"shucked_weight\",\n", + " \"viscera_weight\",\n", + " \"shell_weight\",\n", + " ],\n", + " outputCol=\"features\",\n", + " )\n", + "\n", + " # The pipeline is comprised of the steps added above\n", + " pipeline = Pipeline(stages=[sex_indexer, sex_encoder, assembler])\n", + "\n", + " # This step trains the feature transformers\n", + " model = pipeline.fit(total_df)\n", + "\n", + " # This step transforms the dataset with information obtained from the previous fit\n", + " transformed_total_df = model.transform(total_df)\n", + "\n", + " # Split the overall dataset into 80-20 training and validation\n", + " (train_df, validation_df) = transformed_total_df.randomSplit([0.8, 0.2])\n", + "\n", + " # Convert the train dataframe to RDD to save in CSV format and upload to S3\n", + " train_rdd = train_df.rdd.map(lambda x: (x.rings, x.features))\n", + " train_lines = train_rdd.map(csv_line)\n", + " train_lines.saveAsTextFile(\n", + " \"s3://\" + os.path.join(args.s3_output_bucket, args.s3_output_key_prefix, \"train\")\n", + " )\n", + "\n", + " # Convert the validation dataframe to RDD to save in CSV format and upload to S3\n", + " validation_rdd = validation_df.rdd.map(lambda x: (x.rings, x.features))\n", + " validation_lines = validation_rdd.map(csv_line)\n", + " validation_lines.saveAsTextFile(\n", + " \"s3://\" + os.path.join(args.s3_output_bucket, args.s3_output_key_prefix, \"validation\")\n", + " )\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Run the SageMaker Processing Job\n", + "\n", + "Next, you'll use the `ProcessingJob` class to define a Spark job and run it using SageMaker Processing. 
A few things to note in the definition of the `ProcessingJob`:\n",
+    "\n",
+    "* This is a multi-node job with two ml.m5.xlarge instances (specified via the `instance_count` and `instance_type` parameters of the `ProcessingClusterConfig`)\n",
+    "* The Spark framework version 3.1 image is specified via the `image_uri` parameter of the `AppSpecification`\n",
+    "* The PySpark script defined above is passed to the job via the `ProcessingInput` class\n",
+    "* Command-line arguments to the PySpark script (such as the S3 input and output locations) are passed via the `container_arguments` parameter of the `AppSpecification`\n",
+    "* Spark event logs are continuously uploaded to the `spark_event_logs` folder in the S3 output location\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
[09/06/24 11:22:23] INFO     INFO:sagemaker_core.main.resources:Creating processing_job          resources.py:23658\n",
+       "                             resource.                                                                             \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/06/24 11:22:23]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Creating processing_job \u001b]8;id=557611;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=769880;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23658\u001b\\\u001b[2m23658\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "336ef56bf8e043ba8bf2abd5d491bac1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[09/06/24 11:27:26] INFO     INFO:sagemaker_core.main.resources:Final Resource Status: Completed resources.py:23850\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/06/24 11:27:26]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Final Resource Status: \u001b[1mCompleted\u001b[0m \u001b]8;id=44518;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=290173;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23850\u001b\\\u001b[2m23850\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from sagemaker_core.shapes import ProcessingInput,ProcessingResources,AppSpecification,ProcessingS3Input,ProcessingOutputConfig\n",
+    "from sagemaker_core.shapes import ProcessingResources,ProcessingClusterConfig,ProcessingOutput,ProcessingS3Output\n",
+    "from sagemaker_core.resources import ProcessingJob\n",
+    "\n",
+    "# Upload the raw input dataset to a unique S3 location\n",
+    "timestamp_prefix = strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
+    "prefix = \"sagemaker/spark-preprocess-demo/{}\".format(timestamp_prefix)\n",
+    "input_prefix_abalone = \"{}/input/raw/abalone\".format(prefix)\n",
+    "input_preprocessed_prefix_abalone = \"{}/input/preprocessed/abalone\".format(prefix)\n",
+    "\n",
+    "base_job_name = \"sm-spark\"\n",
+    "final_job_name = base_job_name + \"-\"+ timestamp_prefix\n",
+    "\n",
+    "input_script_prefix = \"{}/input/code\".format(final_job_name)\n",
+    "\n",
+    "# uploading abolone.csv to S3 bucket\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./data/abalone.csv\", bucket=s3_bucket_name, key_prefix=input_prefix_abalone\n",
+    ")\n",
+    "\n",
+    "# uploading preprocess.py to S3 bucket\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./code/preprocess.py\", bucket=s3_bucket_name, key_prefix=input_script_prefix\n",
+    ")\n",
+    "\n",
+    "\n",
+    "processing_input = ProcessingInput(input_name=\"code\",s3_input = ProcessingS3Input(\n",
+    "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/code/preprocess.py\",\n",
+    "                                        #s3_uri=\"s3://sagemaker-us-east-1-774297356213/sm-spark-2024-08-30-05-25-18-294/input/code/preprocess.py\",\n",
+    "                                        s3_data_type=\"S3Prefix\",\n",
+    "                                        local_path = \"/opt/ml/processing/input/code\",\n",
+    "                                        s3_input_mode=\"File\"\n",
+    "                                        ))\n",
+    "\n",
+    "processing_output_config = ProcessingOutputConfig(outputs= [ProcessingOutput(output_name = \"output-1\",s3_output=ProcessingS3Output(\n",
+    "    s3_uri=f\"s3://{s3_bucket_name}/{prefix}/spark_event_logs\",\n",
+    "    local_path=\"/opt/ml/processing/spark-events/\", s3_upload_mode=\"Continuous\"))])\n",
+    "\n",
+    "processing_resources = ProcessingResources(cluster_config=ProcessingClusterConfig\n",
+    "                                           (instance_count=2,instance_type=\"ml.m5.xlarge\",volume_size_in_gb=30))\n",
+    "\n",
+    "app_specification = AppSpecification(image_uri = \"173754725891.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark-processing:3.1-cpu\",\n",
+    "                                    container_entrypoint = [\"smspark-submit\",\n",
+    "                                                            \"--local-spark-event-logs-dir\",\n",
+    "                                                            \"/opt/ml/processing/spark-events/\",\n",
+    "                                                            \"/opt/ml/processing/input/code/preprocess.py\"],\n",
+    "                                    container_arguments = [\"--s3_input_bucket\",\n",
+    "                                                           f\"{s3_bucket_name}\",\n",
+    "                                                           \"--s3_input_key_prefix\",\n",
+    "                                                           f\"{input_prefix_abalone}\",\n",
+    "                                                           \"--s3_output_bucket\",\n",
+    "                                                           f\"{s3_bucket_name}\",\n",
+    "                                                           \"--s3_output_key_prefix\",\n",
+    "                                                           f\"{input_preprocessed_prefix_abalone}\"])\n",
+    "\n",
+    "# Run the processing job\n",
+    "processing_job_obj = ProcessingJob.create(processing_job_name = final_job_name,\n",
+    "                            processing_resources=processing_resources,\n",
+    "                            app_specification=app_specification,\n",
+    "                            role_arn=role,\n",
+    "                            processing_inputs=[processing_input],\n",
+    "                            processing_output_config=processing_output_config)\n",
+    "\n",
+    "processing_job_obj.wait()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Validate Data Processing Results\n",
+    "\n",
+    "Next, validate the output of our data preprocessing job by looking at the first 5 rows of the output dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Top 5 rows from s3://sagemaker-us-east-1-774297356213/sagemaker/spark-preprocess-demo/2024-09-06-11-22-23/input/preprocessed/abalone/train/\n",
+      "5.0,0.0,0.0,0.275,0.195,0.07,0.08,0.031,0.0215,0.025\n",
+      "6.0,0.0,0.0,0.29,0.21,0.075,0.275,0.113,0.0675,0.035\n",
+      "5.0,0.0,0.0,0.29,0.225,0.075,0.14,0.0515,0.0235,0.04\n",
+      "7.0,0.0,0.0,0.305,0.225,0.07,0.1485,0.0585,0.0335,0.045\n",
+      "7.0,0.0,0.0,0.305,0.23,0.08,0.156,0.0675,0.0345,0.048\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Top 5 rows from s3://{}/{}/train/\".format(s3_bucket_name, input_preprocessed_prefix_abalone))\n",
+    "!aws s3 cp --quiet s3://$s3_bucket_name/$input_preprocessed_prefix_abalone/train/part-00000 - | head -n5"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 2: Specify additional Python and jar file dependencies\n",
+    "\n",
+    "The next example demonstrates a scenario where additional Python file dependencies are required by the PySpark script. You'll use a sample PySpark script that requires additional user-defined functions (UDFs) defined in a local module."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ./code/hello_py_spark_app.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile ./code/hello_py_spark_app.py\n",
+    "import argparse\n",
+    "import time\n",
+    "\n",
+    "# Import local module to test spark-submit--py-files dependencies\n",
+    "import hello_py_spark_udfs as udfs\n",
+    "from pyspark.sql import SparkSession, SQLContext\n",
+    "from pyspark.sql.functions import udf\n",
+    "from pyspark.sql.types import IntegerType\n",
+    "import time\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    print(\"Hello World, this is PySpark!\")\n",
+    "\n",
+    "    parser = argparse.ArgumentParser(description=\"inputs and outputs\")\n",
+    "    parser.add_argument(\"--input\", type=str, help=\"path to input data\")\n",
+    "    parser.add_argument(\"--output\", required=False, type=str, help=\"path to output data\")\n",
+    "    args = parser.parse_args()\n",
+    "    spark = SparkSession.builder.appName(\"SparkTestApp\").getOrCreate()\n",
+    "    sqlContext = SQLContext(spark.sparkContext)\n",
+    "\n",
+    "    # Load test data set\n",
+    "    inputPath = args.input\n",
+    "    outputPath = args.output\n",
+    "    salesDF = spark.read.json(inputPath)\n",
+    "    salesDF.printSchema()\n",
+    "\n",
+    "    salesDF.createOrReplaceTempView(\"sales\")\n",
+    "\n",
+    "    # Define a UDF that doubles an integer column\n",
+    "    # The UDF function is imported from local module to test spark-submit--py-files dependencies\n",
+    "    double_udf_int = udf(udfs.double_x, IntegerType())\n",
+    "\n",
+    "    # Save transformed data set to disk\n",
+    "    salesDF.select(\"date\", \"sale\", double_udf_int(\"sale\").alias(\"sale_double\")).write.json(\n",
+    "        outputPath\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ./code/hello_py_spark_udfs.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile ./code/hello_py_spark_udfs.py\n",
+    "def double_x(x):\n",
+    "    return x + x"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create a processing job with Python file dependencies\n",
+    "\n",
+    "Then, you'll create a processing job where the additional Python file dependencies are specified via the `py-files` input name in the `ProcessingInput` class."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
[09/06/24 11:27:28] INFO     INFO:sagemaker_core.main.resources:Creating processing_job          resources.py:23658\n",
+       "                             resource.                                                                             \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/06/24 11:27:28]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Creating processing_job \u001b]8;id=734529;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=826930;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23658\u001b\\\u001b[2m23658\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a3de84148b9e4ca0acaf549cc0e649e7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[09/06/24 11:32:31] INFO     INFO:sagemaker_core.main.resources:Final Resource Status: Completed resources.py:23850\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/06/24 11:32:31]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Final Resource Status: \u001b[1mCompleted\u001b[0m \u001b]8;id=832796;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=802957;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23850\u001b\\\u001b[2m23850\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from sagemaker_core.shapes import ProcessingInput,ProcessingResources,AppSpecification,ProcessingS3Input,ProcessingOutputConfig\n",
+    "from sagemaker_core.shapes import ProcessingResources,ProcessingClusterConfig,ProcessingOutput,ProcessingS3Output\n",
+    "from sagemaker_core.resources import ProcessingJob\n",
+    "\n",
+    "# Upload the raw input dataset to a unique S3 location\n",
+    "timestamp_prefix = strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
+    "prefix = \"sagemaker/spark-preprocess-demo/{}\".format(timestamp_prefix)\n",
+    "input_prefix_sales = \"{}/input/sales\".format(prefix)\n",
+    "output_prefix_sales = \"{}/output/sales\".format(prefix)\n",
+    "input_s3_uri = \"s3://{}/{}\".format(s3_bucket_name, input_prefix_sales)\n",
+    "output_s3_uri = \"s3://{}/{}\".format(s3_bucket_name, output_prefix_sales)\n",
+    "\n",
+    "base_job_name = \"sm-spark-udfs\"\n",
+    "final_job_name = base_job_name + \"-\"+ timestamp_prefix\n",
+    "\n",
+    "input_script_prefix = \"{}/input/code\".format(final_job_name)\n",
+    "input_pyfiles_prefix = \"{}/input/py-files\".format(final_job_name)\n",
+    "\n",
+    "# uploading data.jsonl to S3\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./data/data.jsonl\", bucket=s3_bucket_name, key_prefix=input_prefix_sales\n",
+    ")\n",
+    "\n",
+    "# uploading hello_py_spark_app.py to S3\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./code/hello_py_spark_app.py\", bucket=s3_bucket_name, key_prefix=input_script_prefix\n",
+    ")\n",
+    "\n",
+    "# uploading hello_py_spark_udfs.py to S3\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./code/hello_py_spark_udfs.py\", bucket=s3_bucket_name, key_prefix=input_pyfiles_prefix\n",
+    ")\n",
+    "\n",
+    "# providing processing script\n",
+    "processing_input_code = ProcessingInput(input_name=\"code\",s3_input = ProcessingS3Input(\n",
+    "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/code/hello_py_spark_app.py\",\n",
+    "                                        s3_data_type=\"S3Prefix\",\n",
+    "                                        local_path = \"/opt/ml/processing/input/code\",\n",
+    "                                        s3_input_mode=\"File\"\n",
+    "                                        ))\n",
+    "\n",
+    "#providing py files\n",
+    "processing_input_pyfiles = ProcessingInput(input_name=\"py-files\",s3_input = ProcessingS3Input(\n",
+    "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/py-files\",\n",
+    "                                        s3_data_type=\"S3Prefix\",\n",
+    "                                        local_path = \"/opt/ml/processing/input/py-files\",\n",
+    "                                        s3_input_mode=\"File\"\n",
+    "                                        ))\n",
+    "\n",
+    "# providing processing resources\n",
+    "processing_resources = ProcessingResources(cluster_config=ProcessingClusterConfig\n",
+    "                                           (instance_count=2,instance_type=\"ml.m5.xlarge\",volume_size_in_gb=30))\n",
+    "\n",
+    "# providing app specification\n",
+    "app_specification = AppSpecification(image_uri = \"173754725891.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark-processing:3.1-cpu\",\n",
+    "                                    container_entrypoint = [\"smspark-submit\",\n",
+    "                                                            \"--py-files\",\n",
+    "                                                            \"/opt/ml/processing/input/py-files\",\n",
+    "                                                            \"/opt/ml/processing/input/code/hello_py_spark_app.py\"],\n",
+    "                                    container_arguments = [\"--input\",\n",
+    "                                                           f\"s3://{s3_bucket_name}/{input_prefix_sales}\",\n",
+    "                                                           \"--output\",\n",
+    "                                                           f\"s3://{s3_bucket_name}/{output_prefix_sales}\",\n",
+    "                                                           ])\n",
+    "\n",
+    "# Run the processing job\n",
+    "processing_job_obj = ProcessingJob.create(processing_job_name = final_job_name,\n",
+    "                            processing_resources=processing_resources,\n",
+    "                            app_specification=app_specification,\n",
+    "                            role_arn=role,\n",
+    "                            processing_inputs=[processing_input_code,processing_input_pyfiles])\n",
+    "\n",
+    "processing_job_obj.wait()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Validate Data Processing Results\n",
+    "\n",
+    "Next, validate the output of the Spark job by ensuring that the output URI contains the Spark `_SUCCESS` file along with the output json lines file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Output files in s3://sagemaker-us-east-1-774297356213/sagemaker/spark-preprocess-demo/2024-09-06-11-27-28/output/sales\n",
+      "2024-09-06 11:30:09          0 _SUCCESS\n",
+      "2024-09-06 11:30:09      51313 part-00000-346694e9-6b36-4497-9433-ee018ed72f32-c000.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Output files in {}\".format(output_s3_uri))\n",
+    "!aws s3 ls $output_s3_uri/"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 3: Run a Java/Scala Spark application\n",
+    "\n",
+    "In the next example, you'll take a Spark application jar (located in `./code/spark-test-app.jar`) that is already built and run it using SageMaker Processing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
[09/06/24 11:33:07] INFO     INFO:sagemaker_core.main.resources:Creating processing_job          resources.py:23658\n",
+       "                             resource.                                                                             \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/06/24 11:33:07]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Creating processing_job \u001b]8;id=667142;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=638690;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23658\u001b\\\u001b[2m23658\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "344088d3f0d24754bf75419cb33c9489", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[09/06/24 11:38:10] INFO     INFO:sagemaker_core.main.resources:Final Resource Status: Completed resources.py:23850\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/06/24 11:38:10]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Final Resource Status: \u001b[1mCompleted\u001b[0m \u001b]8;id=947038;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=166883;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23850\u001b\\\u001b[2m23850\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from sagemaker_core.shapes import ProcessingInput,ProcessingResources,AppSpecification,ProcessingS3Input,ProcessingOutputConfig\n",
+    "from sagemaker_core.shapes import ProcessingResources,ProcessingClusterConfig,ProcessingOutput,ProcessingS3Output\n",
+    "from sagemaker_core.resources import ProcessingJob\n",
+    "\n",
+    "# Upload the raw input dataset to S3\n",
+    "timestamp_prefix = strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
+    "prefix = \"sagemaker/spark-preprocess-demo/{}\".format(timestamp_prefix)\n",
+    "input_prefix_sales = \"{}/input/sales\".format(prefix)\n",
+    "output_prefix_sales = \"{}/output/sales\".format(prefix)\n",
+    "input_s3_uri = \"s3://{}/{}\".format(s3_bucket_name, input_prefix_sales)\n",
+    "output_s3_uri = \"s3://{}/{}\".format(s3_bucket_name, output_prefix_sales)\n",
+    "\n",
+    "\n",
+    "base_job_name = \"sm-spark-java\"\n",
+    "final_job_name = base_job_name + \"-\"+ timestamp_prefix\n",
+    "\n",
+    "input_script_prefix = \"{}/input/code\".format(final_job_name)\n",
+    "input_pyfiles_prefix = \"{}/input/py-files\".format(final_job_name)\n",
+    "\n",
+    "# uploading data.jsonl to S3\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./data/data.jsonl\", bucket=s3_bucket_name, key_prefix=input_prefix_sales\n",
+    ")\n",
+    "\n",
+    "# uploading spark-test-app.jar to S3\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./code/spark-test-app.jar\", bucket=s3_bucket_name, key_prefix=input_script_prefix\n",
+    ")\n",
+    "\n",
+    "\n",
+    "processing_input_code = ProcessingInput(input_name=\"code\",s3_input = ProcessingS3Input(\n",
+    "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/code/spark-test-app.jar\",\n",
+    "                                        #s3_uri=\"s3://sagemaker-us-east-1-774297356213/sm-spark-2024-08-30-05-25-18-294/input/code/preprocess.py\",\n",
+    "                                        s3_data_type=\"S3Prefix\",\n",
+    "                                        local_path = \"/opt/ml/processing/input/code\",\n",
+    "                                        s3_input_mode=\"File\"\n",
+    "                                        ))\n",
+    "\n",
+    "# providing processing resources\n",
+    "processing_resources = ProcessingResources(cluster_config=ProcessingClusterConfig\n",
+    "                                           (instance_count=2,instance_type=\"ml.m5.xlarge\",volume_size_in_gb=30))\n",
+    "\n",
+    "#providing app specification\n",
+    "app_specification = AppSpecification(image_uri = \"173754725891.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark-processing:3.1-cpu\",\n",
+    "                                    container_entrypoint = [\"smspark-submit\",\n",
+    "                                                            \"--class\",\n",
+    "                                                            \"com.amazonaws.sagemaker.spark.test.HelloJavaSparkApp\",\n",
+    "                                                            \"/opt/ml/processing/input/code/spark-test-app.jar\"],\n",
+    "                                    container_arguments = [\"--input\",\n",
+    "                                                           f\"s3://{s3_bucket_name}/{input_prefix_sales}\",\n",
+    "                                                           \"--output\",\n",
+    "                                                           f\"s3://{s3_bucket_name}/{output_prefix_sales}\",\n",
+    "                                                           ])\n",
+    "\n",
+    "# Run the processing job\n",
+    "processing_job_obj = ProcessingJob.create(processing_job_name = final_job_name,\n",
+    "                            processing_resources=processing_resources,\n",
+    "                            app_specification=app_specification,\n",
+    "                            role_arn=role,\n",
+    "                            processing_inputs=[processing_input_code,processing_input_pyfiles])\n",
+    "\n",
+    "processing_job_obj.wait()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example 4: Specifying additional Spark configuration\n",
+    "\n",
+    "Overriding Spark configuration is crucial for a number of tasks such as tuning your Spark application or configuring the Hive metastore. Using the SageMaker Python SDK, you can easily override Spark/Hive/Hadoop configuration.\n",
+    "\n",
+    "The next example demonstrates this by overriding Spark executor memory/cores.\n",
+    "\n",
+    "For more information on configuring your Spark application, see the EMR documentation on [Configuring Applications](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html)\n",
+    "\n",
+    "**Below code is to create configuration.json file for overriding Spark executor memory/cores**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Overwriting ./code/configuration.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%writefile ./code/configuration.json\n",
+    "[{\"Classification\": \"spark-defaults\", \"Properties\": {\"spark.executor.memory\": \"2g\", \"spark.executor.cores\": \"1\"}}]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
[09/06/24 11:43:28] INFO     INFO:sagemaker_core.main.resources:Creating processing_job          resources.py:23658\n",
+       "                             resource.                                                                             \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/06/24 11:43:28]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Creating processing_job \u001b]8;id=267427;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=778838;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23658\u001b\\\u001b[2m23658\u001b[0m\u001b]8;;\u001b\\\n", + "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7467eda716b64be098a868ba5d1b9b03", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
[09/06/24 11:48:31] INFO     INFO:sagemaker_core.main.resources:Final Resource Status: Completed resources.py:23850\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[2;36m[09/06/24 11:48:31]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Final Resource Status: \u001b[1mCompleted\u001b[0m \u001b]8;id=928893;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=46636;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23850\u001b\\\u001b[2m23850\u001b[0m\u001b]8;;\u001b\\\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "from sagemaker_core.shapes import ProcessingInput,ProcessingResources,AppSpecification,ProcessingS3Input,ProcessingOutputConfig\n",
+    "from sagemaker_core.shapes import ProcessingResources,ProcessingClusterConfig,ProcessingOutput,ProcessingS3Output\n",
+    "from sagemaker_core.resources import ProcessingJob\n",
+    "\n",
+    "# Upload the raw input dataset to a unique S3 location\n",
+    "timestamp_prefix = strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
+    "prefix = \"sagemaker/spark-preprocess-demo/{}\".format(timestamp_prefix)\n",
+    "input_prefix_abalone = \"{}/input/raw/abalone\".format(prefix)\n",
+    "input_preprocessed_prefix_abalone = \"{}/input/preprocessed/abalone\".format(prefix)\n",
+    "\n",
+    "base_job_name = \"sm-spark\"\n",
+    "final_job_name = base_job_name + \"-\"+ timestamp_prefix\n",
+    "\n",
+    "input_script_prefix = \"{}/input/code\".format(final_job_name)\n",
+    "input_conf_prefix = \"{}/input/conf\".format(final_job_name)\n",
+    "\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./data/abalone.csv\", bucket=s3_bucket_name, key_prefix=input_prefix_abalone\n",
+    ")\n",
+    "\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./code/preprocess.py\", bucket=s3_bucket_name, key_prefix=input_script_prefix\n",
+    ")\n",
+    "\n",
+    "sagemaker_session.upload_data(\n",
+    "    path=\"./code/configuration.json\", bucket=s3_bucket_name, key_prefix=input_conf_prefix\n",
+    ")\n",
+    "\n",
+    "\n",
+    "processing_input_code = ProcessingInput(input_name=\"code\",s3_input = ProcessingS3Input(\n",
+    "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/code/preprocess.py\",\n",
+    "                                        #s3_uri=\"s3://sagemaker-us-east-1-774297356213/sm-spark-2024-08-30-05-25-18-294/input/code/preprocess.py\",\n",
+    "                                        s3_data_type=\"S3Prefix\",\n",
+    "                                        local_path = \"/opt/ml/processing/input/code\",\n",
+    "                                        s3_input_mode=\"File\"\n",
+    "                                        ))\n",
+    "\n",
+    "processing_input_conf = ProcessingInput(input_name=\"conf\",s3_input = ProcessingS3Input(\n",
+    "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/conf/configuration.json\",\n",
+    "                                        #s3_uri=\"s3://sagemaker-us-east-1-774297356213/sm-spark-2024-08-30-05-25-18-294/input/code/preprocess.py\",\n",
+    "                                        s3_data_type=\"S3Prefix\",\n",
+    "                                        local_path = \"/opt/ml/processing/input/conf\",\n",
+    "                                        s3_input_mode=\"File\"\n",
+    "                                        ))\n",
+    "\n",
+    "\n",
+    "processing_resources = ProcessingResources(cluster_config=ProcessingClusterConfig\n",
+    "                                           (instance_count=2,instance_type=\"ml.m5.xlarge\",volume_size_in_gb=30))\n",
+    "\n",
+    "app_specification = AppSpecification(image_uri = \"173754725891.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark-processing:3.1-cpu\",\n",
+    "                                    container_entrypoint = [\"smspark-submit\",\n",
+    "                                                            \"/opt/ml/processing/input/code/preprocess.py\"],\n",
+    "                                    container_arguments = [\"--s3_input_bucket\",\n",
+    "                                                           s3_bucket_name,\n",
+    "                                                           \"--s3_input_key_prefix\",\n",
+    "                                                           input_prefix_abalone,\n",
+    "                                                           \"--s3_output_bucket\",\n",
+    "                                                           s3_bucket_name,\n",
+    "                                                           \"--s3_output_key_prefix\",\n",
+    "                                                           input_preprocessed_prefix_abalone])\n",
+    "\n",
+    "# Run the processing job\n",
+    "processing_job_obj = ProcessingJob.create(processing_job_name = final_job_name,\n",
+    "                            processing_resources=processing_resources,\n",
+    "                            app_specification=app_specification,\n",
+    "                            role_arn=role,\n",
+    "                            processing_inputs=[processing_input_code,processing_input_conf])\n",
+    "\n",
+    "processing_job_obj.wait()"
+   ]
+  }
+ ],
+ "metadata": {
+  "instance_type": "ml.t3.medium",
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From ccc1759aacf8404e2fdf9db9e25dd70513b8f6d1 Mon Sep 17 00:00:00 2001
From: Ayush Sharma 
Date: Thu, 12 Sep 2024 14:08:49 +0530
Subject: [PATCH 2/4] added modified spark-processing-sm-core.ipynb

Removed outputs, used the default bucket only, and added some comments for clarity.
---
 sagemaker-core/spark-processing-sm-core.ipynb | 388 +++---------------
 1 file changed, 47 insertions(+), 341 deletions(-)

diff --git a/sagemaker-core/spark-processing-sm-core.ipynb b/sagemaker-core/spark-processing-sm-core.ipynb
index 27b39ab143..5fb290a2a9 100644
--- a/sagemaker-core/spark-processing-sm-core.ipynb
+++ b/sagemaker-core/spark-processing-sm-core.ipynb
@@ -53,52 +53,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Found existing installation: sagemaker-core 1.0.2\n",
-      "Uninstalling sagemaker-core-1.0.2:\n",
-      "  Successfully uninstalled sagemaker-core-1.0.2\n",
-      "Collecting sagemaker-core\n",
-      "  Downloading sagemaker_core-1.0.3-py3-none-any.whl.metadata (4.9 kB)\n",
-      "Requirement already satisfied: boto3<2.0.0,>=1.34.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (1.35.8)\n",
-      "Requirement already satisfied: pydantic<3.0.0,>=1.7.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (2.7.0)\n",
-      "Requirement already satisfied: PyYAML<7.0,>=6.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (6.0.1)\n",
-      "Requirement already satisfied: jsonschema<5.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.17.3)\n",
-      "Requirement already satisfied: platformdirs<5.0.0,>=4.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.2.2)\n",
-      "Requirement already satisfied: rich<14.0.0,>=13.0.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (13.7.1)\n",
-      "Requirement already satisfied: mock<5.0,>4.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (4.0.3)\n",
-      "Requirement already satisfied: importlib-metadata<7.0,>=1.4.0 in /opt/conda/lib/python3.10/site-packages (from sagemaker-core) (6.10.0)\n",
-      "Requirement already satisfied: botocore<1.36.0,>=1.35.8 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (1.35.8)\n",
-      "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (1.0.1)\n",
-      "Requirement already satisfied: s3transfer<0.11.0,>=0.10.0 in /opt/conda/lib/python3.10/site-packages (from boto3<2.0.0,>=1.34.0->sagemaker-core) (0.10.2)\n",
-      "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.10/site-packages (from importlib-metadata<7.0,>=1.4.0->sagemaker-core) (3.19.2)\n",
-      "Requirement already satisfied: attrs>=17.4.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema<5.0.0->sagemaker-core) (23.2.0)\n",
-      "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema<5.0.0->sagemaker-core) (0.20.0)\n",
-      "Requirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (0.7.0)\n",
-      "Requirement already satisfied: pydantic-core==2.18.1 in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (2.18.1)\n",
-      "Requirement already satisfied: typing-extensions>=4.6.1 in /opt/conda/lib/python3.10/site-packages (from pydantic<3.0.0,>=1.7.0->sagemaker-core) (4.12.2)\n",
-      "Requirement already satisfied: markdown-it-py>=2.2.0 in /opt/conda/lib/python3.10/site-packages (from rich<14.0.0,>=13.0.0->sagemaker-core) (3.0.0)\n",
-      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich<14.0.0,>=13.0.0->sagemaker-core) (2.18.0)\n",
-      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.10/site-packages (from botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (2.9.0)\n",
-      "Requirement already satisfied: urllib3!=2.2.0,<3,>=1.25.4 in /opt/conda/lib/python3.10/site-packages (from botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (1.26.19)\n",
-      "Requirement already satisfied: mdurl~=0.1 in /opt/conda/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich<14.0.0,>=13.0.0->sagemaker-core) (0.1.2)\n",
-      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.36.0,>=1.35.8->boto3<2.0.0,>=1.34.0->sagemaker-core) (1.16.0)\n",
-      "Downloading sagemaker_core-1.0.3-py3-none-any.whl (377 kB)\n",
-      "Installing collected packages: sagemaker-core\n",
-      "Successfully installed sagemaker-core-1.0.3\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "!pip install pip --upgrade --quiet\n",
     "!pip uninstall sagemaker-core -y\n",
+    "!pip install pip --upgrade --quiet\n",
     "!pip install sagemaker-core --upgrade"
    ]
   },
@@ -124,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 93,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -137,10 +99,9 @@
     "sagemaker_logger.addHandler(logging.StreamHandler())\n",
     "\n",
     "sagemaker_session = Session()\n",
-    "region = \"us-east-1\"\n",
-    "REGION_NAME = region if region else SM_SESSION._region_name\n",
+    "REGION_NAME = sagemaker_session._region_name\n",
     "role = get_execution_role()\n",
-    "s3_bucket_name  = sagemaker_session.default_bucket()"
+    "s3_bucket_name = sagemaker_session.default_bucket()"
    ]
   },
   {
@@ -152,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -176,17 +137,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting ./code/preprocess.py\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%writefile ./code/preprocess.py\n",
     "from __future__ import print_function\n",
@@ -328,64 +281,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": null,
    "metadata": {
     "scrolled": true
    },
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
[09/06/24 11:22:23] INFO     INFO:sagemaker_core.main.resources:Creating processing_job          resources.py:23658\n",
-       "                             resource.                                                                             \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[09/06/24 11:22:23]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Creating processing_job \u001b]8;id=557611;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=769880;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23658\u001b\\\u001b[2m23658\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "336ef56bf8e043ba8bf2abd5d491bac1", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
[09/06/24 11:27:26] INFO     INFO:sagemaker_core.main.resources:Final Resource Status: Completed resources.py:23850\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[09/06/24 11:27:26]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Final Resource Status: \u001b[1mCompleted\u001b[0m \u001b]8;id=44518;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=290173;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23850\u001b\\\u001b[2m23850\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sagemaker_core.shapes import ProcessingInput,ProcessingResources,AppSpecification,ProcessingS3Input,ProcessingOutputConfig\n",
     "from sagemaker_core.shapes import ProcessingResources,ProcessingClusterConfig,ProcessingOutput,ProcessingS3Output\n",
@@ -402,6 +302,7 @@
     "\n",
     "input_script_prefix = \"{}/input/code\".format(final_job_name)\n",
     "\n",
+    "# uploading required data to S3 for reference\n",
     "# uploading abolone.csv to S3 bucket\n",
     "sagemaker_session.upload_data(\n",
     "    path=\"./data/abalone.csv\", bucket=s3_bucket_name, key_prefix=input_prefix_abalone\n",
@@ -412,7 +313,7 @@
     "    path=\"./code/preprocess.py\", bucket=s3_bucket_name, key_prefix=input_script_prefix\n",
     ")\n",
     "\n",
-    "\n",
+    "# initializing ProcessingInputs,ProcessingResources,ProcessingOutputConfig and AppSpecification configurations\n",
     "processing_input = ProcessingInput(input_name=\"code\",s3_input = ProcessingS3Input(\n",
     "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/code/preprocess.py\",\n",
     "                                        #s3_uri=\"s3://sagemaker-us-east-1-774297356213/sm-spark-2024-08-30-05-25-18-294/input/code/preprocess.py\",\n",
@@ -464,22 +365,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Top 5 rows from s3://sagemaker-us-east-1-774297356213/sagemaker/spark-preprocess-demo/2024-09-06-11-22-23/input/preprocessed/abalone/train/\n",
-      "5.0,0.0,0.0,0.275,0.195,0.07,0.08,0.031,0.0215,0.025\n",
-      "6.0,0.0,0.0,0.29,0.21,0.075,0.275,0.113,0.0675,0.035\n",
-      "5.0,0.0,0.0,0.29,0.225,0.075,0.14,0.0515,0.0235,0.04\n",
-      "7.0,0.0,0.0,0.305,0.225,0.07,0.1485,0.0585,0.0335,0.045\n",
-      "7.0,0.0,0.0,0.305,0.23,0.08,0.156,0.0675,0.0345,0.048\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Top 5 rows from s3://{}/{}/train/\".format(s3_bucket_name, input_preprocessed_prefix_abalone))\n",
     "!aws s3 cp --quiet s3://$s3_bucket_name/$input_preprocessed_prefix_abalone/train/part-00000 - | head -n5"
@@ -496,17 +384,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting ./code/hello_py_spark_app.py\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%writefile ./code/hello_py_spark_app.py\n",
     "import argparse\n",
@@ -547,19 +427,18 @@
     "    )"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Creating `hello_py_spark_udfs.py` inside `code` folder"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting ./code/hello_py_spark_udfs.py\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%writefile ./code/hello_py_spark_udfs.py\n",
     "def double_x(x):\n",
@@ -577,62 +456,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
[09/06/24 11:27:28] INFO     INFO:sagemaker_core.main.resources:Creating processing_job          resources.py:23658\n",
-       "                             resource.                                                                             \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[09/06/24 11:27:28]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Creating processing_job \u001b]8;id=734529;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=826930;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23658\u001b\\\u001b[2m23658\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a3de84148b9e4ca0acaf549cc0e649e7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
[09/06/24 11:32:31] INFO     INFO:sagemaker_core.main.resources:Final Resource Status: Completed resources.py:23850\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[09/06/24 11:32:31]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Final Resource Status: \u001b[1mCompleted\u001b[0m \u001b]8;id=832796;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=802957;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23850\u001b\\\u001b[2m23850\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sagemaker_core.shapes import ProcessingInput,ProcessingResources,AppSpecification,ProcessingS3Input,ProcessingOutputConfig\n",
     "from sagemaker_core.shapes import ProcessingResources,ProcessingClusterConfig,ProcessingOutput,ProcessingS3Output\n",
@@ -652,6 +478,7 @@
     "input_script_prefix = \"{}/input/code\".format(final_job_name)\n",
     "input_pyfiles_prefix = \"{}/input/py-files\".format(final_job_name)\n",
     "\n",
+    "# uploading required data to S3 for reference\n",
     "# uploading data.jsonl to S3\n",
     "sagemaker_session.upload_data(\n",
     "    path=\"./data/data.jsonl\", bucket=s3_bucket_name, key_prefix=input_prefix_sales\n",
@@ -667,6 +494,7 @@
     "    path=\"./code/hello_py_spark_udfs.py\", bucket=s3_bucket_name, key_prefix=input_pyfiles_prefix\n",
     ")\n",
     "\n",
+    "# initializing ProcessingInputs,ProcessingResources and AppSpecification configurations\n",
     "# providing processing script\n",
     "processing_input_code = ProcessingInput(input_name=\"code\",s3_input = ProcessingS3Input(\n",
     "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/code/hello_py_spark_app.py\",\n",
@@ -720,19 +548,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 82,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Output files in s3://sagemaker-us-east-1-774297356213/sagemaker/spark-preprocess-demo/2024-09-06-11-27-28/output/sales\n",
-      "2024-09-06 11:30:09          0 _SUCCESS\n",
-      "2024-09-06 11:30:09      51313 part-00000-346694e9-6b36-4497-9433-ee018ed72f32-c000.json\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "print(\"Output files in {}\".format(output_s3_uri))\n",
     "!aws s3 ls $output_s3_uri/"
@@ -749,62 +567,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
[09/06/24 11:33:07] INFO     INFO:sagemaker_core.main.resources:Creating processing_job          resources.py:23658\n",
-       "                             resource.                                                                             \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[09/06/24 11:33:07]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Creating processing_job \u001b]8;id=667142;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=638690;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23658\u001b\\\u001b[2m23658\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "344088d3f0d24754bf75419cb33c9489", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
[09/06/24 11:38:10] INFO     INFO:sagemaker_core.main.resources:Final Resource Status: Completed resources.py:23850\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[09/06/24 11:38:10]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Final Resource Status: \u001b[1mCompleted\u001b[0m \u001b]8;id=947038;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=166883;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23850\u001b\\\u001b[2m23850\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sagemaker_core.shapes import ProcessingInput,ProcessingResources,AppSpecification,ProcessingS3Input,ProcessingOutputConfig\n",
     "from sagemaker_core.shapes import ProcessingResources,ProcessingClusterConfig,ProcessingOutput,ProcessingS3Output\n",
@@ -825,17 +590,16 @@
     "input_script_prefix = \"{}/input/code\".format(final_job_name)\n",
     "input_pyfiles_prefix = \"{}/input/py-files\".format(final_job_name)\n",
     "\n",
-    "# uploading data.jsonl to S3\n",
+    "# uploading required data to S3 for reference\n",
     "sagemaker_session.upload_data(\n",
     "    path=\"./data/data.jsonl\", bucket=s3_bucket_name, key_prefix=input_prefix_sales\n",
     ")\n",
     "\n",
-    "# uploading spark-test-app.jar to S3\n",
     "sagemaker_session.upload_data(\n",
     "    path=\"./code/spark-test-app.jar\", bucket=s3_bucket_name, key_prefix=input_script_prefix\n",
     ")\n",
     "\n",
-    "\n",
+    "# initializing ProcessingInputs,ProcessingResources and AppSpecification configurations\n",
     "processing_input_code = ProcessingInput(input_name=\"code\",s3_input = ProcessingS3Input(\n",
     "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/code/spark-test-app.jar\",\n",
     "                                        #s3_uri=\"s3://sagemaker-us-east-1-774297356213/sm-spark-2024-08-30-05-25-18-294/input/code/preprocess.py\",\n",
@@ -844,11 +608,10 @@
     "                                        s3_input_mode=\"File\"\n",
     "                                        ))\n",
     "\n",
-    "# providing processing resources\n",
     "processing_resources = ProcessingResources(cluster_config=ProcessingClusterConfig\n",
     "                                           (instance_count=2,instance_type=\"ml.m5.xlarge\",volume_size_in_gb=30))\n",
     "\n",
-    "#providing app specification\n",
+    "\n",
     "app_specification = AppSpecification(image_uri = \"173754725891.dkr.ecr.us-east-1.amazonaws.com/sagemaker-spark-processing:3.1-cpu\",\n",
     "                                    container_entrypoint = [\"smspark-submit\",\n",
     "                                                            \"--class\",\n",
@@ -867,6 +630,7 @@
     "                            role_arn=role,\n",
     "                            processing_inputs=[processing_input_code,processing_input_pyfiles])\n",
     "\n",
+    "# waiting for the processing job to be completed\n",
     "processing_job_obj.wait()"
    ]
   },
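For orientation, a minimal sketch of how the shapes configured in this cell are assumed to feed into the `ProcessingJob` resource; the `create()` parameter names mirror the snake_case form of the CreateProcessingJob API, and anything not visible in the hunk (notably the exact signature and import path) is an assumption rather than part of the patch:

```python
from sagemaker_core.resources import ProcessingJob

# Assumed shape of the call that produces `processing_job_obj` in the cell above:
# the job name, role, inputs, cluster resources and Spark container spec are the
# objects built earlier in the same cell.
processing_job_obj = ProcessingJob.create(
    processing_job_name=final_job_name,
    role_arn=role,
    processing_inputs=[processing_input_code, processing_input_pyfiles],
    processing_resources=processing_resources,
    app_specification=app_specification,
)
processing_job_obj.wait()  # blocks until the job reaches a terminal status
```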
@@ -882,22 +646,14 @@
     "\n",
     "For more information on configuring your Spark application, see the EMR documentation on [Configuring Applications](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html)\n",
     "\n",
-    "**Below code is to create configuration.json file for overriding Spark executor memory/cores**"
+    "#### Creating configuration.json file for overriding Spark executor memory/cores "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 85,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting ./code/configuration.json\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%writefile ./code/configuration.json\n",
     "[{\"Classification\": \"spark-defaults\", \"Properties\": {\"spark.executor.memory\": \"2g\", \"spark.executor.cores\": \"1\"}}]"
@@ -905,62 +661,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 86,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "
[09/06/24 11:43:28] INFO     INFO:sagemaker_core.main.resources:Creating processing_job          resources.py:23658\n",
-       "                             resource.                                                                             \n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[09/06/24 11:43:28]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Creating processing_job \u001b]8;id=267427;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=778838;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23658\u001b\\\u001b[2m23658\u001b[0m\u001b]8;;\u001b\\\n", - "\u001b[2;36m \u001b[0m resource. \u001b[2m \u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7467eda716b64be098a868ba5d1b9b03", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
[09/06/24 11:48:31] INFO     INFO:sagemaker_core.main.resources:Final Resource Status: Completed resources.py:23850\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[2;36m[09/06/24 11:48:31]\u001b[0m\u001b[2;36m \u001b[0m\u001b[1;38;2;0;105;255mINFO \u001b[0m INFO:sagemaker_core.main.resources:Final Resource Status: \u001b[1mCompleted\u001b[0m \u001b]8;id=928893;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py\u001b\\\u001b[2mresources.py\u001b[0m\u001b]8;;\u001b\\\u001b[2m:\u001b[0m\u001b]8;id=46636;file:///opt/conda/lib/python3.10/site-packages/sagemaker_core/main/resources.py#23850\u001b\\\u001b[2m23850\u001b[0m\u001b]8;;\u001b\\\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from sagemaker_core.shapes import ProcessingInput,ProcessingResources,AppSpecification,ProcessingS3Input,ProcessingOutputConfig\n",
     "from sagemaker_core.shapes import ProcessingResources,ProcessingClusterConfig,ProcessingOutput,ProcessingS3Output\n",
@@ -972,12 +675,14 @@
     "input_prefix_abalone = \"{}/input/raw/abalone\".format(prefix)\n",
     "input_preprocessed_prefix_abalone = \"{}/input/preprocessed/abalone\".format(prefix)\n",
     "\n",
+    "#base job name\n",
     "base_job_name = \"sm-spark\"\n",
     "final_job_name = base_job_name + \"-\"+ timestamp_prefix\n",
     "\n",
     "input_script_prefix = \"{}/input/code\".format(final_job_name)\n",
     "input_conf_prefix = \"{}/input/conf\".format(final_job_name)\n",
     "\n",
+    "# uploading required data to S3 for reference\n",
     "sagemaker_session.upload_data(\n",
     "    path=\"./data/abalone.csv\", bucket=s3_bucket_name, key_prefix=input_prefix_abalone\n",
     ")\n",
@@ -990,7 +695,7 @@
     "    path=\"./code/configuration.json\", bucket=s3_bucket_name, key_prefix=input_conf_prefix\n",
     ")\n",
     "\n",
-    "\n",
+    "# initializing ProcessingInputs,ProcessingResources and AppSpecification configurations\n",
     "processing_input_code = ProcessingInput(input_name=\"code\",s3_input = ProcessingS3Input(\n",
     "                                        s3_uri = f\"s3://{s3_bucket_name}/{final_job_name}/input/code/preprocess.py\",\n",
     "                                        #s3_uri=\"s3://sagemaker-us-east-1-774297356213/sm-spark-2024-08-30-05-25-18-294/input/code/preprocess.py\",\n",
@@ -1030,6 +735,7 @@
     "                            role_arn=role,\n",
     "                            processing_inputs=[processing_input_code,processing_input_conf])\n",
     "\n",
+    "# waiting for the processing job to be completed\n",
     "processing_job_obj.wait()"
    ]
   }

From 033e5a4b2a8e4d54ba552ab7a5202f739176f928 Mon Sep 17 00:00:00 2001
From: Ayush Sharma 
Date: Thu, 12 Sep 2024 14:24:09 +0530
Subject: [PATCH 3/4] modified pyspark processing notebook

removed outputs, used the default S3 bucket and added additional comments for clarity

From cd25c80fa6e0c5b2a1947bbcee3b1714fcfe2c18 Mon Sep 17 00:00:00 2001
From: Ayush Sharma 
Date: Thu, 12 Sep 2024 14:27:40 +0530
Subject: [PATCH 4/4] uploaded modified spark-processing-sm-core.ipynb

removed outputs, used the default S3 bucket and added additional comments