diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml new file mode 100644 index 0000000..7492737 --- /dev/null +++ b/.github/workflows/docker-image.yml @@ -0,0 +1,52 @@ +name: Docker Image CI + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + + build: + + runs-on: ubuntu-latest + + steps: + + - name: Check Out Repo + uses: actions/checkout@v2 + + - name: Login to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v1 + + + + - name: Build and push cpu + id: docker_build_cpu + uses: docker/build-push-action@v2 + with: + context: ./ + file: ./NLP/Dockerfile.cpu + push: true + tags: torlof/nlp-cpu-docker-keras:latest + log-level: error + + - name: Build and push gpu + id: docker_build_gpu + uses: docker/build-push-action@v2 + with: + context: ./ + file: ./NLP/Dockerfile.gpu + push: true + tags: torlof/nlp-nvidia-docker-keras:latest + + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/Dockerfile.cpu b/Image/Dockerfile.cpu similarity index 100% rename from Dockerfile.cpu rename to Image/Dockerfile.cpu diff --git a/Dockerfile.gpu b/Image/Dockerfile.gpu similarity index 100% rename from Dockerfile.gpu rename to Image/Dockerfile.gpu diff --git a/src/requirements.txt b/Image/src/requirements.txt similarity index 100% rename from src/requirements.txt rename to Image/src/requirements.txt diff --git a/src/train.py b/Image/src/train.py similarity index 100% rename from src/train.py rename to Image/src/train.py diff --git a/NLP/Dockerfile.cpu b/NLP/Dockerfile.cpu new file mode 100644 index 0000000..4a00ccf --- /dev/null +++ b/NLP/Dockerfile.cpu @@ -0,0 +1,11 @@ +FROM tensorflow/tensorflow:2.3.1 + +COPY ./NLP/src /src + +WORKDIR /src + +RUN pip install -r requirements.txt + +ENV PYTHONPATH='/src/:$PYTHONPATH' + +CMD ["python", "train.py"] diff --git a/NLP/Dockerfile.gpu b/NLP/Dockerfile.gpu new file mode 100644 index 0000000..b363080 --- /dev/null +++ b/NLP/Dockerfile.gpu @@ -0,0 +1,11 @@ +FROM tensorflow/tensorflow:2.3.1-gpu + +COPY ./NLP/src /src + +WORKDIR /src + +RUN pip install -r requirements.txt + +ENV PYTHONPATH='/src/:$PYTHONPATH' + +CMD ["python", "train.py"] diff --git a/NLP/Dockerfile.tensorflow b/NLP/Dockerfile.tensorflow new file mode 100644 index 0000000..50a76f5 --- /dev/null +++ b/NLP/Dockerfile.tensorflow @@ -0,0 +1,62 @@ +FROM ubuntu:20.04 + +RUN apt update +RUN apt install -y python3-dev python3-pip python3-venv wget curl gnupg git sudo + +RUN ln -s /usr/bin/python3 /usr/bin/python && \ + ln -s /usr/bin/pip3 /usr/bin/pip + +ENV PYTHONPATH=/tensorflow/lib \ + PYTHON_ARG=/tensorflow/lib \ + USE_BAZEL_VERSION=3.3.0 \ + TF_NEED_CUDA=0 \ + TF_NEED_GCP=0 \ + TF_CUDA_COMPUTE_CAPABILITIES=5.2,3.5 \ + TF_NEED_HDFS=0 \ + TF_NEED_OPENCL=0 \ + TF_NEED_JEMALLOC=0 \ + TF_ENABLE_XLA=0 \ + TF_NEED_VERBS=0 \ + TF_CUDA_CLANG=0 \ + TF_DOWNLOAD_CLANG=0 \ + TF_NEED_MKL=0 \ + TF_DOWNLOAD_MKL=0 \ + TF_NEED_MPI=0 \ + TF_NEED_S3=1 \ + TF_NEED_KAFKA=0 \ + TF_NEED_GDR=0 \ + TF_NEED_OPENCL_SYCL=0 \ + TF_SET_ANDROID_WORKSPACE=0 \ + TF_NEED_AWS=0 \ + TF_NEED_IGNITE=0 \ + TF_NEED_ROCM=0 \ + GCC_HOST_COMPILER_PATH="/usr/bin/gcc" \ + CC_OPT_FLAGS="-march=native" + +RUN pip3 install pip six 'numpy<1.19.0' wheel setuptools mock 'future>=0.17.1' && \ + pip3 install keras_applications --no-deps && \ + pip3 install keras_preprocessing --no-deps + 
+RUN git clone https://github.com/tensorflow/tensorflow.git +WORKDIR /tensorflow +RUN git checkout r2.3 + +RUN curl -fsSL https://bazel.build/bazel-release.pub.gpg | gpg --dearmor > bazel.gpg +RUN mv bazel.gpg /etc/apt/trusted.gpg.d/ +RUN echo "deb [arch=amd64] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list +RUN apt update && apt install -y bazel-3.3.0 +RUN ln -s /usr/bin/bazel-3.3.0 /usr/bin/bazel + +RUN ./configure + +RUN bazel build -c opt \ + --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-mfpmath=both \ + --copt=-w \ + --jobs=26 \ + //tensorflow/tools/pip_package:build_pip_package + + +RUN ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg + +RUN pip install /tmp/tensorflow_pkg/tensorflow-2.3.2-cp38-cp38-linux_x86_64.whl + diff --git a/NLP/src/requirements.txt b/NLP/src/requirements.txt new file mode 100644 index 0000000..e69dc5c --- /dev/null +++ b/NLP/src/requirements.txt @@ -0,0 +1,4 @@ +numpy +tensorflow-datasets +tensorflow-text==2.3.0 +tf-models-official==2.3.0 \ No newline at end of file diff --git a/NLP/src/train.py b/NLP/src/train.py new file mode 100644 index 0000000..c930ee7 --- /dev/null +++ b/NLP/src/train.py @@ -0,0 +1,288 @@ +import time +import numpy as np +import string +import re +import os +import sys +import shutil +from typing import Dict, Tuple +import tensorflow as tf +from official.nlp import optimization +import tensorflow_hub as hub +import tensorflow_text as text + + +def print_devices() -> None: + """ + Print number of gpu devices to be used + """ + print("\n------------------------") + print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU'))) + print("------------------------\n") + + +def mlp(vocab_size : int, embedding_dim : int, max_length : int, no_classes : int) -> tf.keras.Model: + """ + Build multi-layer perceptron model + + :param int vocab_size: vocabulary size + :param int embedding_dim: embedding size + :param int max_length: maximal length of the padded sequence + :param int no_classes: number of classes / output layer size + :return: model object + :rtype: tf.keras.Model + """ + sequence_input = tf.keras.layers.Input(shape=(max_length,), dtype='int32', name="input0") + embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True) + x = embedding_layer(sequence_input) + x = tf.keras.layers.GlobalAveragePooling1D()(x) + x = tf.keras.layers.Dense(512, activation='relu')(x) + x = tf.keras.layers.Dense(512, activation='relu')(x) + output = tf.keras.layers.Dense(no_classes, activation='sigmoid', name="output0")(x) + model = tf.keras.Model(sequence_input, output) + model.compile(optimizer='adam', + loss=tf.keras.losses.BinaryCrossentropy(), + metrics=['accuracy']) + model.summary() + return model + + +def bert(train_ds : tf.data.Dataset, epochs : int, no_classes) -> tf.keras.Model: + """ + Build bert model + + :param tf.data.Dataset train_ds: training dataset + :param int epochs: no epochs + :param int no_classes: number of classes / output layer size + :return: model object + :rtype: tf.keras.Model + """ + tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1" + tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3" + + text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text') + preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing') + encoder_inputs = preprocessing_layer(text_input) 
+ encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder') + outputs = encoder(encoder_inputs) + net = outputs['pooled_output'] + x = tf.keras.layers.Dense(512, activation='relu')(net) + x = tf.keras.layers.Dense(512, activation='relu')(x) + output = tf.keras.layers.Dense(no_classes, activation='sigmoid', name="output0")(x) + model = tf.keras.Model(text_input, output) + loss = tf.keras.losses.BinaryCrossentropy() + metrics = tf.metrics.BinaryAccuracy() + steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy() + num_train_steps = steps_per_epoch * epochs + num_warmup_steps = int(0.1 * num_train_steps) + + init_lr = 3e-5 + optimizer = optimization.create_optimizer(init_lr=init_lr, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + optimizer_type='adamw') + model.compile(optimizer=optimizer, + loss=loss, + metrics=metrics) + model.summary() + return model + + +def custom_standardization(input_data): + """ + Standardize text: lowercase, strip HTML line breaks and remove punctuation + """ + lowercase = tf.strings.lower(input_data) + stripped_html = tf.strings.regex_replace(lowercase, '<br />
', ' ') + return tf.strings.regex_replace(stripped_html, + '[%s]' % re.escape(string.punctuation), '') + + +def get_data_from_aclImdb() -> tf.data.Dataset: + """ + Load aclImdb_v1 dataset from internet + + :return: dataset object + :rtype: tf.data.Dataset + """ + url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" + + dataset = tf.keras.utils.get_file("aclImdb_v1.tar.gz", url, + untar=True, cache_dir='.', + cache_subdir='') + + dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb') + train_dir = os.path.join(dataset_dir, 'train') + remove_dir = os.path.join(train_dir, 'unsup') + shutil.rmtree(remove_dir) + seed = 123 + raw_ds = tf.keras.preprocessing.text_dataset_from_directory('aclImdb/train', batch_size=50, seed=seed) + return raw_ds + + +def preprocess_mlp_text(dataset : tf.data.Dataset, parameter: Dict) -> np.ndarray: + """ + Perform tokenization for MLP model + + :param tf.data.Dataset dataset: dataset containing text and label data + :param Dict parameter: parameter object containing vocab_size and sequence_lenght parameter + :return: tokenized and padded text + :rtype: np.ndarray + """ + vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization( + standardize=custom_standardization, + max_tokens=parameter["vocab_size"], + output_mode='int', + output_sequence_length=parameter["sequence_length"]) + + def vectorize_text_func(text, label): + text = tf.expand_dims(text, -1) + return vectorize_layer(text), label + + text_ds = dataset.map(lambda x, y: x) + vectorize_layer.adapt(text_ds) + text_ds = dataset.map(vectorize_text_func) + + tokenized_texts = [] + for text, _ in text_ds: + tokenized_texts.append(text.numpy()) + + return np.vstack(tokenized_texts) + + +def preprocess_bert_text(dataset : tf.data.Dataset) -> np.ndarray: + """ + Perform tokenization for BERT model + + :param tf.data.Dataset dataset: dataset containing text and label data + :return: tokenized and padded text + :rtype: np.ndarray + """ + + text_ds = dataset.map(lambda x, y: x) + tokenized_texts = [] + for text in text_ds: + tokenized_texts += text.numpy().tolist() + + return np.vstack(tokenized_texts) + + +def prepare_dataset(text_data: np.ndarray, parameter: Dict, no_samples : int) -> tf.data.Dataset: + """ + To simulate multi-class problem, we randomly generate labels here + + :param np.ndarray text_data: text data + :param Dict parameter: parameter object + :param int no_samples: dataset size + :return: dataset object + :rtype: tf.data.Dataset + """ + print(f"\nDataset contains {no_samples} samples\n") + labels = np.random.randint(low=0, high=parameter["no_classes"], size=(no_samples)) + category_labels_mat = tf.keras.utils.to_categorical(labels, num_classes=parameter["no_classes"]) + + train_ds = tf.data.Dataset.from_tensor_slices( + (tf.convert_to_tensor(text_data), + tf.convert_to_tensor(category_labels_mat))) + train_ds = train_ds.shuffle(buffer_size=10000) + train_ds = train_ds.batch(parameter["batch_size"]).prefetch(tf.data.experimental.AUTOTUNE) + return train_ds + + +def run_mlp_test_track(train_ds: tf.data.Dataset, parameter: Dict) -> Tuple[float, float]: + """ + Perform training time and inference time test for multi-layer-perceptron model + + :param tf.data.Dataset train_ds: + :param Dict parameter: + :return: training and inference time + :rtype: Tuple of float + """ + # build model + print("create mlp model") + model = mlp(vocab_size=parameter["vocab_size"], embedding_dim=parameter["embedding_dim"], + max_length=parameter["sequence_length"], 
no_classes=parameter["no_classes"]) + print("complete") + # start training + print("start training") + start_time = time.time() + model.fit(train_ds, epochs=parameter["epochs"], verbose=2) + train_time = time.time() - start_time + print(f"complete in {train_time} [sec]") + # start batch inference + print("start inference test") + start_time = time.time() + model.predict(train_ds) + inference_time = (time.time() - start_time) / len(train_ds) + print(f"complete in {inference_time} [sec]") + return train_time, inference_time + + +def run_bert_test_track(train_ds: tf.data.Dataset, parameter: Dict) -> Tuple[float, float]: + """ + Perform training time and inference time test for bert model + + :param tf.data.Dataset train_ds: + :param Dict parameter: + :return: training and inference time + :rtype: Tuple of float + """ + # build model + print("create bert model") + model = bert(train_ds=train_ds, epochs=parameter["epochs"], no_classes=parameter["no_classes"]) + print("complete") + # start training + print("train model") + start_time = time.time() + model.fit(train_ds, epochs=parameter["epochs"], verbose=2) + train_time = time.time() - start_time + print(f"complete in {train_time} [sec]") + # start batch inference + print("start inference test") + start_time = time.time() + _ = model.predict(train_ds) + inference_time = (time.time() - start_time) / len(train_ds) + print(f"complete in {inference_time} [sec]") + return train_time, inference_time + + +def main(): + parameter = {"vocab_size": 80000, + "sequence_length": 150, + "embedding_dim": 100, + "batch_size": 128, + "epochs": 2} + + no_classes = 10000 + print_devices() + train_dataset = get_data_from_aclImdb() + mlp_text_data = preprocess_mlp_text(dataset=train_dataset, parameter=parameter) + bert_text_data = preprocess_bert_text(dataset=train_dataset) + print("Loaded and prepared dataset") + + results = {"mlp": {"batch_size": [], "training time": [], "inference time": []}, + "bert": {"batch_size": [], "training time": [], "inference time": []}} + + for batch_size in [128, 500, 1000, 5000, 10000]: + parameter["no_classes"] = no_classes + parameter["batch_size"] = batch_size + train_ds = prepare_dataset(text_data=mlp_text_data, parameter=parameter, no_samples=mlp_text_data.shape[0]) + runtimes = run_mlp_test_track(train_ds=train_ds, parameter=parameter) + results["mlp"]["batch_size"].append(batch_size) + results["mlp"]["training time"].append(runtimes[0]) + results["mlp"]["inference time"].append(runtimes[1]) + print(results) + + #train_ds = prepare_dataset(text_data=bert_text_data, parameter=parameter, no_samples=bert_text_data.shape[0]) + #runtimes = run_bert_test_track(train_ds=train_ds, parameter=parameter) + #results["bert"]["batch_size"].append(batch_size) + #results["bert"]["training time"].append(runtimes[0]) + #results["bert"]["inference time"].append(runtimes[1]) + + + + +if __name__ == '__main__': + main() diff --git a/README.md b/README.md index 3ea70bc..67bf631 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,15 @@ # Train Neural Networks on Amazon EC2 with GPU support -Workflow that shows how to train neural networks on EC2 instances with GPU support. The goal is to present a simple and stable setup to train on GPU instances by using **Docker** and the NVIDIA Container Runtime **nvidia-docker**. A minimal example is given to train a small CNN built in Keras on MNIST. We achieve a 30-fold speedup in training time when training on GPU versus CPU. 
+Workflow that shows how to train neural networks on EC2 instances with GPU support. +The goal is to present a simple and stable setup to train on GPU instances by using **Docker** and the NVIDIA Container +Runtime **nvidia-docker**. + +Two minimal examples are given to train: + - a small CNN built in Keras on the MNIST image dataset + - a multi-layer perceptron (MLP) and a BERT model on the [aclImdb](https://ai.stanford.edu/~amaas/data/sentiment) sentiment dataset for NLP + +It is shown that up to a 30-fold speedup in training time can be achieved when training on GPU versus CPU, **but** the speedup +strongly depends on the applied model. E.g. for the MLP, using CPU instances can be preferable. ## Getting started @@ -14,14 +23,16 @@ Workflow that shows how to train neural networks on EC2 instances with GPU support ## Train locally on CPU -1. Build Docker image for CPU +1. Build Docker images for CPU ``` -docker build -t docker-keras . -f Dockerfile.cpu +docker build -t docker-keras-image . -f Image/Dockerfile.cpu +docker build -t docker-keras-nlp . -f NLP/Dockerfile.cpu ``` 2. Run training container (**NB:** you might have to increase the container resources [[link](https://docs.docker.com/config/containers/resource_constraints/)]) ``` -docker run docker-keras +docker run docker-keras-image +docker run docker-keras-nlp ``` @@ -39,46 +50,47 @@ For example, to launch a **p2.xlarge** EC2 instance named **ec2-p2** with a Tesla K80 GPU ``` docker-machine create --driver amazonec2 \ - --amazonec2-region eu-west-1 \ - --amazonec2-ami ami-58d7e821 \ + --amazonec2-region eu-central-1 \ + --amazonec2-ami ami-0ae9bf04fb7c502ea \ --amazonec2-instance-type p2.xlarge \ - --amazonec2-vpc-id vpc-abc \ - ec2-p2 + --amazonec2-vpc-id vpc-b0ec4fda \ + --amazonec2-root-size 100 \ + ec2-gpu +``` +``` +docker-machine create --driver amazonec2 \ + --amazonec2-region eu-central-1 \ + --amazonec2-ami ami-0ae9bf04fb7c502ea \ + --amazonec2-instance-type c5n.xlarge \ + --amazonec2-root-size 100 \ + --amazonec2-vpc-id vpc-b0ec4fda \ + ec2-cpu ``` - 3. ssh into instance ``` -docker-machine ssh ec2-p2 +docker-machine ssh ec2-cpu  # or: docker-machine ssh ec2-gpu ``` -4. Update NVIDIA drivers and install **nvidia-docker** (see this [blog post](https://towardsdatascience.com/using-docker-to-set-up-a-deep-learning-environment-on-aws-6af37a78c551) for more details) +4. Run training container on CPU instance ``` -# update NVIDIA drivers -sudo add-apt-repository ppa:graphics-drivers/ppa -y -sudo apt-get update -sudo apt-get install -y nvidia-375 nvidia-settings nvidia-modprobe - -# install nvidia-docker -wget -P /tmp https://github.com/NVIDIA/nvidia-docker/releases/download/v1.0.1/nvidia-docker_1.0.1-1_amd64.deb -sudo dpkg -i /tmp/nvidia-docker_1.0.1-1_amd64.deb && rm /tmp/nvidia-docker_1.0.1-1_amd64.deb +sudo docker run torlof/nlp-cpu-docker-keras ``` - 5. Run training container on GPU instance ``` -sudo nvidia-docker run idealo/nvidia-docker-keras +sudo nvidia-docker run torlof/nlp-nvidia-docker-keras ``` -This will pull the Docker image `idealo/nvidia-docker-keras` from [DockerHub](https://hub.docker.com/r/idealo/nvidia-docker-keras) and start the training. -The corresponding Dockerfile can be found under `Dockerfile.gpu` for reference. - +This will pull the Docker image from [Dockerhub](https://hub.docker.com/r/torlof/nlp-nvidia-docker-keras) and start the training. +The corresponding image for the image classification example is `idealo/nvidia-docker-keras` and can be found +at [Idealo DockerHub](https://hub.docker.com/r/idealo/nvidia-docker-keras). 
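Before kicking off a long run it can help to verify that TensorFlow inside the container actually sees the GPU; a minimal sketch (assuming the TensorFlow 2.3 base image used by `NLP/Dockerfile.gpu`, started e.g. with `sudo nvidia-docker run -it torlof/nlp-nvidia-docker-keras python`):

```python
# Quick sanity check: list the GPUs TensorFlow can see inside the container.
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
print(f"Num GPUs visible to TensorFlow: {len(gpus)}")
for gpu in gpus:
    print(gpu)
```

If no GPU is listed, the NVIDIA runtime is not active and training will silently fall back to CPU.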
-## Training time comparison +## Image classification example +### Training time comparison We trained MNIST for 3 epochs (~98% accuracy on validation set): @@ -88,6 +100,44 @@ We trained MNIST for 3 epochs (~98% accuracy on validation set): • p3.2xlarge (Tesla V100): **20 seconds** +## Text classification example +For the NLP use-case we trained on the aclImdb dataset. + +We want to test two text classification architectures: + - a large BERT model + - a lightweight MLP with a rather simple embedding table + +While BERT is known for good performance on various tasks, the lightweight MLP has its justification as it combines +computational efficiency with convincing performance. + +In addition we want to investigate the runtime behaviour for low-dimensional output (2 to 10 classes) and +large-scale (up to 20000 classes) cases. + +Since aclImdb is a binary classification problem, we extend the dataset for multi-label classification by randomly +relabeling the existing samples. (Note that this makes the accuracy metrics useless.) Our focus is on runtime comparison. + +We have captured the training and inference runtime. Finally, we are interested in which instance provides the +best cost efficiency, therefore we also captured the cost for training and inference. + +### Training Runtime and Pricing + + + + + + +You can find these plots as interactive HTML plots in the **./plots** directory. + +### Inference Runtime and Pricing + + + + + + + + +You can find these plots as interactive HTML plots in the **./plots** directory. ## Copyright diff --git a/plots/Inference_BERT_Model__Price_.html b/plots/Inference_BERT_Model__Price_.html new file mode 100644 index 0000000..49c3047 --- /dev/null +++ b/plots/Inference_BERT_Model__Price_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Inference_BERT_Model__Price_.png b/plots/Inference_BERT_Model__Price_.png new file mode 100644 index 0000000..005bbe1 Binary files /dev/null and b/plots/Inference_BERT_Model__Price_.png differ diff --git a/plots/Inference_BERT_Model__Runtime_.html b/plots/Inference_BERT_Model__Runtime_.html new file mode 100644 index 0000000..d02d7e4 --- /dev/null +++ b/plots/Inference_BERT_Model__Runtime_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Inference_BERT_Model__Runtime_.png b/plots/Inference_BERT_Model__Runtime_.png new file mode 100644 index 0000000..4c9d01b Binary files /dev/null and b/plots/Inference_BERT_Model__Runtime_.png differ diff --git a/plots/Inference_BERT_Model__Runtime_vs_Costs_.html b/plots/Inference_BERT_Model__Runtime_vs_Costs_.html new file mode 100644 index 0000000..daeeab7 --- /dev/null +++ b/plots/Inference_BERT_Model__Runtime_vs_Costs_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Inference_BERT_Model__Runtime_vs_Costs_.png b/plots/Inference_BERT_Model__Runtime_vs_Costs_.png new file mode 100644 index 0000000..074f955 Binary files /dev/null and b/plots/Inference_BERT_Model__Runtime_vs_Costs_.png differ diff --git a/plots/Inference_Multilayer_Perceptron_Model__Price_.html b/plots/Inference_Multilayer_Perceptron_Model__Price_.html new file mode 100644 index 0000000..c6c9d80 --- /dev/null +++ b/plots/Inference_Multilayer_Perceptron_Model__Price_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Inference_Multilayer_Perceptron_Model__Price_.png b/plots/Inference_Multilayer_Perceptron_Model__Price_.png new file mode 100644 index 0000000..413cd0b Binary files /dev/null and b/plots/Inference_Multilayer_Perceptron_Model__Price_.png differ diff --git a/plots/Inference_Multilayer_Perceptron_Model__Runtime_.html b/plots/Inference_Multilayer_Perceptron_Model__Runtime_.html new file mode 100644 index 0000000..66ff9c5 --- /dev/null +++ b/plots/Inference_Multilayer_Perceptron_Model__Runtime_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Inference_Multilayer_Perceptron_Model__Runtime_.png b/plots/Inference_Multilayer_Perceptron_Model__Runtime_.png new file mode 100644 index 0000000..592455a Binary files /dev/null and b/plots/Inference_Multilayer_Perceptron_Model__Runtime_.png differ diff --git a/plots/Inference_Multilayer_Perceptron_Model__Runtime_vs_Costs_.html b/plots/Inference_Multilayer_Perceptron_Model__Runtime_vs_Costs_.html new file mode 100644 index 0000000..b278909 --- /dev/null +++ b/plots/Inference_Multilayer_Perceptron_Model__Runtime_vs_Costs_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Inference_Multilayer_Perceptron_Model__Runtime_vs_Costs_.png b/plots/Inference_Multilayer_Perceptron_Model__Runtime_vs_Costs_.png new file mode 100644 index 0000000..a1c3f0a Binary files /dev/null and b/plots/Inference_Multilayer_Perceptron_Model__Runtime_vs_Costs_.png differ diff --git a/plots/Training_BERT_Model__Price_.html b/plots/Training_BERT_Model__Price_.html new file mode 100644 index 0000000..b56474f --- /dev/null +++ b/plots/Training_BERT_Model__Price_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Training_BERT_Model__Price_.png b/plots/Training_BERT_Model__Price_.png new file mode 100644 index 0000000..3880ae8 Binary files /dev/null and b/plots/Training_BERT_Model__Price_.png differ diff --git a/plots/Training_BERT_Model__Runtime_.html b/plots/Training_BERT_Model__Runtime_.html new file mode 100644 index 0000000..7b267c3 --- /dev/null +++ b/plots/Training_BERT_Model__Runtime_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Training_BERT_Model__Runtime_.png b/plots/Training_BERT_Model__Runtime_.png new file mode 100644 index 0000000..fafdce6 Binary files /dev/null and b/plots/Training_BERT_Model__Runtime_.png differ diff --git a/plots/Training_BERT_Model__Runtime_vs_Costs_.html b/plots/Training_BERT_Model__Runtime_vs_Costs_.html new file mode 100644 index 0000000..ad37eaf --- /dev/null +++ b/plots/Training_BERT_Model__Runtime_vs_Costs_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Training_BERT_Model__Runtime_vs_Costs_.png b/plots/Training_BERT_Model__Runtime_vs_Costs_.png new file mode 100644 index 0000000..64ca84e Binary files /dev/null and b/plots/Training_BERT_Model__Runtime_vs_Costs_.png differ diff --git a/plots/Training_Multilayer_Perceptron_Model__Price_.html b/plots/Training_Multilayer_Perceptron_Model__Price_.html new file mode 100644 index 0000000..fdde673 --- /dev/null +++ b/plots/Training_Multilayer_Perceptron_Model__Price_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Training_Multilayer_Perceptron_Model__Price_.png b/plots/Training_Multilayer_Perceptron_Model__Price_.png new file mode 100644 index 0000000..7559ac9 Binary files /dev/null and b/plots/Training_Multilayer_Perceptron_Model__Price_.png differ diff --git a/plots/Training_Multilayer_Perceptron_Model__Runtime_.html b/plots/Training_Multilayer_Perceptron_Model__Runtime_.html new file mode 100644 index 0000000..f56b190 --- /dev/null +++ b/plots/Training_Multilayer_Perceptron_Model__Runtime_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Training_Multilayer_Perceptron_Model__Runtime_.png b/plots/Training_Multilayer_Perceptron_Model__Runtime_.png new file mode 100644 index 0000000..e3bcbf6 Binary files /dev/null and b/plots/Training_Multilayer_Perceptron_Model__Runtime_.png differ diff --git a/plots/Training_Multilayer_Perceptron_Model__Runtime_vs_Costs_.html b/plots/Training_Multilayer_Perceptron_Model__Runtime_vs_Costs_.html new file mode 100644 index 0000000..efbffd0 --- /dev/null +++ b/plots/Training_Multilayer_Perceptron_Model__Runtime_vs_Costs_.html @@ -0,0 +1,85 @@ + + + +
+ + + +
+ +
+ + \ No newline at end of file diff --git a/plots/Training_Multilayer_Perceptron_Model__Runtime_vs_Costs_.png b/plots/Training_Multilayer_Perceptron_Model__Runtime_vs_Costs_.png new file mode 100644 index 0000000..2e9303d Binary files /dev/null and b/plots/Training_Multilayer_Perceptron_Model__Runtime_vs_Costs_.png differ diff --git a/plots/make_plots.py b/plots/make_plots.py new file mode 100644 index 0000000..b08f390 --- /dev/null +++ b/plots/make_plots.py @@ -0,0 +1,276 @@ +import numpy as np +import plotly.express as px +cpu_result = { + "c5n.18xlarge": + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [13.540891170501709, 12.476744413375854, 11.147228717803955, 17.353014707565308, 21.962525606155396], + 'inference time': [0.0021244105027646435, 0.0019159694107211366, 0.0020370142800467356, 0.01768301214490618, 0.031411869185311456]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [1323.5947971343994, 1315.2153856754303, 1324.9581191539764, 1343.9357023239136, 1361.5333795547485], + 'inference time': [1.105727233448807, 1.0654324025523907, 1.0785843681315987, 1.1530808222537139, 1.1640879536161617]}, + "price": 4.428}, + "c5n.9xlarge" : + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [11.189261436462402, 9.973739385604858, 8.801320791244507, 16.157137632369995, 21.00126004219055], + 'inference time': [0.0016932609129925163, 0.001653958340080417, 0.0017271272990168358, 0.0160597453311998, 0.03037759479211301]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [1274.8850712776184, 1252.0746772289276, 1245.7388393878937, 1254.435753583908, 1267.532795906067], + 'inference time': [1.0891822272417497, 1.0944765854854972, 1.0928822792306239, 1.1291938995828434, 1.1484798740367501]}, + "price": 2.214}, + "c5n.4xlarge" : + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [17.158438682556152, 13.947082281112671, 15.241694211959839, 20.412732362747192, 27.380159616470337], + 'inference time': [0.0019228640867739307, 0.0019206051923790757, 0.0019435675776734644, 0.01729791140069767, 0.032036843348522574]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [1637.8760719299316, 1620.9459342956543, 1620.8361439704895, 1633.9754033088684, 1637.9916734695435], + 'inference time': [1.4184645949577799, 1.4048790737074248, 1.3930077564959624, 1.4117846683580049, 1.4288307124254656]}, + "price" : 0.984}, + + "c5n.2xlarge" : + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [14.361419200897217, 13.354588985443115, 13.591758489608765, 23.872464418411255, 37.43416905403137], + 'inference time': [0.002182399740024489, 0.0022328167545552155, 0.002277036102450624, 0.019505660144650206, 0.041018303559750925]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [2447.5564823150635, 2455.2421414852142, 2459.3994550704956, 2476.5737640857697, 2503.035465478897], + 'inference time': [1.9077403946798674, 1.9332553440210771, 1.948980034614096, 2.001873848389606, 2.013627045008601]}, + "price" : 0.492}, + + "m5.2xlarge" : + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [21.01449203491211, 14.700647830963135, 15.15757966041565, 27.494479656219482, 39.67811942100525], + 'inference time': [0.002862286810972253, 0.002527393856827094, 0.0025834514170276876, 0.021360832817700445, 0.03851592784025231]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [2731.6758856773376, 
2757.0264995098114, 2768.441976070404, 2793.7750267982483, 2828.0964863300323], + 'inference time': [2.1820361796690495, 2.1762729554760214, 2.198035161105954, 2.296724497055521, 2.3419864579122893]}, + "price" : 0.46}, + "m5.4xlarge": + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [12.471782684326172, 10.90599536895752, 10.597107887268066, 19.06341576576233, 27.280228853225708], + 'inference time': [0.0020289056155146385, 0.0020649542613905302, 0.0020645114840293416, 0.018404644362780512, 0.03332033084363353]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [1813.6761920452118, 1794.9994223117828, 1794.1460082530975, 1826.1661627292633, 1843.6001889705658], + 'inference time': [1.5287325637681144, 1.5295593957511746, 1.522513641386616, 1.5916012574215324, 1.6103330205897897]}, + "price": 0.92}, + "m5.8xlarge": + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [10.512910604476929, 9.75018835067749, 9.38976502418518, 15.95421051979065, 21.958120584487915], + 'inference time': [0.001863421226034359, 0.0018840760600810148, 0.0019181687004712162, 0.017100306189790065, 0.032234095797246815]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [1418.350023984909, 1392.7381534576416, 1392.2020711898804, 1406.4629576206207, 1411.3244981765747], + 'inference time': [1.237773832009763, 1.220512145636033, 1.2142973785497704, 1.2685545622086039, 1.2907001303166759]}, + "price": 1.84}, + "m5.12xlarge": + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [9.613126993179321, 10.028851747512817, 9.355329275131226, 15.407365560531616, 20.388332843780518], + 'inference time': [0.001749417003320188, 0.0017537936872365524, 0.0017739400571706344, 0.016351620761715636, 0.030961886960632946]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [1266.4482853412628, 1255.6161534786224, 1258.1703221797943, 1271.8952269554138, 1285.798241853714], + 'inference time': [1.0944540257356605, 1.0703771199498857, 1.079936146736145, 1.1171560068519748, 1.1532167816648677]}, + "price": 2.76}, + +"g3.4xlarge": + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [32.5131995677948, 19.993014574050903, 19.49515128135681, 24.664000511169434, 30.96623969078064], + 'inference time': [0.004747893129076276, 0.003015046217003647, 0.0029536595149916045, 0.023322056750861967, 0.04413742556863902]}, + 'bert': {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [291.1605463027954, 282.62183809280396, 283.8296422958374, 284.0868787765503, 290.077486038208], + 'inference time': [0.2634515287924786, 0.2599753190060051, 0.25928513249572444, 0.2771172231557418, 0.2983498244869466]}, + "price": 1.425 }, +"g4dn.2xlarge": + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [18.047739505767822, 16.740589380264282, 16.48866629600525, 21.18340301513672, 24.423452138900757], + 'inference time': [0.0027235089516153142, 0.0026681289380910446, 0.002651785101209368, 0.019234569705262477, 0.03599388867008443]}, + 'bert': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [289.02646255493164, 297.60623145103455, 296.0161476135254, 299.6972622871399, 304.22694396972656], + 'inference time': [0.26689156099241607, 0.2696997730099425, 0.268976198167217, 0.2827193955985867, 0.30081511395318167]}, + "price": 0.94 + }, + +"p2.large": + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [22.68250060081482, 21.693998098373413, 
21.162556648254395, 27.44784450531006, 36.32786679267883], + 'inference time': [0.003396384570063377, 0.003343067607101129, 0.0033055738526947646, 0.02271166383003702, 0.042332296468773664]}, + 'bert': {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [497.88060331344604, 500.3348813056946, 503.30378127098083, 509.1608729362488, 510.04041266441345], + 'inference time': [0.45726126918987353, 0.49957541665252375, 0.454925858244604, 0.4603731084843071, 0.4907277056149074]}, + "price" : 1.326}, + +"p3.2xlarge": + {'mlp': + {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [31.207317352294922, 17.045449256896973, 17.23259949684143, 20.84071135520935, 24.6668918132782], + 'inference time': [0.006852999025461625, 0.0026244095393589567, 0.0026388253484453473, 0.02073337593857123, 0.040009379386901855]}, + 'bert': {'no classes': [2, 50, 100, 10000, 20000], + 'training time': [159.59198307991028, 144.85900735855103, 142.45676064491272, 149.44550561904907, 153.42596101760864], + 'inference time': [0.16683880893551573, 0.1748491002588856, 0.17342416850887998, 0.18658071026510123, 0.2051472955820512]}, + "price" : 3.823}, +} + +import plotly.graph_objects as go + +def _get_color(n: int) -> str: + color_lst = px.colors.qualitative.Set1 + return color_lst[n % len(color_lst)] + +def plot(cpu_result, model_name, method, show_price, title, x_axis, y_axis): + + fig = go.Figure() + for n, cpu_name in enumerate(["c5n.2xlarge", "c5n.4xlarge","c5n.9xlarge","c5n.18xlarge"]): + item = cpu_result[cpu_name][model_name] + no_x = len(item[method]) + y = np.array(item[method]) + if show_price: + y = y * cpu_result[cpu_name]["price"] / (60 * 60) + x = y * cpu_result[cpu_name]["price"] / (60 * 60) + fig.add_trace( + go.Scatter(x=x, + y=y, + name=cpu_name, + mode="markers", + marker=dict(size=[8+n*3] * no_x, + color=[_get_color(1)] * no_x))) + + + for n, cpu_name in enumerate(["m5.2xlarge", "m5.4xlarge", "m5.8xlarge", "m5.12xlarge"]): + item = cpu_result[cpu_name][model_name] + y = np.array(item[method]) + if show_price: + y = y * cpu_result[cpu_name]["price"] / (60 * 60) + fig.add_trace( + go.Scatter(x=x, + y=y, + name=cpu_name, + mode="markers", + marker=dict(size=[8+n*3] * no_x, + color=[_get_color(2)] * no_x))) + #hovertemplate= 'id: %{y}' +'%{text}',text=text)) + + for n, cpu_name in enumerate(["g3.4xlarge", "p2.large", "g4dn.2xlarge", "p3.2xlarge"]): + item = cpu_result[cpu_name][model_name] + y = np.array(item[method]) + if show_price: + y = y * cpu_result[cpu_name]["price"] / (60 * 60) + fig.add_trace( + go.Scatter(x=x, + y=y, + name=cpu_name, + mode="markers", + marker=dict( + size=[8+n*3] * no_x, + color=[_get_color(0)] * no_x + ))) + + fig.update_xaxes(title=x_axis) + fig.update_yaxes(title=y_axis) + fig.update_layout( + title_text=title, + width=500, height=500, + xaxis = dict( + tickmode='array', + tickvals=x, + ticktext=item["no classes"])) + fig.show() + filename = title.replace(' ', '_').replace('[', '_').replace("]", "_") + fig.write_html(file=f"{filename}.html", auto_open=False) + +def plot_scatter(cpu_result, model_name, method, title, x_axis, y_axis): + + fig = go.Figure() + for n, cpu_name in enumerate(["c5n.2xlarge", "c5n.4xlarge","c5n.9xlarge","c5n.18xlarge"]): + item = cpu_result[cpu_name][model_name] + no_x = len(item[method]) + y = np.array(item[method]) + x = y * cpu_result[cpu_name]["price"] / (60 * 60) + fig.add_trace( + go.Scatter(x=x, + y=y, + name=cpu_name, + mode="markers", + marker=dict(size=[8+n*3] * no_x, + color=[_get_color(1)] * no_x))) + + + for n, cpu_name in 
enumerate(["m5.2xlarge", "m5.4xlarge", "m5.8xlarge", "m5.12xlarge"]): + item = cpu_result[cpu_name][model_name] + y = np.array(item[method]) + x = y * cpu_result[cpu_name]["price"] / (60 * 60) + fig.add_trace( + go.Scatter(x=x, + y=y, + name=cpu_name, + mode="markers", + marker=dict(size=[8+n*3] * no_x, + color=[_get_color(2)] * no_x))) + #hovertemplate= 'id: %{y}' +'%{text}',text=text)) + + for n, cpu_name in enumerate(["g3.4xlarge", "p2.large", "g4dn.2xlarge", "p3.2xlarge"]): + item = cpu_result[cpu_name][model_name] + y = np.array(item[method]) + x = y * cpu_result[cpu_name]["price"] / (60 * 60) + fig.add_trace( + go.Scatter(x=x, + y=y, + name=cpu_name, + mode="markers", + marker=dict( + size=[8+n*3] * no_x, + color=[_get_color(0)] * no_x + ))) + + fig.update_xaxes(title=x_axis) + fig.update_yaxes(title=y_axis) + fig.update_layout( + title_text=title, + width=600, height=600) + fig.show() + filename = title.replace(' ', '_').replace('[', '_').replace("]", "_") + fig.write_html(file=f"{filename}.html", auto_open=False) + +plot(cpu_result, "mlp", show_price=False, y_axis="runtime [sec]", x_axis="no. classes", method="training time", + title = "Training Multilayer Perceptron Model [Runtime]") +plot(cpu_result, "bert", show_price=False, y_axis="runtime [sec]", x_axis="no. classes", method="training time", + title = "Training BERT Model [Runtime]") +plot(cpu_result, "mlp", show_price=True, y_axis="price [usd]", x_axis="no. classes", method="training time", + title = "Training Multilayer Perceptron Model [Price]") +plot(cpu_result, "bert", show_price=True, y_axis="price [usd]", x_axis="no. classes", method="training time", + title="Training BERT Model [Price]") + +plot(cpu_result, "mlp", show_price=False, y_axis="runtime [sec]", x_axis="no. classes", method="inference time", + title = "Inference Multilayer Perceptron Model [Runtime]") +plot(cpu_result, "bert", show_price=False, y_axis="runtime [sec]", x_axis="no. classes", method="met", + title = "Inference BERT Model [Runtime]") +plot(cpu_result, "mlp", show_price=True, y_axis="price [usd]", x_axis="no. classes", method="inference time", + title = "Inference Multilayer Perceptron Model [Price]") +plot(cpu_result, "bert", show_price=True, y_axis="price [usd]", x_axis="no. classes", method="inference time", + title="Inference BERT Model [Price]") + + +plot_scatter(cpu_result, "mlp", y_axis="runtime [sec]", x_axis="costs [usd]", method="training time", + title = "Training Multilayer Perceptron Model [Runtime vs Costs]") +plot_scatter(cpu_result, "bert", y_axis="runtime [sec]", x_axis="costs [usd]", method="training time", + title = "Training BERT Model [Runtime vs Costs]") + +plot_scatter(cpu_result, "mlp", y_axis="runtime [sec]", x_axis="costs [usd]", method="inference time", + title = "Inference Multilayer Perceptron Model [Runtime vs Costs]") +plot_scatter(cpu_result, "bert", y_axis="runtime [sec]", x_axis="costs [usd]", method="inference time", + title = "Inference BERT Model [Runtime vs Costs]")