diff --git a/README.md b/README.md index bf3ed816..f8e9a533 100644 --- a/README.md +++ b/README.md @@ -10,127 +10,125 @@ neural network (GNN) that predicts an association to a target molecule, e.g., a DeepFPlearn+ is an extension of deepFPlearn[[2]](#2), which uses binary fingerprints to represent the molecule's structure computationally. -## Setting up Python environment +## Installation The DFPL package requires a particular Python environment to work properly. It consists of a recent Python interpreter and packages for data-science and neural networks. -The exact dependencies can be found in the -[`requirements.txt`](requirements.txt) (which is used when installing the package with pip) -and [`environment.yml`](environment.yml) (for installation with conda). You have several ways to provide the correct environment to run code from the DFPL package. -1. Use the automatically built docker/Singularity containers -2. Build your own container [following the steps here](container/README.md) -3. Setup a python virtual environment -4. Set up a conda environment install the requirements via conda and the DFPL package via pip +1. Use conda (Bioconda) to install the package +2. Set up a Python virtual environment +3. Use the automatically built Docker containers +4. Use the automatically built Singularity containers -In the following, you find details for option 1., 3., and 4. +### Conda (Bioconda) + +The package is also available on Bioconda. You can find the [Bioconda recipe here](http://bioconda.github.io/recipes/deepfplearn/README.html). + +First, create an environment with the following command: + +```shell +conda create --override-channels --channel conda-forge --channel bioconda -n dfpl python=3.8 deepfplearn +``` + +If you have a GPU available, you can additionally install the tensorflow-gpu package: + +```shell +conda create --override-channels --channel conda-forge --channel bioconda -n dfpl python=3.8 deepfplearn tensorflow-gpu==2.9.3 +``` + +Then activate the environment: + +```shell +conda activate dfpl +``` + +### Set up DFPL in a Python virtual environment + +From within the `deepFPlearn` directory, call + +```shell +virtualenv -p python3 ENV_PATH +. ENV_PATH/bin/activate +pip install ./ +``` + +Replace `ENV_PATH` with the directory in which the Python virtual environment should be created. +If your system only has Python 3 installed, `-p python3` may be omitted. + +In order to use the environment, it needs to be activated with `. ENV_PATH/bin/activate`.
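Either way, a quick sanity check is to call the `dfpl` entry point and display its help text (the exact list of subcommands may vary between versions):

```shell
conda activate dfpl      # or: . ENV_PATH/bin/activate  for the virtualenv setup
dfpl --help              # should list the available subcommands, e.g. train, predict, traingnn, predictgnn
```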
### Docker container -You need docker installed on you machine. +You need Docker installed on your machine. If you don't have it installed yet, you can find the installation +instructions [here](https://docs.docker.com/engine/install/). -In order to run DFPL use the following command line +To run DFPL, first pull the image using the following command: ```shell -docker run --gpus GPU_REQUEST registry.hzdr.de/department-computational-biology/deepfplearn/deepfplearn:TAG dfpl DFPL_ARGS +docker pull quay.io/biocontainers/deepfplearn:TAG +``` +Then run the container, mounting the directory that contains the data you want to process: + +```shell +docker run -v /path/to/local/data:/data quay.io/biocontainers/deepfplearn:TAG dfpl DFPL_ARGS +``` +If your DFPL arguments do not reference any local files, the mount can be omitted: + +```shell +docker run quay.io/biocontainers/deepfplearn:TAG dfpl DFPL_ARGS ``` where you replace -- `TAG` by the version you want to use or `latest` if you want to use latest available version) -- You can see available tags - here https://gitlab.hzdr.de/department-computational-biology/deepfplearn/container_registry/5827. +- `TAG` with the version you want to use +- You can see available tags on [BioContainers](https://biocontainers.pro/tools/deepfplearn). In general a container should be available for each released version of DFPL. -- `GPU_REQUEST` by the GPUs you want to use or `all` if all GPUs should be used (remove `--gpus GPU_REQUEST` if only the - CPU should) - `DFPL_ARGS` by arguments that should be passed to DFPL (use `--help` to see available options) In order to get an interactive bash shell in the container use: ```shell -docker run -it --gpus GPU_REQUEST registry.hzdr.de/department-computational-biology/deepfplearn/deepfplearn:TAG bash +docker run -it quay.io/biocontainers/deepfplearn:TAG bash ``` + ### Singularity container -You need Singularity installed on your machine. You can download a container with +You need Singularity installed on your machine. You can find the installation instructions +[here](https://apptainer.org/user-docs/master/quick_start.html). ```shell -singularity pull dfpl.TAG.sif docker://registry.hzdr.de/department-computational-biology/deepfplearn/deepfplearn:TAG +singularity pull dfpl.TAG.sif docker://quay.io/biocontainers/deepfplearn:TAG ``` -- replace `TAG` by the version you want to use or `latest` if you want to use latest available version) +- replace `TAG` with the version you want to use - You can see available tags - here https://gitlab.hzdr.de/department-computational-biology/deepfplearn/container_registry/5827. - In general a container should be available for each released version of DFPL. + [here](https://biocontainers.pro/tools/deepfplearn). This stores the container as a file `dfpl.TAG.sif` which can be run as follows: ```shell script -singularity run --nv dfpl.TAG.sif dfpl DFPL_ARGS +singularity run dfpl.TAG.sif dfpl DFPL_ARGS ``` - replace `DFPL_ARGS` by arguments that should be passed to DFPL (use `--help` to see available options) -- omit the `--nv` tag if you don't want to use GPUs or you can start a shell script (look at [run-all-cases.sh](scripts/run-all-cases.sh) for an example) -```shell script -singularity run --nv dfpl.sif ". ./example/run-multiple-cases.sh" -``` - It's also possible to get an interactive shell into the container ```shell script -singularity shell --nv dfpl.TAG.sif +singularity shell dfpl.TAG.sif ``` **Note:** The Singularity container is intended to be used on HPC cluster where your ability to install software might be limited. -For local testing or development, setting up the conda environment is preferable.
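As a concrete illustration of the container workflow on an HPC system, the following sketch pulls a pinned release, bind-mounts a local data directory, and starts a training run; the tag, the bind path, and the JSON config are placeholders that need to be adapted to your setup:

```shell
# pull a specific release of the DFPL container
singularity pull dfpl.TAG.sif docker://quay.io/biocontainers/deepfplearn:TAG

# make the data directory visible inside the container and start a training run
singularity run -B /path/to/data:/data dfpl.TAG.sif dfpl train -f /data/train.json
```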
- -### Set up DFPL in a python virtual environment - -From within the `deepFPlearn` directory call - -``` -virtualenv -p python3 ENV_PATH -. ENV_PATH/bin/activate -pip install ./ -``` - -replace `ENV_PATH` by the directory where the python virtual environment should be created. -If your system has only python3 installed `-p python3` may be removed. - -In order to use the environment it needs to be activated with `. ENV_PATH/bin/activate`. - -### Set up DFPL in a conda environment - -To use this tool in a conda environment: - -1. Create the conda env from scratch - - From within the `deepFPlearn` directory, you can create the conda environment with the provided yaml file that - contains all information and necessary packages - - ```shell - conda env create -f environment.yml - ``` - -2. Activate the `dfpl_env` environment with - - ```shell - conda activate dfpl_env - ``` - -3. Install the local `dfpl` package by calling +For local testing or development, setting up the bioconda environment is preferable. - ```shell - pip install --no-deps ./ - ``` ## Prepare data diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 7896d451..d3858942 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -1,12 +1,10 @@ import dataclasses import logging import os.path -import pathlib from argparse import Namespace from os import path import chemprop as cp -import pandas as pd from keras.models import load_model from dfpl import autoencoder as ac @@ -17,43 +15,8 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute -project_directory = pathlib.Path(".").parent.parent.absolute() -test_train_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - type="smiles", - fpType="topological", - epochs=100, - batchSize=1024, - fpSize=2048, - encFPSize=256, - enableMultiLabel=False, - testSize=0.2, - kFolds=2, - verbose=2, - trainAC=False, - trainFNN=True, - compressFeatures=True, - activationFunction="selu", - lossFunction="bce", - optimizer="Adam", - fnnType="FNN", -) -test_pred_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model", - type="smiles", - fpType="topological", -) - - -def traindmpnn(opts: options.GnnOptions): +def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. 
Args: @@ -61,54 +24,29 @@ def traindmpnn(opts: options.GnnOptions): Returns: - None """ - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - ignore_elements = ["py/object"] # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson( - opts.configFile, ignore_elements, return_json_object=False - ) + arguments = createArgsFromJson(jsonFile=opts.configFile) opts = cp.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") - # Train the model and get the mean and standard deviation of AUC score from cross-validation mean_score, std_score = cp.train.cross_validate( args=opts, train_func=cp.train.run_training ) logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") -def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None: +def predictdmpnn(opts: options.GnnOptions) -> None: """ Predict the values using a trained D-MPNN model with the given options. Args: - opts: options.GnnOptions instance containing the details of the prediction - - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction Returns: - None """ - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] # Load options and additional arguments from a JSON file - arguments, data = createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True - ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - # Replace relevant attributes in `opts` with loaded options + arguments = createArgsFromJson(jsonFile=opts.configFile) opts = cp.args.PredictArgs().parse_args(arguments) - opts.preds_path = save_dir + "/" + name - df = pd.read_csv(opts.test_path) - smiles = [] - for index, rows in df.iterrows(): - my_list = [rows.smiles] - smiles.append(my_list) - # Make predictions and return the result - cp.train.make_predictions(args=opts, smiles=smiles) + + cp.train.make_predictions(args=opts) def train(opts: options.Options): @@ -116,9 +54,6 @@ def train(opts: options.Options): Run the main training procedure :param opts: Options defining the details of the training """ - - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - # import data from file and create DataFrame if "tsv" in opts.inputFile: df = fp.importDataFile( @@ -128,7 +63,7 @@ def train(opts: options.Options): df = fp.importDataFile( opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize ) - # initialize encoders to None + # initialize (auto)encoders to None encoder = None autoencoder = None if opts.trainAC: @@ -142,11 +77,12 @@ def train(opts: options.Options): # if feature compression is enabled if opts.compressFeatures: if not opts.trainAC: - if opts.aeType == "deterministic": - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - elif opts.aeType == "variational": + if opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - elif opts.ecWeightsFile == "": + else: + (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) + + if opts.ecWeightsFile == "": encoder = load_model(opts.ecModelDir) else: autoencoder.load_weights( @@ -154,14 +90,15 @@ def train(opts: options.Options): ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - # ac.visualize_fingerprints( - # df, - # before_col="fp", - # after_col="fpcompressed", - # train_indices=train_indices, - # test_indices=test_indices, - # 
save_as=f"UMAP_{opts.aeSplitType}.png", - # ) + if opts.visualizeLatent: + ac.visualize_fingerprints( + df, + before_col="fp", + after_col="fpcompressed", + train_indices=train_indices, + test_indices=test_indices, + save_as=f"UMAP_{opts.aeSplitType}.png", + ) # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: sl.train_single_label_models(df=df, opts=opts) @@ -257,7 +194,7 @@ def main(): raise ValueError("Input directory is not a directory") elif prog_args.method == "traingnn": traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - + createLogger("traingnn.log") traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": @@ -267,12 +204,8 @@ def main(): test_path=makePathAbsolute(predictgnn_opts.test_path), preds_path=makePathAbsolute(predictgnn_opts.preds_path), ) - - logging.info( - f"The following arguments are received or filled with default values:\n{prog_args}" - ) - - predictdmpnn(fixed_opts, prog_args.configFile) + createLogger("predictgnn.log") + predictdmpnn(fixed_opts) elif prog_args.method == "train": train_opts = options.Options.fromCmdArgs(prog_args) @@ -298,8 +231,6 @@ def main(): ), ecModelDir=makePathAbsolute(predict_opts.ecModelDir), fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), - trainAC=False, - trainFNN=False, ) createDirectory(fixed_opts.outputDir) createLogger(path.join(fixed_opts.outputDir, "predict.log")) diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 99bf4578..b2b13d76 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -1,19 +1,18 @@ import logging import math import os.path -from os.path import basename from typing import Tuple import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns -import umap +import umap.umap_ as umap import wandb from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, losses, optimizers from tensorflow.keras.layers import Dense, Input -from tensorflow.keras.models import Model +from tensorflow.keras.models import Model, load_model from dfpl import callbacks from dfpl import history as ht @@ -32,9 +31,13 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod """ input_size = opts.fpSize encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) @@ -104,7 +107,6 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod )(decoded) # output layer - # to either 0 or 1 and hence we use sigmoid activation function. 
decoded = Dense( units=input_size, activation="sigmoid", bias_initializer=output_bias )(decoded) @@ -145,37 +147,8 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"AE_{opts.aeSplitType}") - # Define output files for autoencoder and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No AE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType - ) - logging.info( - f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=ac_weights_file, opts=opts - ) # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( @@ -286,30 +259,35 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!) 
(autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias) - + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) # Train the autoencoder on the training data auto_hist = autoencoder.fit( x_train, x_train, - callbacks=callback_list, + callbacks=[callback_list], epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) - logging.info(f"Autoencoder weights stored in file: {ac_weights_file}") # Store the autoencoder training history and plot the metrics ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"), + base_file_name=save_path, hist=auto_hist, ) # Save the autoencoder callback model to disk - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder") if opts.testSize > 0.0: - (callback_autoencoder, callback_encoder) = define_ac_model(opts) - callback_encoder.save(filepath=save_path) + # Re-define autoencoder and encoder using your function + callback_autoencoder = load_model(filepath=save_path) + _, callback_encoder = define_ac_model(opts) + for i, layer in enumerate(callback_encoder.layers): + layer.set_weights(callback_autoencoder.layers[i].get_weights()) + + # Save the encoder model + encoder_save_path = os.path.join(save_path, "encoder_model") + callback_encoder.save(filepath=encoder_save_path) else: encoder.save(filepath=save_path) # Return the encoder model of the trained autoencoder diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index 6eae7965..8bf157fd 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -22,15 +22,25 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: else: target = "loss" # enable this checkpoint to restore the weights of the best performing model - checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - period=settings.ac_train_check_period, - save_best_only=True, - save_weights_only=True, - ) + if opts.aeType == "deterministic": + checkpoint = ModelCheckpoint( + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_freq="epoch", + save_best_only=True, + ) + else: + checkpoint = ModelCheckpoint( + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_freq="epoch", + save_best_only=True, + save_weights_only=True, + ) callbacks.append(checkpoint) # enable early stopping if val_loss is not improving anymore @@ -43,7 +53,6 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: restore_best_weights=True, ) callbacks.append(early_stop) - if opts.aeWabTracking and not opts.wabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks @@ -65,7 +74,7 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: checkpoint = ModelCheckpoint( checkpoint_path, verbose=1, - period=settings.nn_train_check_period, + save_freq="epoch", save_best_only=True, monitor="val_loss", mode="min", diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index e9c88776..bf4241aa 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -69,10 +69,16 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl def define_nn_multi_label_model( input_size: int, output_size: int, opts: options.Options ) -> Model: + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, + ) if opts.optimizer == "Adam": - 
my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported:{opts.optimizer}.") sys.exit("Unsupported optimizer.") @@ -132,9 +138,9 @@ def define_nn_model_multi( decay: float = 0.01, ) -> Model: if optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=lr, decay=decay) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr, decay=decay) elif optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=lr, momentum=0.9, decay=decay) + my_optimizer = optimizers.legacy.SGD(lr=lr, momentum=0.9, decay=decay) else: my_optimizer = optimizer @@ -294,6 +300,8 @@ def train_nn_models_multi(df: pd.DataFrame, opts: options.Options) -> None: model_file_path_weights, model_file_path_json, model_hist_path, + model_hist_csv_path, + model_predict_valset_csv_path, model_validation, model_auc_file, model_auc_file_data, diff --git a/dfpl/options.py b/dfpl/options.py index 6d84dbc4..4cf35a0f 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,12 +3,12 @@ import argparse from dataclasses import dataclass from pathlib import Path - +from typing import Optional import jsonpickle import torch from chemprop.args import TrainArgs -from dfpl.utils import makePathAbsolute +from dfpl.utils import parseCmdArgs @dataclass @@ -17,51 +17,51 @@ class Options: Dataclass for all options necessary for training the neural nets """ - configFile: str = "./example/train.json" - inputFile: str = "/deepFPlearn/CMPNN/data/tox21.csv" - outputDir: str = "." - outputFile: str = "" - ecWeightsFile: str = "AE.encoder.weights.hdf5" - ecModelDir: str = "AE_encoder" - fnnModelDir: str = "modeltraining" + configFile: str = None + inputFile: str = "tests/data/smiles.csv" + outputDir: str = "example/results_train/" # changes according to mode + outputFile: str = "results.csv" + ecWeightsFile: str = "" + ecModelDir: str = "example/results_train/AE_encoder/" + fnnModelDir: str = "example/results_train/AR_saved_model/" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" - epochs: int = 512 + epochs: int = 100 fpSize: int = 2048 encFPSize: int = 256 - kFolds: int = 0 + kFolds: int = 1 testSize: float = 0.2 enableMultiLabel: bool = False - verbose: int = 0 - trainAC: bool = True # if set to False, an AC weight file must be provided! 
+ verbose: int = 2 + trainAC: bool = False trainFNN: bool = True - compressFeatures: bool = True - sampleFractionOnes: float = 0.5 # Only used when value is in [0,1] + compressFeatures: bool = False + sampleFractionOnes: float = 0.5 sampleDown: bool = False split_type: str = "random" aeSplitType: str = "random" aeType: str = "deterministic" - aeEpochs: int = 3000 + aeEpochs: int = 100 aeBatchSize: int = 512 aeLearningRate: float = 0.001 - aeLearningRateDecay: float = 0.01 - aeActivationFunction: str = "relu" + aeLearningRateDecay: float = 0.96 + aeActivationFunction: str = "selu" aeOptimizer: str = "Adam" fnnType: str = "FNN" batchSize: int = 128 optimizer: str = "Adam" learningRate: float = 0.001 + learningRateDecay: float = 0.96 lossFunction: str = "bce" activationFunction: str = "relu" l2reg: float = 0.001 dropout: float = 0.2 threshold: float = 0.5 - gpu: str = "" - snnDepth = 8 - snnWidth = 50 - aeWabTracking: str = "" # Wand & Biases autoencoder tracking - wabTracking: str = "" # Wand & Biases FNN tracking - wabTarget: str = "ER" # Wand & Biases target used for showing training progress + visualizeLatent: bool = False # only if autoencoder is trained or loaded + gpu: int = None + aeWabTracking: bool = False # Wand & Biases autoencoder tracking + wabTracking: bool = False # Wand & Biases FNN tracking + wabTarget: str = "AR" # Wand & Biases target used for showing training progress def saveToFile(self, file: str) -> None: """ @@ -72,42 +72,8 @@ def saveToFile(self, file: str) -> None: f.write(jsonpickle.encode(self)) @classmethod - def fromJson(cls, file: str) -> Options: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> Options: - """ - Creates Options instance from cmdline arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = Options() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - for key, value in vars(args).items(): - # The args dict will contain a "method" key from the subparser. - # We don't use this. - if key != "method": - result.__setattr__(key, value) - return result + def fromCmdArgs(cls, args: argparse.Namespace) -> "Options": + return parseCmdArgs(cls, args) @dataclass @@ -134,37 +100,19 @@ class GnnOptions(TrainArgs): save_preds: bool = True @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> GnnOptions: - """ - Creates Options instance from cmdline arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. 
- """ - result = GnnOptions() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() - return result + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) - @classmethod - def fromJson(cls, file: str) -> GnnOptions: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") + return opts def createCommandlineParser() -> argparse.ArgumentParser: @@ -225,7 +173,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + default="example/train.json", ) general_args.add_argument( "-i", @@ -234,7 +182,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first column should be smiles.", - default=argparse.SUPPRESS, + default="tests/data/smiles.csv", ) general_args.add_argument( "-o", @@ -243,8 +191,10 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. Trained model and " "respective stats will be returned in this directory.", - default=argparse.SUPPRESS, + default="example/results_train/", ) + + # TODO CHECK WHAT IS TYPE DOING? general_args.add_argument( "-t", "--type", @@ -252,7 +202,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + default="fp", ) general_args.add_argument( "-thr", @@ -260,47 +210,41 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=argparse.SUPPRESS, + default=0.5, ) general_args.add_argument( "-gpu", "--gpu", metavar="INT", type=int, - help="Select which gpu to use. If not available, leave empty.", - default=argparse.SUPPRESS, + help="Select which gpu to use by index. If not available, leave empty", + default=None, ) general_args.add_argument( - "-k", "--fpType", metavar="STR", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.", + default="topological", ) general_args.add_argument( - "-s", "--fpSize", type=int, - help="Size of fingerprint that should be generated.", - default=argparse.SUPPRESS, + help="Length of the fingerprint that should be generated.", + default=2048, ) general_args.add_argument( - "-c", "--compressFeatures", - metavar="BOOL", - type=bool, - help="Should the fingerprints be compressed or not. Activates the autoencoder. 
", - default=argparse.SUPPRESS, + action="store_true", + help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.", + default=False, ) general_args.add_argument( - "-m", "--enableMultiLabel", - metavar="BOOL", - type=bool, + action="store_true", help="Train multi-label classification model in addition to the individual models.", - default=argparse.SUPPRESS, + default=False, ) # Autoencoder Configuration autoencoder_args.add_argument( @@ -309,14 +253,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default=argparse.SUPPRESS, + default="", ) autoencoder_args.add_argument( "--ecModelDir", type=str, metavar="DIR", help="The directory where the full model of the encoder will be saved", - default=argparse.SUPPRESS, + default="example/results_train/AE_encoder/", ) autoencoder_args.add_argument( "--aeType", @@ -324,21 +268,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", - default=argparse.SUPPRESS, + default="deterministic", ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=argparse.SUPPRESS, + default=100, ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=argparse.SUPPRESS, + default=512, ) autoencoder_args.add_argument( "--aeActivationFunction", @@ -346,21 +290,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", - default=argparse.SUPPRESS, + default="relu", ) autoencoder_args.add_argument( "--aeLearningRate", metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=argparse.SUPPRESS, + default=0.001, ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", - default=argparse.SUPPRESS, + default=0.96, ) autoencoder_args.add_argument( "--aeSplitType", @@ -368,7 +312,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", - default=argparse.SUPPRESS, + default="random", ) autoencoder_args.add_argument( "-d", @@ -376,7 +320,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=argparse.SUPPRESS, + default=256, + ) + autoencoder_args.add_argument( + "--visualizeLatent", + action="store_true", + help="UMAP the latent space for exploration", + default=False, ) # Training Configuration training_args.add_argument( @@ -385,15 +335,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the feedforward neural network", - default=argparse.SUPPRESS, + default="random", ) training_args.add_argument( - "-l", "--testSize", metavar="FLOAT", type=float, help="Fraction of the dataset that should be used for testing. 
Value in [0,1].", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "-K", @@ -401,7 +350,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="K that is used for K-fold cross-validation in the training procedure.", - default=argparse.SUPPRESS, + default=1, ) training_args.add_argument( "-v", @@ -411,21 +360,19 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: choices=[0, 1, 2], help="Verbosity level. O: No additional output, " + "1: Some additional output, 2: full additional output", - default=argparse.SUPPRESS, + default=2, ) training_args.add_argument( "--trainAC", - metavar="BOOL", - type=bool, + action="store_true", help="Choose to train or not, the autoencoder based on the input file", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "--trainFNN", - metavar="BOOL", - type=bool, - help="Train the feedforward network either with provided weights.", - default=argparse.SUPPRESS, + action="store_false", + help="When called it deactivates the training.", + default=True, ) training_args.add_argument( "--sampleFractionOnes", @@ -433,14 +380,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)." "only works if --sampleDown is enabled", - default=argparse.SUPPRESS, + default=0.5, ) training_args.add_argument( "--sampleDown", metavar="BOOL", type=bool, help="Enable automatic down sampling of the 0 valued samples.", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "-e", @@ -448,52 +395,60 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Number of epochs that should be used for the FNN training", - default=argparse.SUPPRESS, + default=100, ) - + # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", - default=argparse.SUPPRESS, + default="bce", ) + # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? training_args.add_argument( "--optimizer", metavar="STRING", type=str, choices=["Adam", "SGD"], help='Optimizer to use for backpropagation in the FNN. 
Possible values: "Adam", "SGD"', - default=argparse.SUPPRESS, + default="Adam", ) training_args.add_argument( "--batchSize", metavar="INT", type=int, help="Batch size in FNN training.", - default=argparse.SUPPRESS, + default=128, ) training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", - default=argparse.SUPPRESS, + default=0.001, ) training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "--learningRate", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", - default=argparse.SUPPRESS, + default=0.000022, + ) + training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate size in FNN training.", + default=0.96, ) training_args.add_argument( "--activationFunction", @@ -501,7 +456,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for hidden layers in the FNN.", - default=argparse.SUPPRESS, + default="relu", ) # Tracking Configuration tracking_args.add_argument( @@ -509,14 +464,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="BOOL", type=bool, help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTracking", metavar="BOOL", type=bool, help="Track FNN performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTarget", @@ -524,7 +479,112 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default="AR", + ) + + +def parseInputPredict(parser: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + + general_args = parser.add_argument_group("General Configuration") + files_args = parser.add_argument_group("Files") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for training/predicting.", + ) + files_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for the prediction in (unquoted) " + "comma separated CSV format. The column named 'smiles' or 'fp'" + "contains the field to be predicted. Please adjust the type " + "that should be predicted (fp or smile) with -t option appropriately." + "An optional column 'id' is used to assign the outcomes to the" + "original identifiers. If this column is missing, the results are" + "numbered in the order of their appearance in the input file." + "A header is expected and respective column names are used.", + default="tests/data/smiles.csv", + ) + files_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output directory. It will contain a log file and the file specified" + "with --outputFile.", + default="example/results_predict/", + ) + files_args.add_argument( + "--outputFile", + metavar="FILE", + type=str, + help="Output .CSV file name which will contain one prediction per input line. 
" + "Default: prefix of input file name.", + default="results.csv", + ) + # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? + general_args.add_argument( + "-t", + "--type", + metavar="STR", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. Choices: 'fp', 'smiles'.", + default="fp", + ) + general_args.add_argument( + "-k", + "--fpType", + metavar="STR", + type=str, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", + default="topological", + ) + files_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the encoder will be saved (if trainAE=True) or " + "loaded from (if trainAE=False). Provide a full path here.", + default="", + ) + files_args.add_argument( + "--ecWeightsFile", + type=str, + metavar="STR", + help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", + default="", + ) + files_args.add_argument( + "--fnnModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the fnn is loaded from. " + "Provide a full path here.", + default="example/results_train/AR_saved_model", + ) + general_args.add_argument( + "-c", "--compressFeatures", action="store_true", default=False + ) + ( + general_args.add_argument( + "--aeType", + metavar="STRING", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) ) @@ -575,9 +635,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=10, help="The number of batches between each logging of the training loss", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=True, help="Turn off cuda" - ) general_args.add_argument( "--no_cache", action="store_true", @@ -1034,91 +1091,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) -def parseInputPredict(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. - """ - - general_args = parser.add_argument_group("General Configuration") - files_args = parser.add_argument_group("Files") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", - type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-i", - "--inputFile", - metavar="FILE", - type=str, - help="The file containing the data for the prediction in (unquoted) " - "comma separated CSV format. The column named 'smiles' or 'fp'" - "contains the field to be predicted. Please adjust the type " - "that should be predicted (fp or smile) with -t option appropriately." - "An optional column 'id' is used to assign the outcomes to the" - "original identifiers. If this column is missing, the results are" - "numbered in the order of their appearance in the input file." - "A header is expected and respective column names are used.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", - type=str, - help="Prefix of output directory. 
It will contain a log file and the file specified" - "with --outputFile.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--outputFile", - metavar="FILE", - type=str, - help="Output .CSV file name which will contain one prediction per input line. " - "Default: prefix of input file name.", - default=argparse.SUPPRESS, - ) - general_args.add_argument( - "-t", - "--type", - metavar="STR", - type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, - ) - general_args.add_argument( - "-k", - "--fpType", - metavar="STR", - type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--ecModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). Provide a full path here.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--fnnModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", - default=argparse.SUPPRESS, - ) - - def parsePredictGnn(parser: argparse.ArgumentParser) -> None: general_args = parser.add_argument_group("General Configuration") data_args = parser.add_argument_group("Data Configuration") @@ -1139,9 +1111,6 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: choices=list(range(torch.cuda.device_count())), help="Which GPU to use", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=False, help="Turn off cuda" - ) general_args.add_argument( "--num_workers", type=int, diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 18402f09..191690ba 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -333,12 +333,17 @@ def define_single_label_model( else: logging.error(f"Your selected loss is not supported: {opts.lossFunction}.") sys.exit("Unsupported loss function") - + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.learningRate, + decay_steps=1000, + decay_rate=opts.learningRateDecay, + staircase=True, + ) # Set the optimizer according to the option selected if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported: {opts.optimizer}.") sys.exit("Unsupported optimizer") @@ -596,11 +601,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: """ # find target columns - targets = [ - c - for c in df.columns - if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] - ] + targets = [c for c in df.columns if c not in ["smiles", "fp", "fpcompressed"]] if opts.wabTracking and opts.wabTarget != "": # For W&B tracking, we only train one target that's specified as wabTarget "ER". 
# In case it's not there, we use the first one available diff --git a/dfpl/utils.py b/dfpl/utils.py index db3d6ec1..bea7006f 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -5,8 +5,14 @@ import warnings from collections import defaultdict from random import Random -from typing import Dict, List, Set, Tuple, Union +from typing import Dict, List, Set, Tuple, Union, Type, TypeVar +# Define a type variable + +from pathlib import Path +import argparse +import jsonpickle +import sys import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -15,6 +21,44 @@ from tqdm import tqdm RDLogger.DisableLog("rdApp.*") +T = TypeVar("T") + + +def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: + """ + Parses command-line arguments to create an instance of the given class. + + Args: + cls: The class to create an instance of. + args: argparse.Namespace containing the command-line arguments. + + Returns: + An instance of cls populated with values from the command-line arguments. + """ + # Extract argument flags from sys.argv + arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} + + # Create the result instance, which will be modified and returned + result = cls() + + # Load JSON file if specified + if hasattr(args, "configFile") and args.configFile: + jsonFile = Path(args.configFile) + if jsonFile.exists() and jsonFile.is_file(): + with jsonFile.open() as f: + content = jsonpickle.decode(f.read()) + for key, value in vars(content).items(): + setattr(result, key, value) + else: + raise ValueError("Could not find JSON input file") + + # Override with user-provided command-line arguments + for key in arg_flags: + if hasattr(args, key): + user_value = getattr(args, key, None) + setattr(result, key, user_value) + + return result def makePathAbsolute(p: str) -> str: @@ -31,20 +75,34 @@ def createDirectory(directory: str): os.makedirs(path) -def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool): +def createArgsFromJson(jsonFile: str): arguments = [] - with open(in_json, "r") as f: + ignore_elements = ["py/object"] + + with open(jsonFile, "r") as f: data = json.load(f) + + # Check each key in the JSON file against command-line arguments for key, value in data.items(): if key not in ignore_elements: + # Prepare the command-line argument format + cli_arg_key = f"--{key}" + + # Check if this argument is provided in the command line + if cli_arg_key in sys.argv: + # Find the index of the argument in sys.argv and get its value + arg_index = sys.argv.index(cli_arg_key) + 1 + if arg_index < len(sys.argv): + cli_value = sys.argv[arg_index] + value = cli_value # Override JSON value with command-line value + + # Append the argument and its value to the list if key == "extra_metrics" and isinstance(value, list): - arguments.append("--extra_metrics") + arguments.append(cli_arg_key) arguments.extend(value) else: - arguments.append("--" + str(key)) - arguments.append(str(value)) - if return_json_object: - return arguments, data + arguments.extend([cli_arg_key, str(value)]) + return arguments diff --git a/dfpl/vae.py b/dfpl/vae.py index d0a89dbe..45cfda7a 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -1,8 +1,6 @@ -import csv import logging import math import os.path -from os.path import basename from typing import Tuple import numpy as np @@ -26,22 +24,26 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: input_size = opts.fpSize - encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - 
learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay
+    encoding_dim = (
+        opts.encFPSize
+    )  # This should be the intended size of your latent space, e.g., 256
+
+    lr_schedule = optimizers.schedules.ExponentialDecay(
+        opts.aeLearningRate,
+        decay_steps=1000,
+        decay_rate=opts.aeLearningRateDecay,
+        staircase=True,
     )
+    ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule)
     if output_bias is not None:
         output_bias = initializers.Constant(output_bias)
-    # get the number of meaningful hidden layers (latent space included)
     hidden_layer_count = round(math.log2(input_size / encoding_dim))
-    # the input placeholder
     input_vec = Input(shape=(input_size,))
-    # 1st hidden layer, that receives weights from input layer
-    # equals bottleneck layer, if hidden_layer_count==1!
+    # 1st hidden layer
     if opts.aeActivationFunction != "selu":
         encoded = Dense(
             units=int(input_size / 2), activation=opts.aeActivationFunction
@@ -53,87 +55,81 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mo
             kernel_initializer="lecun_normal",
         )(input_vec)

-    if hidden_layer_count > 1:
-        # encoding layers, incl. bottle-neck
-        for i in range(1, hidden_layer_count):
-            factor_units = 2 ** (i + 1)
-            # print(f'{factor_units}: {int(input_size / factor_units)}')
-            if opts.aeActivationFunction != "selu":
-                encoded = Dense(
-                    units=int(input_size / factor_units),
-                    activation=opts.aeActivationFunction,
-                )(encoded)
-            else:
-                encoded = Dense(
-                    units=int(input_size / factor_units),
-                    activation=opts.aeActivationFunction,
-                    kernel_initializer="lecun_normal",
-                )(encoded)
-
-        # latent space layers
-        factor_units = 2 ** (hidden_layer_count - 1)
+    # encoding layers
+    for i in range(
+        1, hidden_layer_count - 1
+    ):  # Adjust the range to stop before the latent space layers
+        factor_units = 2 ** (i + 1)
         if opts.aeActivationFunction != "selu":
-            z_mean = Dense(
-                units=int(input_size / factor_units),
-                activation=opts.aeActivationFunction,
-            )(encoded)
-            z_log_var = Dense(
+            encoded = Dense(
                 units=int(input_size / factor_units),
                 activation=opts.aeActivationFunction,
             )(encoded)
         else:
-            z_mean = Dense(
+            encoded = Dense(
                 units=int(input_size / factor_units),
                 activation=opts.aeActivationFunction,
                 kernel_initializer="lecun_normal",
             )(encoded)
-            z_log_var = Dense(
+
+    # latent space layers
+    if opts.aeActivationFunction != "selu":
+        z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(
+            encoded
+        )  # Adjusted size to encoding_dim
+        z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(
+            encoded
+        )  # Adjusted size to encoding_dim
+    else:
+        z_mean = Dense(
+            units=encoding_dim,
+            activation=opts.aeActivationFunction,
+            kernel_initializer="lecun_normal",
+        )(
+            encoded
+        )  # Adjusted size to encoding_dim
+        z_log_var = Dense(
+            units=encoding_dim,
+            activation=opts.aeActivationFunction,
+            kernel_initializer="lecun_normal",
+        )(
+            encoded
+        )  # Adjusted size to encoding_dim
+
+    # sampling layer
+    def sampling(args):
+        z_mean, z_log_var = args
+        batch = K.shape(z_mean)[0]
+        dim = K.int_shape(z_mean)[1]
+        epsilon = K.random_normal(shape=(batch, dim))
+        return z_mean + K.exp(0.5 * z_log_var) * epsilon
+
+    z = Lambda(sampling, output_shape=(encoding_dim,))([z_mean, z_log_var])
+    decoded = z
+
+    # decoding layers
+    for i in range(hidden_layer_count - 2, 0, -1):
+        factor_units = 2**i
+        if opts.aeActivationFunction != "selu":
+            decoded = Dense(
+                units=int(input_size / factor_units),
+                activation=opts.aeActivationFunction,
+            )(decoded)
+        else:
+            decoded = Dense(
                 units=int(input_size / factor_units),
                 activation=opts.aeActivationFunction,
                 kernel_initializer="lecun_normal",
-            )(encoded)
+            )(decoded)

-        # sampling layer
-        def sampling(args):
-            z_mean, z_log_var = args
-            batch = K.shape(z_mean)[0]
-            dim = K.int_shape(z_mean)[1]
-            epsilon = K.random_normal(shape=(batch, dim))
-            return z_mean + K.exp(0.5 * z_log_var) * epsilon
-
-        # sample from latent space
-        z = Lambda(sampling, output_shape=(int(input_size / factor_units),))(
-            [z_mean, z_log_var]
-        )
-        decoded = z
-        # decoding layers
-        for i in range(hidden_layer_count - 2, 0, -1):
-            factor_units = 2**i
-            # print(f'{factor_units}: {int(input_size/factor_units)}')
-            if opts.aeActivationFunction != "selu":
-                decoded = Dense(
-                    units=int(input_size / factor_units),
-                    activation=opts.aeActivationFunction,
-                )(decoded)
-            else:
-                decoded = Dense(
-                    units=int(input_size / factor_units),
-                    activation=opts.aeActivationFunction,
-                    kernel_initializer="lecun_normal",
-                )(decoded)
-
-        # output layer
-        decoded = Dense(
-            units=input_size, activation="sigmoid", bias_initializer=output_bias
-        )(decoded)
-
-    else:
-        # output layer
-        decoded = Dense(
-            units=input_size, activation="sigmoid", bias_initializer=output_bias
-        )(encoded)
+    # output layer
+    decoded = Dense(
+        units=input_size, activation="sigmoid", bias_initializer=output_bias
+    )(decoded)

     autoencoder = Model(input_vec, decoded)
+    encoder = Model(input_vec, z)
+
     autoencoder.summary(print_fn=logging.info)

     # KL divergence loss
     def kl_loss(z_mean, z_log_var):
@@ -155,9 +151,6 @@ def vae_loss(y_true, y_pred):
         optimizer=ac_optimizer, loss=vae_loss, metrics=[bce_loss, kl_loss]
     )

-    # build encoder model
-    encoder = Model(input_vec, z_mean)
-
     return autoencoder, encoder


@@ -175,39 +168,9 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model:
     if opts.aeWabTracking and not opts.wabTracking:
         wandb.init(project=f"VAE_{opts.aeSplitType}")

-    # Define output files for VAE and encoder weights
-    if opts.ecWeightsFile == "":
-        # If no encoder weights file is specified, use the input file name to generate a default file name
-        logging.info("No VAE encoder weights file specified")
-        base_file_name = (
-            os.path.splitext(basename(opts.inputFile))[0]
-            + opts.aeType
-            + opts.aeSplitType
-        )
-        logging.info(
-            f"(variational) encoder weights will be saved in {base_file_name}.autoencoder.hdf5"
-        )
-        vae_weights_file = os.path.join(
-            opts.outputDir, base_file_name + ".vae.weights.hdf5"
-        )
-        # ec_weights_file = os.path.join(
-        #     opts.outputDir, base_file_name + ".encoder.weights.hdf5"
-        # )
-    else:
-        # If an encoder weights file is specified, use it as the encoder weights file name
-        logging.info(f"VAE encoder will be saved in {opts.ecWeightsFile}")
-        base_file_name = (
-            os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType
-        )
-        vae_weights_file = os.path.join(
-            opts.outputDir, base_file_name + ".vae.weights.hdf5"
-        )
-        # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile)
-
+    save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder")
     # Collect the callbacks for training
-    callback_list = callbacks.autoencoder_callback(
-        checkpoint_path=vae_weights_file, opts=opts
-    )
+
     # Select all fingerprints that are valid and turn them into a numpy array
     fp_matrix = np.array(
         df[df["fp"].notnull()]["fp"].to_list(),
@@ -219,17 +182,17 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model:
     )
     assert 0.0 <= opts.testSize <= 0.5
     if opts.aeSplitType == "random":
-        logging.info("Training VAE using random split")
-        train_indices = np.arange(fp_matrix.shape[0])
+        logging.info("Training autoencoder using random split")
+        initial_indices = np.arange(fp_matrix.shape[0])
         if opts.testSize > 0.0:
             # Split data into test and training data
             if opts.aeWabTracking:
-                x_train, x_test, _, _ = train_test_split(
-                    fp_matrix, train_indices, test_size=opts.testSize, random_state=42
+                x_train, x_test, train_indices, test_indices = train_test_split(
+                    fp_matrix, initial_indices, test_size=opts.testSize, random_state=42
                 )
             else:
-                x_train, x_test, _, _ = train_test_split(
-                    fp_matrix, train_indices, test_size=opts.testSize, random_state=42
+                x_train, x_test, train_indices, test_indices = train_test_split(
+                    fp_matrix, initial_indices, test_size=opts.testSize, random_state=42
                 )
         else:
             x_train = fp_matrix
@@ -255,6 +218,12 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model:
                 dtype=settings.ac_fp_numpy_type,
                 copy=settings.numpy_copy_values,
             )
+            train_indices = df[
+                df.index.isin(train_data[train_data["fp"].notnull()].index)
+            ].index.to_numpy()
+            test_indices = df[
+                df.index.isin(test_data[test_data["fp"].notnull()].index)
+            ].index.to_numpy()
         else:
             x_train = fp_matrix
             x_test = None
@@ -262,7 +231,6 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model:
         logging.info("Training autoencoder using molecular weight split")
         train_indices = np.arange(fp_matrix.shape[0])
         if opts.testSize > 0.0:
-            # if opts.aeWabTracking:
             train_data, val_data, test_data = weight_split(
                 df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small"
             )
@@ -276,16 +244,21 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model:
                 dtype=settings.ac_fp_numpy_type,
                 copy=settings.numpy_copy_values,
             )
+            df_sorted = df.sort_values(by="mol_weight", ascending=True)
+            # Get the sorted indices from the sorted DataFrame
+            sorted_indices = df_sorted.index.to_numpy()
+
+            # Find the corresponding indices for train_data, val_data, and test_data in the sorted DataFrame
+            train_indices = sorted_indices[df.index.isin(train_data.index)]
+            # val_indices = sorted_indices[df.index.isin(val_data.index)]
+            test_indices = sorted_indices[df.index.isin(test_data.index)]
         else:
             x_train = fp_matrix
             x_test = None
     else:
         raise ValueError(f"Invalid split type: {opts.split_type}")
-    if opts.testSize > 0.0:
-        train_indices = train_indices[train_indices < x_train.shape[0]]
-        test_indices = np.arange(x_train.shape[0], x_train.shape[0] + x_test.shape[0])
-    else:
-        test_indices = None
+
+    # Calculate the initial bias, i.e., the log ratio between 1's and 0's in all fingerprints
     ids, counts = np.unique(x_train.flatten(), return_counts=True)
     count_dict = dict(zip(ids, counts))
     if count_dict[0] == 0:
@@ -304,34 +277,34 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model:
     (vae, encoder) = define_vae_model(opts, output_bias=initial_bias)

     # Train the VAE on the training data
+    callback_list = callbacks.autoencoder_callback(
+        checkpoint_path=f"{save_path}.h5", opts=opts
+    )
+
     vae_hist = vae.fit(
         x_train,
         x_train,
         epochs=opts.aeEpochs,
         batch_size=opts.aeBatchSize,
         verbose=opts.verbose,
-        callbacks=callback_list,
+        callbacks=[callback_list],
         validation_data=(x_test, x_test) if opts.testSize > 0.0 else None,
     )
     # Save the VAE weights
-    logging.info(f"VAE weights stored in file: {vae_weights_file}")
     ht.store_and_plot_history(
-        base_file_name=os.path.join(opts.outputDir, base_file_name + ".VAE"),
+        base_file_name=save_path,
         hist=vae_hist,
     )
-    save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_VAE.h5")
-    if opts.testSize > 0.0:
-        (callback_vae, callback_encoder) = define_vae_model(opts)
-        callback_vae.load_weights(filepath=vae_weights_file)
-        callback_encoder.save(filepath=save_path)
-    else:
-        encoder.save(filepath=save_path)
-    latent_space = encoder.predict(fp_matrix)
-    latent_space_file = os.path.join(
-        opts.outputDir, base_file_name + ".latent_space.csv"
-    )
-    with open(latent_space_file, "w", newline="") as file:
-        writer = csv.writer(file)
-        writer.writerows(latent_space)
+
+    # Re-define the autoencoder and encoder using define_vae_model
+    callback_autoencoder, callback_encoder = define_vae_model(opts)
+    callback_autoencoder.load_weights(filepath=f"{save_path}.h5")
+
+    for i, layer in enumerate(callback_encoder.layers):
+        layer.set_weights(callback_autoencoder.layers[i].get_weights())
+
+    # Save the encoder model
+    encoder_save_path = f"{save_path}_encoder.h5"
+    callback_encoder.save_weights(filepath=encoder_save_path)

     return encoder, train_indices, test_indices
diff --git a/environment.yml b/environment.yml
deleted file mode 100644
index 3c2e7a6c..00000000
--- a/environment.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: dfplenv
-channels:
-  - conda-forge
-  - defaults
-dependencies:
-  # dev requirements
-  - conda-build=3.21.8
-  - conda=4.12.0
-  - pip=22.0.4
-  - pytest=7.1.1
-  # application requirements
-  - jsonpickle=2.1
-  - matplotlib=3.5.1
-  - numpy=1.19.5
-  - pandas=1.4.2
-  - rdkit=2022.03.1
-  - scikit-learn=1.0.2
-  - seaborn=0.12.2
-  - tensorflow-gpu=2.6.0
-  - wandb=0.12
-  - umap=0.1.1
-  - pip:
-      - git+https://github.com/soulios/chemprop.git@1d73523e49aa28a90b74edc04aaf45d7e124e338
\ No newline at end of file
diff --git a/example/predict.json b/example/predict.json
index 252965e3..e3305c7c 100755
--- a/example/predict.json
+++ b/example/predict.json
@@ -1,12 +1,12 @@
 {
   "py/object": "dfpl.options.Options",
-  "inputFile": "tests/data/smiles.csv",
+  "inputFile": "tests/data/tox21.csv",
   "outputDir": "example/results_predict/",
   "outputFile": "smiles.csv",
-  "ecModelDir": "example/results_train/random_autoencoder/",
-  "ecWeightsFile": "",
-  "fnnModelDir": "example/results_train/AR_saved_model",
+  "ecModelDir": "example/results_train/random_split_autoencoder",
+  "ecWeightsFile": "random_split_autoencoder_encoder.h5",
+  "fnnModelDir": "example/results_train/NR-AR-1_best_saved_model",
+  "aeType": "variational",
   "compressFeatures": true,
-  "trainAC": false,
   "trainFNN": false
 }
diff --git a/example/predictgnn.json b/example/predictgnn.json
index 157b5e05..813cf0c5 100644
--- a/example/predictgnn.json
+++ b/example/predictgnn.json
@@ -1,7 +1,6 @@
 {
   "py/object": "dfpl.options.GnnOptions",
   "test_path": "tests/data/smiles.csv",
-  "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt",
-  "save_dir": "preds_dmpnn",
-  "saving_name": "DMPNN_preds.csv"
+  "preds_path": "example/results_gnn.csv",
+  "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt"
 }
\ No newline at end of file
diff --git a/example/train.json b/example/train.json
index 62f2abb4..53575adc 100755
--- a/example/train.json
+++ b/example/train.json
@@ -1,22 +1,23 @@
 {
   "py/object": "dfpl.options.Options",
-  "inputFile": "tests/data/S_dataset.csv",
+  "inputFile": "tests/data/tox21.csv",
   "outputDir": "example/results_train/",
-  "ecModelDir": "example/results_train/",
-  "ecWeightsFile": "random_autoencoder.hdf5",
+  "ecModelDir": "example/results_train/random_split_autoencoder/",
+  "ecWeightsFile": "",
   "verbose": 2,
-  "trainAC": true,
-  "compressFeatures": true,
+  "trainAC": false,
+  "compressFeatures": false,
+  "visualizeLatent": false,

   "encFPSize": 256,
   "aeSplitType": "random",
-  "aeEpochs": 2,
+  "aeEpochs": 4,
   "aeBatchSize": 351,
   "aeOptimizer": "Adam",
   "aeActivationFunction": "relu",
   "aeLearningRate": 0.001,
-  "aeLearningRateDecay": 0.0001,
+  "aeLearningRateDecay": 0.96,
   "aeType": "deterministic",

   "type": "smiles",
@@ -29,7 +30,7 @@
   "gpu": "",

   "trainFNN": true,
-  "kFolds": 1,
+  "kFolds": 2,
   "threshold": 0.5,
   "testSize": 0.2,
   "fnnType": "FNN",
@@ -40,6 +41,7 @@
   "activationFunction": "selu",
   "dropout": 0.0107,
   "learningRate": 0.0000022,
+  "learningRateDecay": 0.96,
   "l2reg": 0.001,

   "aeWabTracking": false,
diff --git a/example/traingnn.json b/example/traingnn.json
index 714fa80a..fa2b714f 100644
--- a/example/traingnn.json
+++ b/example/traingnn.json
@@ -2,8 +2,8 @@
   "py/object": "dfpl.options.GnnOptions",
   "data_path": "tests/data/S_dataset.csv",
   "save_dir": "dmpnn-random/",
-  "epochs": 2,
-  "num_folds": 2,
+  "epochs": 4,
+  "num_folds": 1,
   "metric": "accuracy",
   "loss_function": "binary_cross_entropy",
   "split_type": "random",