diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 56df68fc..44189eec 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -75,6 +75,20 @@ jobs: echo "predict result directory missing" >&2 exit 1 fi + rm -rf example/results_predict/ + rm -rf example/results_train/ + dfpl train -f example/train.json --trainAC TRUE --compressFeatures TRUE --aeType deterministic + if [ ! -d example/results_train/ ]; then + echo "training result directory missing" >&2 + exit 1 + fi + tree example + + dfpl predict -f example/predict.json --compressFeatures TRUE --aeType deterministic + if [ ! -d example/results_predict/ ]; then + echo "predict result directory missing" >&2 + exit 1 + fi echo "result lines "$(wc -l example/results_predict/smiles.csv) if [ "$(cat example/results_predict/smiles.csv | wc -l)" -lt "6" ]; then echo "predict result should have at least 6 lines. But had only $(cat example/results_predict/smiles.csv | wc -l)" >&2 diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 7896d451..35ba44d0 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -1,13 +1,10 @@ import dataclasses import logging import os.path -import pathlib from argparse import Namespace from os import path import chemprop as cp -import pandas as pd -from keras.models import load_model from dfpl import autoencoder as ac from dfpl import feedforwardNN as fNN @@ -17,43 +14,8 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute -project_directory = pathlib.Path(".").parent.parent.absolute() -test_train_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - type="smiles", - fpType="topological", - epochs=100, - batchSize=1024, - fpSize=2048, - encFPSize=256, - enableMultiLabel=False, - testSize=0.2, - kFolds=2, - verbose=2, - trainAC=False, - trainFNN=True, - compressFeatures=True, - activationFunction="selu", - lossFunction="bce", - optimizer="Adam", - fnnType="FNN", -) -test_pred_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model", - type="smiles", - fpType="topological", -) - - -def traindmpnn(opts: options.GnnOptions): +def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. 
Args: @@ -61,54 +23,29 @@ def traindmpnn(opts: options.GnnOptions): Returns: - None """ - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - ignore_elements = ["py/object"] # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson( - opts.configFile, ignore_elements, return_json_object=False - ) + arguments = createArgsFromJson(jsonFile=opts.configFile) opts = cp.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") - # Train the model and get the mean and standard deviation of AUC score from cross-validation mean_score, std_score = cp.train.cross_validate( args=opts, train_func=cp.train.run_training ) logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") -def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None: +def predictdmpnn(opts: options.GnnOptions) -> None: """ Predict the values using a trained D-MPNN model with the given options. Args: - opts: options.GnnOptions instance containing the details of the prediction - - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction Returns: - None """ - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] # Load options and additional arguments from a JSON file - arguments, data = createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True - ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - # Replace relevant attributes in `opts` with loaded options + arguments = createArgsFromJson(jsonFile=opts.configFile) opts = cp.args.PredictArgs().parse_args(arguments) - opts.preds_path = save_dir + "/" + name - df = pd.read_csv(opts.test_path) - smiles = [] - for index, rows in df.iterrows(): - my_list = [rows.smiles] - smiles.append(my_list) - # Make predictions and return the result - cp.train.make_predictions(args=opts, smiles=smiles) + + cp.train.make_predictions(args=opts) def train(opts: options.Options): @@ -116,9 +53,6 @@ def train(opts: options.Options): Run the main training procedure :param opts: Options defining the details of the training """ - - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - # import data from file and create DataFrame if "tsv" in opts.inputFile: df = fp.importDataFile( @@ -128,7 +62,7 @@ def train(opts: options.Options): df = fp.importDataFile( opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize ) - # initialize encoders to None + # initialize (auto)encoders to None encoder = None autoencoder = None if opts.trainAC: @@ -142,26 +76,27 @@ def train(opts: options.Options): # if feature compression is enabled if opts.compressFeatures: if not opts.trainAC: - if opts.aeType == "deterministic": - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - elif opts.aeType == "variational": + if opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - elif opts.ecWeightsFile == "": - encoder = load_model(opts.ecModelDir) + elif opts.aeType == "deterministic": + (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) else: - autoencoder.load_weights( - os.path.join(opts.ecModelDir, opts.ecWeightsFile) - ) + raise ValueError(f"Unknown autoencoder type: {opts.aeType}") + + if opts.ecWeightsFile != "": + encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile)) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - # 
ac.visualize_fingerprints( - # df, - # before_col="fp", - # after_col="fpcompressed", - # train_indices=train_indices, - # test_indices=test_indices, - # save_as=f"UMAP_{opts.aeSplitType}.png", - # ) + if opts.visualizeLatent and opts.trainAC: + logging.info("Visualizing latent space") + ac.visualize_fingerprints( + df, + before_col="fp", + after_col="fpcompressed", + train_indices=train_indices, + test_indices=test_indices, + save_as=f"UMAP_{opts.aeSplitType}.png", + ) # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: sl.train_single_label_models(df=df, opts=opts) @@ -193,10 +128,10 @@ def predict(opts: options.Options) -> None: if opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) # Load trained model for autoencoder - if opts.ecWeightsFile == "": - encoder = load_model(opts.ecModelDir) - else: + if opts.ecWeightsFile != "": encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile)) + else: + raise ValueError("No weights file specified for encoder") df = ac.compress_fingerprints(df, encoder) # Run predictions on the compressed fingerprints and store the results in a dataframe @@ -257,36 +192,22 @@ def main(): raise ValueError("Input directory is not a directory") elif prog_args.method == "traingnn": traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - + createLogger("traingnn.log") traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - fixed_opts = dataclasses.replace( - predictgnn_opts, - test_path=makePathAbsolute(predictgnn_opts.test_path), - preds_path=makePathAbsolute(predictgnn_opts.preds_path), - ) - - logging.info( - f"The following arguments are received or filled with default values:\n{prog_args}" - ) - - predictdmpnn(fixed_opts, prog_args.configFile) + createLogger("predictgnn.log") + predictdmpnn(predictgnn_opts) elif prog_args.method == "train": train_opts = options.Options.fromCmdArgs(prog_args) - fixed_opts = dataclasses.replace( - train_opts, - inputFile=makePathAbsolute(train_opts.inputFile), - outputDir=makePathAbsolute(train_opts.outputDir), - ) - createDirectory(fixed_opts.outputDir) - createLogger(path.join(fixed_opts.outputDir, "train.log")) + createDirectory(train_opts.outputDir) + createLogger(path.join(train_opts.outputDir, "train.log")) logging.info( - f"The following arguments are received or filled with default values:\n{fixed_opts}" + f"The following arguments are received or filled with default values:\n{train_opts}" ) - train(fixed_opts) + train(train_opts) elif prog_args.method == "predict": predict_opts = options.Options.fromCmdArgs(prog_args) fixed_opts = dataclasses.replace( @@ -298,8 +219,6 @@ def main(): ), ecModelDir=makePathAbsolute(predict_opts.ecModelDir), fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), - trainAC=False, - trainFNN=False, ) createDirectory(fixed_opts.outputDir) createLogger(path.join(fixed_opts.outputDir, "predict.log")) diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 99bf4578..6a180489 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -1,14 +1,13 @@ import logging import math import os.path -from os.path import basename from typing import Tuple import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns -import umap +import umap.umap_ as umap import wandb from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, losses, optimizers @@ -32,9 +31,13 @@ 
def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod """ input_size = opts.fpSize encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) @@ -104,7 +107,6 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod )(decoded) # output layer - # to either 0 or 1 and hence we use sigmoid activation function. decoded = Dense( units=input_size, activation="sigmoid", bias_initializer=output_bias )(decoded) @@ -145,37 +147,9 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"AE_{opts.aeSplitType}") - # Define output files for autoencoder and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No AE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType - ) - logging.info( - f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + os.makedirs(opts.ecModelDir, exist_ok=True) + save_path = os.path.join(opts.ecModelDir, "autoencoder_weights.h5") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=ac_weights_file, opts=opts - ) # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( @@ -286,32 +260,29 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!) 
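The `define_ac_model` hunk above drops Adam's deprecated `decay=` keyword in favour of an `ExponentialDecay` schedule fed to `optimizers.legacy.Adam`; the same pattern is applied later in `feedforwardNN.py`, `single_label_model.py`, and `vae.py`. A minimal, self-contained sketch of that pattern, assuming TensorFlow 2.11–2.15 (where the `legacy` optimizer namespace exists); `decay_steps=1000` and `staircase=True` are the values hard-coded by this patch, not tuned constants:

```python
import tensorflow as tf

def make_ae_optimizer(initial_lr: float = 0.001, decay_rate: float = 0.97):
    # Decay the learning rate by `decay_rate` every 1000 optimizer steps,
    # replacing the removed `decay=` argument of the old Adam constructor.
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=initial_lr,
        decay_steps=1000,
        decay_rate=decay_rate,
        staircase=True,
    )
    # The legacy Adam keeps Keras-2 behaviour, which the existing callbacks
    # and weight checkpoints in this code base were written against.
    return tf.keras.optimizers.legacy.Adam(learning_rate=lr_schedule)
```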
(autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias) - + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) # Train the autoencoder on the training data auto_hist = autoencoder.fit( x_train, x_train, - callbacks=callback_list, + callbacks=[callback_list], epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) - logging.info(f"Autoencoder weights stored in file: {ac_weights_file}") # Store the autoencoder training history and plot the metrics ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"), + base_file_name=save_path, hist=auto_hist, ) # Save the autoencoder callback model to disk - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder") - if opts.testSize > 0.0: - (callback_autoencoder, callback_encoder) = define_ac_model(opts) - callback_encoder.save(filepath=save_path) - else: - encoder.save(filepath=save_path) + autoencoder.load_weights(save_path) + # Save the encoder weights + encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5")) + # Return the encoder model of the trained autoencoder return encoder, train_indices, test_indices @@ -386,7 +357,6 @@ def visualize_fingerprints( palette = {"train": "blue", "test": "red"} # Create the scatter plot - sns.set(style="white") fig, ax = plt.subplots(figsize=(10, 8)) split = save_as.split("_", 1) part_after_underscore = split[1] diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index e9c88776..bf4241aa 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -69,10 +69,16 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl def define_nn_multi_label_model( input_size: int, output_size: int, opts: options.Options ) -> Model: + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, + ) if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported:{opts.optimizer}.") sys.exit("Unsupported optimizer.") @@ -132,9 +138,9 @@ def define_nn_model_multi( decay: float = 0.01, ) -> Model: if optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=lr, decay=decay) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr, decay=decay) elif optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=lr, momentum=0.9, decay=decay) + my_optimizer = optimizers.legacy.SGD(lr=lr, momentum=0.9, decay=decay) else: my_optimizer = optimizer @@ -294,6 +300,8 @@ def train_nn_models_multi(df: pd.DataFrame, opts: options.Options) -> None: model_file_path_weights, model_file_path_json, model_hist_path, + model_hist_csv_path, + model_predict_valset_csv_path, model_validation, model_auc_file, model_auc_file_data, diff --git a/dfpl/options.py b/dfpl/options.py index 6d84dbc4..f138a305 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,12 +3,13 @@ import argparse from dataclasses import dataclass from pathlib import Path +from typing import Optional import jsonpickle import torch from chemprop.args import TrainArgs -from dfpl.utils import makePathAbsolute +from dfpl.utils 
import parseCmdArgs @dataclass @@ -36,6 +37,7 @@ class Options: trainAC: bool = True # if set to False, an AC weight file must be provided! trainFNN: bool = True compressFeatures: bool = True + visualizeLatent: bool = False sampleFractionOnes: float = 0.5 # Only used when value is in [0,1] sampleDown: bool = False split_type: str = "random" @@ -44,13 +46,14 @@ class Options: aeEpochs: int = 3000 aeBatchSize: int = 512 aeLearningRate: float = 0.001 - aeLearningRateDecay: float = 0.01 + aeLearningRateDecay: float = 0.97 aeActivationFunction: str = "relu" aeOptimizer: str = "Adam" fnnType: str = "FNN" batchSize: int = 128 optimizer: str = "Adam" learningRate: float = 0.001 + learningRateDecay: float = 0.96 lossFunction: str = "bce" activationFunction: str = "relu" l2reg: float = 0.001 @@ -72,42 +75,8 @@ def saveToFile(self, file: str) -> None: f.write(jsonpickle.encode(self)) @classmethod - def fromJson(cls, file: str) -> Options: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> Options: - """ - Creates Options instance from cmdline arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = Options() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - for key, value in vars(args).items(): - # The args dict will contain a "method" key from the subparser. - # We don't use this. - if key != "method": - result.__setattr__(key, value) - return result + def fromCmdArgs(cls, args: argparse.Namespace) -> "Options": + return parseCmdArgs(cls, args) @dataclass @@ -134,37 +103,19 @@ class GnnOptions(TrainArgs): save_preds: bool = True @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> GnnOptions: - """ - Creates Options instance from cmdline arguments. + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. 
- """ - result = GnnOptions() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - return result + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) - @classmethod - def fromJson(cls, file: str) -> GnnOptions: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") + return opts def createCommandlineParser() -> argparse.ArgumentParser: @@ -294,6 +245,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Should the fingerprints be compressed or not. Activates the autoencoder. ", default=argparse.SUPPRESS, ) + general_args.add_argument( + "--visualizeLatent", + metavar="BOOL", + type=bool, + help="Visualize the latent space of the autoencoder.", + default=argparse.SUPPRESS, + ) general_args.add_argument( "-m", "--enableMultiLabel", @@ -495,6 +453,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Learning rate size in FNN training.", default=argparse.SUPPRESS, ) + training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate decay in FNN training.", + default=argparse.SUPPRESS, + ) training_args.add_argument( "--activationFunction", metavar="STRING", @@ -1117,6 +1082,16 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "Provide a full path here.", default=argparse.SUPPRESS, ) + general_args.add_argument( + "--compressFeatures", type=bool, metavar="BOOL", default=False + ) + general_args.add_argument( + "--aeType", + metavar="STRING", + type=str, + default="deterministic", + choices=["variational", "deterministic"], + ) def parsePredictGnn(parser: argparse.ArgumentParser) -> None: diff --git a/dfpl/plot.py b/dfpl/plot.py index 30d9503f..bd92e5cc 100644 --- a/dfpl/plot.py +++ b/dfpl/plot.py @@ -7,7 +7,7 @@ from matplotlib.axes import Axes # for NN model functions -from tensorflow.keras.callbacks import History +from tensorflow.python.keras.callbacks import History def get_max_validation_accuracy(history: History) -> str: @@ -46,8 +46,8 @@ def get_max_training_accuracy(history: History) -> str: return "Max training accuracy ≈ " + str(round(y_max, 3) * 100) + "%" -def smooth_curve(points: np.ndarray, factor: float = 0.75) -> np.ndarray: - smoothed_points: List[float] = [] +def smooth_curve(points: np.ndarray, factor: float = 0.8) -> List[float]: + smoothed_points = [] for point in points: if smoothed_points: previous = smoothed_points[-1] @@ -57,81 +57,46 @@ def smooth_curve(points: np.ndarray, factor: float = 0.75) -> np.ndarray: return smoothed_points +# Plot the accuracy and loss data with enhanced visuals def set_plot_history_data(ax: Axes, history: History, which_graph: str) -> None: - (train, valid) = (None, None) - - if which_graph == "acc": - train = smooth_curve(history.history["accuracy"]) - valid = smooth_curve(history.history["val_accuracy"]) - - if which_graph == "loss": - train = smooth_curve(history.history["loss"]) - valid = smooth_curve(history.history["val_loss"]) - - # plt.xkcd() # make plots look like xkcd + if which_graph == "balanced_acc": + # 
Plot balanced accuracy when "acc" is specified + train = smooth_curve(np.array(history.history["balanced_accuracy"])) + valid = smooth_curve(np.array(history.history["val_balanced_accuracy"])) + label = "Balanced Accuracy" + elif which_graph == "loss": + train = smooth_curve(np.array(history.history["loss"])) + valid = smooth_curve(np.array(history.history["val_loss"])) + label = "Loss" + else: + return epochs = range(1, len(train) + 1) - trim = 0 # remove first 5 epochs - # when graphing loss the first few epochs may skew the (loss) graph - - ax.plot(epochs[trim:], train[trim:], "dodgerblue", linewidth=15, alpha=0.1) - ax.plot(epochs[trim:], train[trim:], "dodgerblue", label="Training") - - ax.plot(epochs[trim:], valid[trim:], "g", linewidth=15, alpha=0.1) - ax.plot(epochs[trim:], valid[trim:], "g", label="Validation") + # Plot training and validation data with styles + ax.plot(epochs, train, color="dodgerblue", linewidth=2, label=f"Training {label}") + ax.plot( + epochs, + valid, + color="green", + linestyle="--", + linewidth=2, + label=f"Validation {label}", + ) + ax.set_ylabel(label) + ax.legend(loc="best") + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) def plot_history(history: History, file: str) -> None: - fig, (ax1, ax2) = plt.subplots( - nrows=2, - ncols=1, - figsize=(10, 6), - sharex="all", - gridspec_kw={"height_ratios": [5, 2]}, - ) - - set_plot_history_data(ax1, history, "acc") + fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 8), sharex="all") + set_plot_history_data(ax1, history, "balanced_acc") set_plot_history_data(ax2, history, "loss") - # Accuracy graph - ax1.set_ylabel("Accuracy") - ax1.set_ylim(bottom=0.5, top=1) - ax1.legend(loc="lower right") - ax1.spines["top"].set_visible(False) - ax1.spines["right"].set_visible(False) - ax1.xaxis.set_ticks_position("none") - ax1.spines["bottom"].set_visible(False) - - # max accuracy text - plt.text( - 0.5, - 0.6, - get_max_validation_balanced_accuracy(history), - horizontalalignment="right", - verticalalignment="top", - transform=ax1.transAxes, - fontsize=12, - ) - plt.text( - 0.5, - 0.8, - get_max_training_balanced_accuracy(history), - horizontalalignment="right", - verticalalignment="top", - transform=ax1.transAxes, - fontsize=12, - ) - - # Loss graph - ax2.set_ylabel("Loss") - ax2.set_yticks([]) - ax2.plot(legend=False) + # Set shared x-axis label and save the plot ax2.set_xlabel("Epochs") - ax2.spines["top"].set_visible(False) - ax2.spines["right"].set_visible(False) - plt.tight_layout() plt.savefig(fname=file, format="svg") plt.close() @@ -195,6 +160,7 @@ def plot_history_vis( ) +# Enhanced AUC plot def plot_auc( fpr: np.ndarray, tpr: np.ndarray, @@ -203,25 +169,14 @@ def plot_auc( filename: str, wandb_logging: bool = False, ) -> None: - """ - Plot the area under the curve to the provided file - - :param fpr: An array containing the false positives - :param tpr: An array containing the true positives - :param auc_value: The value of the area under the curve - :param target: The name of the training target - :param filename: The filename to which the plot should be stored - :param wandb_logging: Whether to log the plot to wandb - :rtype: None - """ - # Create a boolean mask to filter out zero values - plt.figure() - plt.plot([0, 1], [0, 1], "k--") - plt.plot(fpr, tpr, label=f"Keras (area = {auc_value:.3f})") - plt.xlabel("False positive rate") - plt.ylabel("True positive rate") - plt.title("ROC curve " + target) - plt.legend(loc="best") + plt.figure(figsize=(8, 6)) + plt.plot([0, 
1], [0, 1], "k--", linewidth=1) + plt.plot(fpr, tpr, color="darkorange", linewidth=2, label=f"AUC = {auc_value:.3f}") + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title(f"ROC Curve - {target}") + plt.legend(loc="lower right") + plt.grid(True, linestyle="--", alpha=0.5) plt.savefig(fname=filename, format="png") if wandb_logging: wandb.log({"roc_plot": plt}) diff --git a/dfpl/predictions.py b/dfpl/predictions.py index 29e73142..55bd34d7 100644 --- a/dfpl/predictions.py +++ b/dfpl/predictions.py @@ -1,39 +1,55 @@ import logging +import os import numpy as np import pandas as pd -import tensorflow.keras.models from dfpl import options, settings +from dfpl import single_label_model as sl def predict_values(df: pd.DataFrame, opts: options.Options) -> pd.DataFrame: """ Predict a set of chemicals using a selected model. - :param df: - :param opts: - :return: + :param df: Input DataFrame containing the features (either compressed or uncompressed). + :param opts: Model options including paths, feature types, and prediction preferences. + :return: DataFrame with predictions. """ - model = tensorflow.keras.models.load_model(opts.fnnModelDir, compile=False) - model.compile(loss=opts.lossFunction, optimizer=opts.optimizer) - if opts.compressFeatures: - sub_df = df[df["fpcompressed"].notnull()] - x = np.array( - sub_df["fpcompressed"].to_list(), - dtype=settings.nn_fp_compressed_numpy_type, - copy=settings.numpy_copy_values, - ) - logging.info(f"Compressed FP matrix with shape {x.shape} and type {x.dtype}") - sub_df["predicted"] = pd.DataFrame(model.predict(x), columns=["predicted"]) - return sub_df - else: - sub_df = df[df["fp"].notnull()] - x = np.array( - sub_df["fp"].to_list(), - dtype=settings.nn_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - logging.info(f"Uncompressed FP matrix with shape {x.shape} and type {x.dtype}") - sub_df["predicted"] = pd.DataFrame(model.predict(x), columns=["predicted"]) - return sub_df + + # Determine the correct feature column and input size + feature_column = "fpcompressed" if opts.compressFeatures else "fp" + sub_df = df[df[feature_column].notnull()] + + if sub_df.empty: + logging.warning(f"No valid features found in column '{feature_column}'") + return pd.DataFrame() + + # Prepare the feature matrix for prediction + x = np.array( + sub_df[feature_column].to_list(), + dtype=settings.nn_fp_compressed_numpy_type + if opts.compressFeatures + else settings.nn_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + logging.info( + f"{'Compressed' if opts.compressFeatures else 'Uncompressed'} FP matrix with shape {x.shape} and type {x.dtype}" + ) + + # Define the model architecture based on the feature size + feature_input_size = x.shape[1] + model = sl.define_single_label_model(input_size=feature_input_size, opts=opts) + + # Load the model weights + weights_path = os.path.join(opts.fnnModelDir, "model_weights.h5") + model.load_weights(weights_path) + logging.info(f"Model weights loaded from {weights_path}") + + # Make predictions + predictions = model.predict(x) + + # Add predictions to the DataFrame + sub_df["predicted"] = predictions + + return sub_df diff --git a/dfpl/settings.py b/dfpl/settings.py index 64eac190..20435290 100644 --- a/dfpl/settings.py +++ b/dfpl/settings.py @@ -43,7 +43,7 @@ # Training settings of the AC that were magic numbers in the code before. 
ac_train_min_delta = 0.0001 -ac_train_check_period = 5 +ac_train_check_period = 2 ac_train_patience = 5 # Training settings of the FNN that were magic numbers in the code before. diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 18402f09..6068e229 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -333,15 +333,21 @@ def define_single_label_model( else: logging.error(f"Your selected loss is not supported: {opts.lossFunction}.") sys.exit("Unsupported loss function") - + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.learningRate, + decay_steps=1000, + decay_rate=opts.learningRateDecay, + staircase=True, + ) # Set the optimizer according to the option selected if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: - logging.error(f"Your selected optimizer is not supported: {opts.optimizer}.") - sys.exit("Unsupported optimizer") + raise ValueError( + f'Option Optimizer is not "Adam" or "SGD", but {opts.optimizer}.' + ) # Set the type of neural network according to the option selected if opts.fnnType == "FNN": @@ -398,7 +404,7 @@ def evaluate_model( "target": target, "fold": fold, } - ).to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.csv") + ).to_csv(path_or_buf=path.join(file_prefix, "predicted.testdata.csv")) ) # Compute the confusion matrix @@ -411,7 +417,7 @@ def evaluate_model( prf = pd.DataFrame.from_dict(precision_recall)[["0", "1"]] # Add balanced accuracy to the computed metrics - prf.to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.prec_rec_f1.csv") + prf.to_csv(path_or_buf=path.join(file_prefix, "testdata.prec_rec_f1.csv")) # Evaluate the model on the validation set and log the results loss, acc, auc_value, precision, recall, balanced_acc = tuple( @@ -450,14 +456,14 @@ def evaluate_model( ) ), columns=["fpr", "tpr", "auc_value", "target", "fold"], - ).to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.aucdata.csv") + ).to_csv(path_or_buf=path.join(file_prefix, "predicted.testdata.aucdata.csv")) # Generate and save AUC-ROC curve plot pl.plot_auc( fpr=FPR, tpr=TPR, target=target, auc_value=AUC, - filename=f"{file_prefix}_auc_data.png", + filename=path.join(file_prefix, "auc_data.png"), wandb_logging=False, ) @@ -495,9 +501,11 @@ def fit_and_evaluate_model( logging.info(f"Training of fold number: {fold}") # Define file name prefix for saving models - model_file_prefix = path.join( - opts.outputDir, f"{target}_{opts.split_type}_single-labeled_Fold-{fold}" - ) + if fold > 1: + model_file_prefix = path.join("tmp", f"{target}/fold-{fold}") + else: + model_file_prefix = path.join(opts.outputDir, target) + os.makedirs(model_file_prefix, exist_ok=True) # Compute class imbalance ids, counts = np.unique(y_train, return_counts=True) @@ -517,7 +525,7 @@ def fit_and_evaluate_model( ) # Define checkpoint to save model weights during training - checkpoint_model_weights_path = f"{model_file_prefix}.model.weights.hdf5" + checkpoint_model_weights_path = os.path.join(model_file_prefix, "model_weights.h5") callback_list = cb.nn_callback( checkpoint_path=checkpoint_model_weights_path, opts=opts ) @@ -539,8 +547,10 @@ def fit_and_evaluate_model( ) # Save and plot model history - 
pd.DataFrame(hist.history).to_csv(path_or_buf=f"{model_file_prefix}.history.csv") - pl.plot_history(history=hist, file=f"{model_file_prefix}.history.svg") + pd.DataFrame(hist.history).to_csv( + path_or_buf=path.join(model_file_prefix, "history.csv") + ) + pl.plot_history(history=hist, file=path.join(model_file_prefix, "history.svg")) # Evaluate model callback_model = define_single_label_model(input_size=x_train.shape[1], opts=opts) callback_model.load_weights(filepath=checkpoint_model_weights_path) @@ -596,11 +606,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: """ # find target columns - targets = [ - c - for c in df.columns - if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] - ] + targets = [c for c in df.columns if c not in ["smiles", "fp", "fpcompressed"]] if opts.wabTracking and opts.wabTarget != "": # For W&B tracking, we only train one target that's specified as wabTarget "ER". # In case it's not there, we use the first one available @@ -662,21 +668,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: opts=opts, ) performance_list.append(performance) - # save complete model - trained_model = define_single_label_model( - input_size=len(x[0]), opts=opts - ) - # trained_model.load_weights - # (path.join(opts.outputDir, f"{target}_single-labeled_Fold-0.model.weights.hdf5")) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join(opts.outputDir, f"{target}_saved_model") - ) elif 1 < opts.kFolds < 10: # int(x.shape[0] / 100): # do a kfold cross-validation @@ -720,24 +711,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: ) performance_list.append(performance) - # save complete model - trained_model = define_single_label_model( - input_size=len(x[0]), opts=opts - ) - # trained_model.load_weights - # (path.join(opts.outputDir, f"{target}_single-labeled_Fold-0.model.weights.hdf5")) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-{fold_no}.model.weights.hdf5", - ) - ) - # create output directory and store complete model - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}-{fold_no}_saved_model" - ) - ) if opts.wabTracking: wandb.finish() fold_no += 1 @@ -746,31 +719,19 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: best_fold = pd.concat(performance_list, ignore_index=True).sort_values( by=["p_1", "r_1", "MCC"], ascending=False, ignore_index=True )["fold"][0] - # rename the fold to best fold - src = os.path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-{best_fold}.model.weights.hdf5", - ) - dst = os.path.join( - opts.outputDir, - f"{target}_single-labeled_Best_Fold-{best_fold}.model.weights.hdf5", - ) - os.rename(src, dst) + src = os.path.join("tmp/", f"{target}/fold-{best_fold}/") + dst = os.path.join(opts.outputDir, f"{target}/") - src_dir = os.path.join( - opts.outputDir, f"{target}-{best_fold}_saved_model" - ) - dst_dir = os.path.join( - opts.outputDir, f"{target}-{best_fold}_best_saved_model" - ) + # Ensure the destination directory exists + os.makedirs(src, exist_ok=True) + os.makedirs(dst, exist_ok=True) - if path.isdir(dst_dir): - shutil.rmtree(dst_dir) + # Copy all contents from the source (best fold) to the destination + shutil.copytree(src, dst, dirs_exist_ok=True) - # Rename source directory to destination directory - os.rename(src_dir, dst_dir) + # 
Optionally, clean up the temporary directory + shutil.rmtree("tmp/") - # save complete model else: logging.info( "Your selected number of folds for Cross validation is out of range. " @@ -829,20 +790,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: opts=opts, ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}_scaffold_saved_model_0" - ) - ) + elif opts.kFolds > 1: for fold_no in range(1, opts.kFolds + 1): print(f"Splitting data with seed {fold_no}") @@ -880,20 +828,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-{fold_no}.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}_scaffold_saved_model_{fold_no}" - ) - ) if opts.wabTracking: wandb.finish() fold_no += 1 @@ -901,30 +835,18 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: best_fold = pd.concat(performance_list, ignore_index=True).sort_values( by=["p_1", "r_1", "MCC"], ascending=False, ignore_index=True )["fold"][0] - # rename the fold to best fold - src = os.path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-{best_fold}.model.weights.hdf5", - ) - dst = os.path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_BEST_Fold-{best_fold}.model.weights.hdf5", - ) - os.rename(src, dst) + src = os.path.join("tmp/", f"{target}/fold-{best_fold}/") + dst = os.path.join(opts.outputDir, f"{target}/") - src_dir = os.path.join( - opts.outputDir, f"{target}_scaffold_saved_model_{best_fold}" - ) - dst_dir = os.path.join( - opts.outputDir, - f"{target}_scaffold_saved_model_BEST_FOLD_{best_fold}", - ) + # Ensure the destination directory exists + os.makedirs(dst, exist_ok=True) + os.makedirs(src, exist_ok=True) - if path.isdir(dst_dir): - shutil.rmtree(dst_dir) + # Copy all contents from the source (best fold) to the destination + shutil.copytree(src, dst, dirs_exist_ok=True) - # Rename source directory to destination directory - os.rename(src_dir, dst_dir) + # Optionally, clean up the temporary directory + shutil.rmtree("tmp/") else: logging.info( @@ -978,18 +900,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: opts=opts, ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_weight_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join(opts.outputDir, f"{target}_weight_saved_model_0") - ) elif opts.kFolds > 1: raise Exception( f"Unsupported number of folds: {opts.kFolds} for {opts.split_type} split.\ diff --git a/dfpl/utils.py b/dfpl/utils.py index db3d6ec1..0e4c8098 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -1,12 +1,16 @@ +import argparse import json import logging import os import pathlib +import sys import warnings from collections import defaultdict +from pathlib import Path from random import Random -from typing import Dict, List, 
Set, Tuple, Union +from typing import Dict, List, Set, Tuple, Type, TypeVar, Union +import jsonpickle import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -15,6 +19,44 @@ from tqdm import tqdm RDLogger.DisableLog("rdApp.*") +T = TypeVar("T") + + +def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: + """ + Parses command-line arguments to create an instance of the given class. + + Args: + cls: The class to create an instance of. + args: argparse.Namespace containing the command-line arguments. + + Returns: + An instance of cls populated with values from the command-line arguments. + """ + # Extract argument flags from sys.argv + arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} + + # Create the result instance, which will be modified and returned + result = cls() + + # Load JSON file if specified + if hasattr(args, "configFile") and args.configFile: + jsonFile = Path(args.configFile) + if jsonFile.exists() and jsonFile.is_file(): + with jsonFile.open() as f: + content = jsonpickle.decode(f.read()) + for key, value in vars(content).items(): + setattr(result, key, value) + else: + raise ValueError("Could not find JSON input file") + + # Override with user-provided command-line arguments + for key in arg_flags: + if hasattr(args, key): + user_value = getattr(args, key, None) + setattr(result, key, user_value) + + return result def makePathAbsolute(p: str) -> str: @@ -31,20 +73,34 @@ def createDirectory(directory: str): os.makedirs(path) -def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool): +def createArgsFromJson(jsonFile: str): arguments = [] - with open(in_json, "r") as f: + ignore_elements = ["py/object"] + + with open(jsonFile, "r") as f: data = json.load(f) + + # Check each key in the JSON file against command-line arguments for key, value in data.items(): if key not in ignore_elements: + # Prepare the command-line argument format + cli_arg_key = f"--{key}" + + # Check if this argument is provided in the command line + if cli_arg_key in sys.argv: + # Find the index of the argument in sys.argv and get its value + arg_index = sys.argv.index(cli_arg_key) + 1 + if arg_index < len(sys.argv): + cli_value = sys.argv[arg_index] + value = cli_value # Override JSON value with command-line value + + # Append the argument and its value to the list if key == "extra_metrics" and isinstance(value, list): - arguments.append("--extra_metrics") + arguments.append(cli_arg_key) arguments.extend(value) else: - arguments.append("--" + str(key)) - arguments.append(str(value)) - if return_json_object: - return arguments, data + arguments.extend([cli_arg_key, str(value)]) + return arguments diff --git a/dfpl/vae.py b/dfpl/vae.py index d0a89dbe..11fdb8a1 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -1,8 +1,6 @@ -import csv import logging import math import os.path -from os.path import basename from typing import Tuple import numpy as np @@ -26,22 +24,26 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: input_size = opts.fpSize - encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + encoding_dim = ( + opts.encFPSize + ) # This should be the intended size of your latent space, e.g., 256 + + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) 
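The reworked `createArgsFromJson` in `dfpl/utils.py` above hard-codes the ignored keys, expands every remaining JSON entry into a `--key value` pair, and lets a flag that also appears on the real command line override the JSON value. A short usage sketch of how `__main__.py` consumes it for the GNN entry points (the JSON path is the example file shipped in this repository; the commented result is illustrative only):

```python
import chemprop as cp
from dfpl.utils import createArgsFromJson

# example/traingnn.json becomes chemprop-style CLI arguments, e.g.
# ["--data_path", "tests/data/S_dataset.csv", "--epochs", "4", ...];
# an explicit "--epochs 10" on sys.argv would take precedence.
arguments = createArgsFromJson(jsonFile="example/traingnn.json")
train_args = cp.args.TrainArgs().parse_args(arguments)
```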
if output_bias is not None: output_bias = initializers.Constant(output_bias) - # get the number of meaningful hidden layers (latent space included) hidden_layer_count = round(math.log2(input_size / encoding_dim)) - # the input placeholder input_vec = Input(shape=(input_size,)) - # 1st hidden layer, that receives weights from input layer - # equals bottleneck layer, if hidden_layer_count==1! + # 1st hidden layer if opts.aeActivationFunction != "selu": encoded = Dense( units=int(input_size / 2), activation=opts.aeActivationFunction @@ -53,87 +55,81 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mo kernel_initializer="lecun_normal", )(input_vec) - if hidden_layer_count > 1: - # encoding layers, incl. bottle-neck - for i in range(1, hidden_layer_count): - factor_units = 2 ** (i + 1) - # print(f'{factor_units}: {int(input_size / factor_units)}') - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # latent space layers - factor_units = 2 ** (hidden_layer_count - 1) + # encoding layers + for i in range( + 1, hidden_layer_count - 1 + ): # Adjust the range to stop before the latent space layers + factor_units = 2 ** (i + 1) if opts.aeActivationFunction != "selu": - z_mean = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - z_log_var = Dense( + encoded = Dense( units=int(input_size / factor_units), activation=opts.aeActivationFunction, )(encoded) else: - z_mean = Dense( + encoded = Dense( units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal", )(encoded) - z_log_var = Dense( + + # latent space layers + if opts.aeActivationFunction != "selu": + z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction)( + encoded + ) # Adjusted size to encoding_dim + z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction)( + encoded + ) # Adjusted size to encoding_dim + else: + z_mean = Dense( + units=encoding_dim, + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )( + encoded + ) # Adjusted size to encoding_dim + z_log_var = Dense( + units=encoding_dim, + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )( + encoded + ) # Adjusted size to encoding_dim + + # sampling layer + def sampling(args): + z_mean, z_log_var = args + batch = K.shape(z_mean)[0] + dim = K.int_shape(z_mean)[1] + epsilon = K.random_normal(shape=(batch, dim)) + return z_mean + K.exp(0.5 * z_log_var) * epsilon + + z = Lambda(sampling, output_shape=(encoding_dim,))([z_mean, z_log_var]) + decoded = z + + # decoding layers + for i in range(hidden_layer_count - 1, 0, -1): + factor_units = 2**i + if opts.aeActivationFunction != "selu": + decoded = Dense( + units=int(input_size / factor_units), + activation=opts.aeActivationFunction, + )(decoded) + else: + decoded = Dense( units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal", - )(encoded) - - # sampling layer - def sampling(args): - z_mean, z_log_var = args - batch = K.shape(z_mean)[0] - dim = K.int_shape(z_mean)[1] - epsilon = K.random_normal(shape=(batch, dim)) - return z_mean + K.exp(0.5 * z_log_var) * epsilon - - # sample from latent space - z = 
Lambda(sampling, output_shape=(int(input_size / factor_units),))( - [z_mean, z_log_var] - ) - decoded = z - # decoding layers - for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2**i - # print(f'{factor_units}: {int(input_size/factor_units)}') - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(decoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(decoded) + )(decoded) - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(decoded) - - else: - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(encoded) + # output layer + decoded = Dense( + units=input_size, activation="sigmoid", bias_initializer=output_bias + )(decoded) autoencoder = Model(input_vec, decoded) + encoder = Model(input_vec, z) + autoencoder.summary(print_fn=logging.info) # KL divergence loss def kl_loss(z_mean, z_log_var): @@ -155,9 +151,6 @@ def vae_loss(y_true, y_pred): optimizer=ac_optimizer, loss=vae_loss, metrics=[bce_loss, kl_loss] ) - # build encoder model - encoder = Model(input_vec, z_mean) - return autoencoder, encoder @@ -175,39 +168,10 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"VAE_{opts.aeSplitType}") - # Define output files for VAE and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No VAE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] - + opts.aeType - + opts.aeSplitType - ) - logging.info( - f"(variational) encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"VAE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + os.makedirs(opts.ecModelDir, exist_ok=True) + save_path = os.path.join(opts.ecModelDir, "vae_weights.h5") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=vae_weights_file, opts=opts - ) + # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( df[df["fp"].notnull()]["fp"].to_list(), @@ -219,17 +183,17 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: ) assert 0.0 <= opts.testSize <= 0.5 if opts.aeSplitType == "random": - logging.info("Training VAE using random split") - train_indices = np.arange(fp_matrix.shape[0]) + logging.info("Training autoencoder using random split") + initial_indices = np.arange(fp_matrix.shape[0]) if opts.testSize > 0.0: # Split data into test and training data if opts.aeWabTracking: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 + 
x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) else: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) else: x_train = fp_matrix @@ -255,6 +219,12 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + train_indices = df[ + df.index.isin(train_data[train_data["fp"].notnull()].index) + ].index.to_numpy() + test_indices = df[ + df.index.isin(test_data[test_data["fp"].notnull()].index) + ].index.to_numpy() else: x_train = fp_matrix x_test = None @@ -262,7 +232,6 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: logging.info("Training autoencoder using molecular weight split") train_indices = np.arange(fp_matrix.shape[0]) if opts.testSize > 0.0: - # if opts.aeWabTracking: train_data, val_data, test_data = weight_split( df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" ) @@ -276,16 +245,21 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + df_sorted = df.sort_values(by="mol_weight", ascending=True) + # Get the sorted indices from the sorted DataFrame + sorted_indices = df_sorted.index.to_numpy() + + # Find the corresponding indices for train_data, val_data, and test_data in the sorted DataFrame + train_indices = sorted_indices[df.index.isin(train_data.index)] + # val_indices = sorted_indices[df.index.isin(val_data.index)] + test_indices = sorted_indices[df.index.isin(test_data.index)] else: x_train = fp_matrix x_test = None else: raise ValueError(f"Invalid split type: {opts.split_type}") - if opts.testSize > 0.0: - train_indices = train_indices[train_indices < x_train.shape[0]] - test_indices = np.arange(x_train.shape[0], x_train.shape[0] + x_test.shape[0]) - else: - test_indices = None + + # Calculate the initial bias aka the log ratio between 1's and 0'1 in all fingerprints ids, counts = np.unique(x_train.flatten(), return_counts=True) count_dict = dict(zip(ids, counts)) if count_dict[0] == 0: @@ -304,34 +278,25 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: (vae, encoder) = define_vae_model(opts, output_bias=initial_bias) # Train the VAE on the training data + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) + vae_hist = vae.fit( x_train, x_train, epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, - callbacks=callback_list, + callbacks=[callback_list], validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) # Save the VAE weights - logging.info(f"VAE weights stored in file: {vae_weights_file}") ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".VAE"), + base_file_name=save_path, hist=vae_hist, ) - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_VAE.h5") - if opts.testSize > 0.0: - (callback_vae, callback_encoder) = define_vae_model(opts) - callback_vae.load_weights(filepath=vae_weights_file) - callback_encoder.save(filepath=save_path) - else: - encoder.save(filepath=save_path) - latent_space = encoder.predict(fp_matrix) - latent_space_file = os.path.join( - opts.outputDir, base_file_name + ".latent_space.csv" - ) - with 
open(latent_space_file, "w", newline="") as file: - writer = csv.writer(file) - writer.writerows(latent_space) + # Re-define autoencoder and encoder using your function + vae.load_weights(save_path) + encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5")) + return encoder, train_indices, test_indices diff --git a/example/predict.json b/example/predict.json index 252965e3..534f75e3 100755 --- a/example/predict.json +++ b/example/predict.json @@ -1,12 +1,11 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/smiles.csv", + "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", - "ecModelDir": "example/results_train/random_autoencoder/", - "ecWeightsFile": "", - "fnnModelDir": "example/results_train/AR_saved_model", - "compressFeatures": true, - "trainAC": false, - "trainFNN": false + "ecModelDir": "example/results_train/autoencoder", + "ecWeightsFile": "encoder_weights.h5", + "fnnModelDir": "example/results_train/AR", + "aeType": "variational", + "compressFeatures": false } diff --git a/example/predictgnn.json b/example/predictgnn.json index 157b5e05..e087f8aa 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,7 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", - "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", - "save_dir": "preds_dmpnn", - "saving_name": "DMPNN_preds.csv" + "preds_path": "preds_dmpnn/DMPNN_preds.csv", + "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file diff --git a/example/train.json b/example/train.json index 62f2abb4..04953ba3 100755 --- a/example/train.json +++ b/example/train.json @@ -2,22 +2,23 @@ "py/object": "dfpl.options.Options", "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_train/", - "ecModelDir": "example/results_train/", - "ecWeightsFile": "random_autoencoder.hdf5", + "ecModelDir": "example/results_train/autoencoder", + "ecWeightsFile": "", "verbose": 2, - "trainAC": true, - "compressFeatures": true, + "trainAC": false, + "compressFeatures": false, + "visualizeLatent": false, "encFPSize": 256, "aeSplitType": "random", - "aeEpochs": 2, + "aeEpochs": 6, "aeBatchSize": 351, "aeOptimizer": "Adam", "aeActivationFunction": "relu", "aeLearningRate": 0.001, - "aeLearningRateDecay": 0.0001, - "aeType": "deterministic", + "aeLearningRateDecay": 0.96, + "aeType": "variational", "type": "smiles", "fpType": "topological", @@ -29,7 +30,7 @@ "gpu": "", "trainFNN": true, - "kFolds": 1, + "kFolds": 2, "threshold": 0.5, "testSize": 0.2, "fnnType": "FNN", @@ -40,6 +41,7 @@ "activationFunction": "selu", "dropout": 0.0107, "learningRate": 0.0000022, + "learningRateDecay": 0.96, "l2reg": 0.001, "aeWabTracking": false, diff --git a/example/traingnn.json b/example/traingnn.json index 714fa80a..fa2b714f 100644 --- a/example/traingnn.json +++ b/example/traingnn.json @@ -2,8 +2,8 @@ "py/object": "dfpl.options.GnnOptions", "data_path": "tests/data/S_dataset.csv", "save_dir": "dmpnn-random/", - "epochs": 2, - "num_folds": 2, + "epochs": 4, + "num_folds": 1, "metric": "accuracy", "loss_function": "binary_cross_entropy", "split_type": "random", diff --git a/tests/run_fnntraining.py b/tests/run_fnntraining.py index 4146ad4a..3ebdf349 100644 --- a/tests/run_fnntraining.py +++ b/tests/run_fnntraining.py @@ -14,7 +14,7 @@ f"{project_directory}/output/fnnTrainingCompressed/" ), outputDir=utils.makePathAbsolute(f"{project_directory}/output/fnnTraining"), - ecWeightsFile="", + 
ecWeightsFile="/encoder_weights.h5", type="smiles", fpType="topological", epochs=11, @@ -26,6 +26,7 @@ verbose=2, trainAC=False, trainFNN=True, + compressFeatures=True, ) @@ -53,7 +54,8 @@ def run_single_label_training(opts: opt.Options) -> None: # encoder.save_weights(opts.acFile) else: logging.info("Using trained autoencoder") - (_, encoder) = ac.define_ac_model(opts) + (autoencoder, encoder) = ac.define_ac_model(opts) + encoder.load_weights(opts.ecWeightsFile) df = ac.compress_fingerprints(df, encoder)