yigbt
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
Lines changed: 14 additions & 0 deletions
diff --git a/dfpl/__main__.py b/dfpl/__main__.py
Lines changed: 34 additions & 115 deletions
diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py
Lines changed: 16 additions & 46 deletions
@@ -75,6 +75,20 @@ jobs:
           echo "predict result directory missing" >&2 
           exit 1
         fi
+        rm -rf example/results_predict/
+        rm -rf example/results_train/
+        dfpl train -f example/train.json --trainAC TRUE --compressFeatures TRUE --aeType deterministic
+        if [ ! -d example/results_train/ ]; then
+          echo "training result directory missing" >&2 
+          exit 1
+        fi
+        tree example
+
+        dfpl predict -f example/predict.json --compressFeatures TRUE --aeType deterministic
+        if [ ! -d example/results_predict/ ]; then
+          echo "predict result directory missing" >&2 
+          exit 1
+        fi
         echo "result lines "$(wc -l example/results_predict/smiles.csv)
         if [ "$(cat example/results_predict/smiles.csv | wc -l)" -lt "6" ]; then
           echo "predict result should have at least 6 lines. But had only $(cat example/results_predict/smiles.csv | wc -l)" >&2 
 
@@ -1,13 +1,10 @@
 import dataclasses
 import logging
 import os.path
-import pathlib
 from argparse import Namespace
 from os import path
 
 import chemprop as cp
-import pandas as pd
-from keras.models import load_model
 
 from dfpl import autoencoder as ac
 from dfpl import feedforwardNN as fNN
@@ -17,108 +14,45 @@
 from dfpl import vae as vae
 from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute
 
-project_directory = pathlib.Path(".").parent.parent.absolute()
-test_train_opts = options.Options(
-    inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
-    outputDir=f"{project_directory}/output_data/console_test",
-    ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5",
-    ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
-    type="smiles",
-    fpType="topological",
-    epochs=100,
-    batchSize=1024,
-    fpSize=2048,
-    encFPSize=256,
-    enableMultiLabel=False,
-    testSize=0.2,
-    kFolds=2,
-    verbose=2,
-    trainAC=False,
-    trainFNN=True,
-    compressFeatures=True,
-    activationFunction="selu",
-    lossFunction="bce",
-    optimizer="Adam",
-    fnnType="FNN",
-)
 
-test_pred_opts = options.Options(
-    inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
-    outputDir=f"{project_directory}/output_data/console_test",
-    outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv",
-    ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
-    fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model",
-    type="smiles",
-    fpType="topological",
-)
-
-
-def traindmpnn(opts: options.GnnOptions):
+def traindmpnn(opts: options.GnnOptions) -> None:
     """
     Train a D-MPNN model using the given options.
     Args:
     - opts: options.GnnOptions instance containing the details of the training
     Returns:
     - None
     """
-    os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
-    ignore_elements = ["py/object"]
     # Load options from a JSON file and replace the relevant attributes in `opts`
-    arguments = createArgsFromJson(
-        opts.configFile, ignore_elements, return_json_object=False
-    )
+    arguments = createArgsFromJson(jsonFile=opts.configFile)
     opts = cp.args.TrainArgs().parse_args(arguments)
     logging.info("Training DMPNN...")
-    # Train the model and get the mean and standard deviation of AUC score from cross-validation
     mean_score, std_score = cp.train.cross_validate(
         args=opts, train_func=cp.train.run_training
     )
     logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}")
 
 
-def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None:
+def predictdmpnn(opts: options.GnnOptions) -> None:
     """
     Predict the values using a trained D-MPNN model with the given options.
     Args:
     - opts: options.GnnOptions instance containing the details of the prediction
-    - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction
     Returns:
     - None
     """
-    ignore_elements = [
-        "py/object",
-        "checkpoint_paths",
-        "save_dir",
-        "saving_name",
-    ]
     # Load options and additional arguments from a JSON file
-    arguments, data = createArgsFromJson(
-        json_arg_path, ignore_elements, return_json_object=True
-    )
-    arguments.append("--preds_path")
-    arguments.append("")
-    save_dir = data.get("save_dir")
-    name = data.get("saving_name")
-    # Replace relevant attributes in `opts` with loaded options
+    arguments = createArgsFromJson(jsonFile=opts.configFile)
     opts = cp.args.PredictArgs().parse_args(arguments)
-    opts.preds_path = save_dir + "/" + name
-    df = pd.read_csv(opts.test_path)
-    smiles = []
-    for index, rows in df.iterrows():
-        my_list = [rows.smiles]
-        smiles.append(my_list)
-    # Make predictions and return the result
-    cp.train.make_predictions(args=opts, smiles=smiles)
+
+    cp.train.make_predictions(args=opts)
 
 
 def train(opts: options.Options):
     """
     Run the main training procedure
     :param opts: Options defining the details of the training
     """
-
-    os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
-
     # import data from file and create DataFrame
     if "tsv" in opts.inputFile:
         df = fp.importDataFile(
@@ -128,7 +62,7 @@ def train(opts: options.Options):
         df = fp.importDataFile(
             opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize
         )
-    # initialize encoders to None
+    # initialize (auto)encoders to None
     encoder = None
     autoencoder = None
     if opts.trainAC:
@@ -142,26 +76,27 @@ def train(opts: options.Options):
     # if feature compression is enabled
     if opts.compressFeatures:
         if not opts.trainAC:
-            if opts.aeType == "deterministic":
-                (autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
-            elif opts.aeType == "variational":
+            if opts.aeType == "variational":
                 (autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
-            elif opts.ecWeightsFile == "":
-                encoder = load_model(opts.ecModelDir)
+            elif opts.aeType == "deterministic":
+                (autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
             else:
-                autoencoder.load_weights(
-                    os.path.join(opts.ecModelDir, opts.ecWeightsFile)
-                )
+                raise ValueError(f"Unknown autoencoder type: {opts.aeType}")
+
+            if opts.ecWeightsFile != "":
+                encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile))
         # compress the fingerprints using the autoencoder
         df = ac.compress_fingerprints(df, encoder)
-        # ac.visualize_fingerprints(
-        #     df,
-        #     before_col="fp",
-        #     after_col="fpcompressed",
-        #     train_indices=train_indices,
-        #     test_indices=test_indices,
-        #     save_as=f"UMAP_{opts.aeSplitType}.png",
-        # )
+    if opts.visualizeLatent and opts.trainAC:
+        logging.info("Visualizing latent space")
+        ac.visualize_fingerprints(
+            df,
+            before_col="fp",
+            after_col="fpcompressed",
+            train_indices=train_indices,
+            test_indices=test_indices,
+            save_as=f"UMAP_{opts.aeSplitType}.png",
+        )
     # train single label models if requested
     if opts.trainFNN and not opts.enableMultiLabel:
         sl.train_single_label_models(df=df, opts=opts)
@@ -193,10 +128,10 @@ def predict(opts: options.Options) -> None:
         if opts.aeType == "variational":
             (autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
         # Load trained model for autoencoder
-        if opts.ecWeightsFile == "":
-            encoder = load_model(opts.ecModelDir)
-        else:
+        if opts.ecWeightsFile != "":
             encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile))
+        else:
+            raise ValueError("No weights file specified for encoder")
         df = ac.compress_fingerprints(df, encoder)
 
     # Run predictions on the compressed fingerprints and store the results in a dataframe
@@ -257,36 +192,22 @@ def main():
                 raise ValueError("Input directory is not a directory")
         elif prog_args.method == "traingnn":
             traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
-
+            createLogger("traingnn.log")
             traindmpnn(traingnn_opts)
 
         elif prog_args.method == "predictgnn":
             predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
-            fixed_opts = dataclasses.replace(
-                predictgnn_opts,
-                test_path=makePathAbsolute(predictgnn_opts.test_path),
-                preds_path=makePathAbsolute(predictgnn_opts.preds_path),
-            )
-
-            logging.info(
-                f"The following arguments are received or filled with default values:\n{prog_args}"
-            )
-
-            predictdmpnn(fixed_opts, prog_args.configFile)
+            createLogger("predictgnn.log")
+            predictdmpnn(predictgnn_opts)
 
         elif prog_args.method == "train":
             train_opts = options.Options.fromCmdArgs(prog_args)
-            fixed_opts = dataclasses.replace(
-                train_opts,
-                inputFile=makePathAbsolute(train_opts.inputFile),
-                outputDir=makePathAbsolute(train_opts.outputDir),
-            )
-            createDirectory(fixed_opts.outputDir)
-            createLogger(path.join(fixed_opts.outputDir, "train.log"))
+            createDirectory(train_opts.outputDir)
+            createLogger(path.join(train_opts.outputDir, "train.log"))
             logging.info(
-                f"The following arguments are received or filled with default values:\n{fixed_opts}"
+                f"The following arguments are received or filled with default values:\n{train_opts}"
             )
-            train(fixed_opts)
+            train(train_opts)
         elif prog_args.method == "predict":
             predict_opts = options.Options.fromCmdArgs(prog_args)
             fixed_opts = dataclasses.replace(
@@ -298,8 +219,6 @@ def main():
                 ),
                 ecModelDir=makePathAbsolute(predict_opts.ecModelDir),
                 fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir),
-                trainAC=False,
-                trainFNN=False,
             )
             createDirectory(fixed_opts.outputDir)
             createLogger(path.join(fixed_opts.outputDir, "predict.log"))
 
@@ -1,14 +1,13 @@
 import logging
 import math
 import os.path
-from os.path import basename
 from typing import Tuple
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
-import umap
+import umap.umap_ as umap
 import wandb
 from sklearn.model_selection import train_test_split
 from tensorflow.keras import initializers, losses, optimizers
@@ -32,9 +31,13 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod
     """
     input_size = opts.fpSize
     encoding_dim = opts.encFPSize
-    ac_optimizer = optimizers.Adam(
-        learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay
+    lr_schedule = optimizers.schedules.ExponentialDecay(
+        opts.aeLearningRate,
+        decay_steps=1000,
+        decay_rate=opts.aeLearningRateDecay,
+        staircase=True,
     )
+    ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule)
 
     if output_bias is not None:
         output_bias = initializers.Constant(output_bias)
@@ -104,7 +107,6 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod
                 )(decoded)
 
         # output layer
-        # to either 0 or 1 and hence we use sigmoid activation function.
         decoded = Dense(
             units=input_size, activation="sigmoid", bias_initializer=output_bias
         )(decoded)
@@ -145,37 +147,9 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model:
     if opts.aeWabTracking and not opts.wabTracking:
         wandb.init(project=f"AE_{opts.aeSplitType}")
 
-    # Define output files for autoencoder and encoder weights
-    if opts.ecWeightsFile == "":
-        # If no encoder weights file is specified, use the input file name to generate a default file name
-        logging.info("No AE encoder weights file specified")
-        base_file_name = (
-            os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType
-        )
-        logging.info(
-            f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5"
-        )
-        ac_weights_file = os.path.join(
-            opts.outputDir, base_file_name + ".autoencoder.weights.hdf5"
-        )
-        # ec_weights_file = os.path.join(
-        #     opts.outputDir, base_file_name + ".encoder.weights.hdf5"
-        # )
-    else:
-        # If an encoder weights file is specified, use it as the encoder weights file name
-        logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}")
-        base_file_name = (
-            os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType
-        )
-        ac_weights_file = os.path.join(
-            opts.outputDir, base_file_name + ".autoencoder.weights.hdf5"
-        )
-        # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile)
-
+    os.makedirs(opts.ecModelDir, exist_ok=True)
+    save_path = os.path.join(opts.ecModelDir, "autoencoder_weights.h5")
     # Collect the callbacks for training
-    callback_list = callbacks.autoencoder_callback(
-        checkpoint_path=ac_weights_file, opts=opts
-    )
 
     # Select all fingerprints that are valid and turn them into a numpy array
     fp_matrix = np.array(
@@ -286,32 +260,29 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model:
 
     # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!)
     (autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias)
-
+    callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts)
     # Train the autoencoder on the training data
     auto_hist = autoencoder.fit(
         x_train,
         x_train,
-        callbacks=callback_list,
+        callbacks=[callback_list],
         epochs=opts.aeEpochs,
         batch_size=opts.aeBatchSize,
         verbose=opts.verbose,
         validation_data=(x_test, x_test) if opts.testSize > 0.0 else None,
     )
-    logging.info(f"Autoencoder weights stored in file: {ac_weights_file}")
 
     # Store the autoencoder training history and plot the metrics
     ht.store_and_plot_history(
-        base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"),
+        base_file_name=save_path,
         hist=auto_hist,
     )
 
     # Save the autoencoder callback model to disk
-    save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder")
-    if opts.testSize > 0.0:
-        (callback_autoencoder, callback_encoder) = define_ac_model(opts)
-        callback_encoder.save(filepath=save_path)
-    else:
-        encoder.save(filepath=save_path)
+    autoencoder.load_weights(save_path)
+    # Save the encoder weights
+    encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5"))
+
     # Return the encoder model of the trained autoencoder
     return encoder, train_indices, test_indices
 
@@ -386,7 +357,6 @@ def visualize_fingerprints(
     palette = {"train": "blue", "test": "red"}
 
     # Create the scatter plot
-    sns.set(style="white")
     fig, ax = plt.subplots(figsize=(10, 8))
     split = save_as.split("_", 1)
     part_after_underscore = split[1]