140 changes: 28 additions & 112 deletions dfpl/__main__.py
@@ -1,13 +1,10 @@
import dataclasses
import logging
import os.path
import pathlib
from argparse import Namespace
from os import path

import chemprop as cp
import pandas as pd
from keras.models import load_model

from dfpl import autoencoder as ac
from dfpl import feedforwardNN as fNN
@@ -17,108 +14,45 @@
from dfpl import vae as vae
from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute

project_directory = pathlib.Path(".").parent.parent.absolute()
test_train_opts = options.Options(
inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
outputDir=f"{project_directory}/output_data/console_test",
ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5",
ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
type="smiles",
fpType="topological",
epochs=100,
batchSize=1024,
fpSize=2048,
encFPSize=256,
enableMultiLabel=False,
testSize=0.2,
kFolds=2,
verbose=2,
trainAC=False,
trainFNN=True,
compressFeatures=True,
activationFunction="selu",
lossFunction="bce",
optimizer="Adam",
fnnType="FNN",
)

test_pred_opts = options.Options(
inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
outputDir=f"{project_directory}/output_data/console_test",
outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv",
ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model",
type="smiles",
fpType="topological",
)


def traindmpnn(opts: options.GnnOptions):
def traindmpnn(opts: options.GnnOptions) -> None:
"""
Train a D-MPNN model using the given options.
Args:
- opts: options.GnnOptions instance containing the training configuration
Returns:
- None
"""
os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
ignore_elements = ["py/object"]
# Load options from a JSON file and replace the relevant attributes in `opts`
arguments = createArgsFromJson(
opts.configFile, ignore_elements, return_json_object=False
)
arguments = createArgsFromJson(jsonFile=opts.configFile)
opts = cp.args.TrainArgs().parse_args(arguments)
logging.info("Training DMPNN...")
# Train the model and get the mean and standard deviation of the AUC score from cross-validation
mean_score, std_score = cp.train.cross_validate(
args=opts, train_func=cp.train.run_training
)
logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}")


def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None:
def predictdmpnn(opts: options.GnnOptions) -> None:
"""
Predict target values with a trained D-MPNN model using the given options.
Args:
- opts: options.GnnOptions instance containing the prediction configuration
- JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction
Returns:
- None
"""
ignore_elements = [
"py/object",
"checkpoint_paths",
"save_dir",
"saving_name",
]
# Load options and additional arguments from a JSON file
arguments, data = createArgsFromJson(
json_arg_path, ignore_elements, return_json_object=True
)
arguments.append("--preds_path")
arguments.append("")
save_dir = data.get("save_dir")
name = data.get("saving_name")
# Replace relevant attributes in `opts` with loaded options
arguments = createArgsFromJson(jsonFile=opts.configFile)
opts = cp.args.PredictArgs().parse_args(arguments)
opts.preds_path = save_dir + "/" + name
df = pd.read_csv(opts.test_path)
smiles = []
for index, rows in df.iterrows():
my_list = [rows.smiles]
smiles.append(my_list)
# Make predictions and return the result
cp.train.make_predictions(args=opts, smiles=smiles)

cp.train.make_predictions(args=opts)
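
For reference, the deleted loop built the list-of-lists SMILES format chemprop expects; `cp.train.make_predictions` accepts such a list through its `smiles` keyword and otherwise reads the molecules from `args.test_path`, which is what the simplified call relies on. A small self-contained sketch of that shape (column name as in the old code):

```python
import pandas as pd

df = pd.DataFrame({"smiles": ["CCO", "c1ccccc1"]})
# chemprop expects one inner list per datapoint:
smiles = [[s] for s in df["smiles"]]
print(smiles)  # [['CCO'], ['c1ccccc1']]
```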


def train(opts: options.Options):
"""
Run the main training procedure
:param opts: Options defining the details of the training
"""

os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
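
Side note: `CUDA_VISIBLE_DEVICES` masks which GPUs the process can see and must be set before the backend initializes its device context. A minimal illustration (device index is an example):

```python
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"   # expose only the first GPU
# os.environ["CUDA_VISIBLE_DEVICES"] = ""  # hide all GPUs -> CPU-only run
```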

# import data from file and create DataFrame
if "tsv" in opts.inputFile:
df = fp.importDataFile(
@@ -128,7 +62,7 @@ def train(opts: options.Options):
df = fp.importDataFile(
opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize
)
# initialize encoders to None
# initialize (auto)encoders to None
encoder = None
autoencoder = None
if opts.trainAC:
@@ -142,26 +76,26 @@ def train(opts: options.Options):
# if feature compression is enabled
if opts.compressFeatures:
if not opts.trainAC:
if opts.aeType == "deterministic":
(autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
elif opts.aeType == "variational":
if opts.aeType == "variational":
(autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
elif opts.ecWeightsFile == "":
encoder = load_model(opts.ecModelDir)
else:
(autoencoder, encoder) = ac.define_ac_model(opts=options.Options())

if opts.ecWeightsFile != "":
autoencoder.load_weights(
os.path.join(opts.ecModelDir, opts.ecWeightsFile)
)
# compress the fingerprints using the autoencoder
df = ac.compress_fingerprints(df, encoder)
# ac.visualize_fingerprints(
# df,
# before_col="fp",
# after_col="fpcompressed",
# train_indices=train_indices,
# test_indices=test_indices,
# save_as=f"UMAP_{opts.aeSplitType}.png",
# )
if opts.visualizeLatent:
ac.visualize_fingerprints(
df,
before_col="fp",
after_col="fpcompressed",
train_indices=train_indices,
test_indices=test_indices,
save_as=f"UMAP_{opts.aeSplitType}.png",
)
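
Conceptually, the compression step runs the trained encoder over the binary fingerprint matrix to obtain low-dimensional latent codes. The sketch below shows the assumed shape flow; `df` and `encoder` come from the surrounding code, and the internals of `ac.compress_fingerprints` may differ:

```python
import numpy as np

# one fingerprint per row, as used throughout dfpl
fp_matrix = np.stack(df["fp"].to_numpy())   # shape (n, opts.fpSize), e.g. 2048 bits
latent = encoder.predict(fp_matrix)         # shape (n, opts.encFPSize), e.g. 256 dims
df["fpcompressed"] = list(latent)           # one latent vector per row
```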
# train single label models if requested
if opts.trainFNN and not opts.enableMultiLabel:
sl.train_single_label_models(df=df, opts=opts)
@@ -193,9 +127,7 @@ def predict(opts: options.Options) -> None:
if opts.aeType == "variational":
(autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
# Load trained model for autoencoder
if opts.ecWeightsFile == "":
encoder = load_model(opts.ecModelDir)
else:
if opts.ecWeightsFile != "":
encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile))
df = ac.compress_fingerprints(df, encoder)

@@ -257,36 +189,22 @@ def main():
raise ValueError("Input directory is not a directory")
elif prog_args.method == "traingnn":
traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args)

createLogger("traingnn.log")
traindmpnn(traingnn_opts)

elif prog_args.method == "predictgnn":
predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
predictgnn_opts,
test_path=makePathAbsolute(predictgnn_opts.test_path),
preds_path=makePathAbsolute(predictgnn_opts.preds_path),
)

logging.info(
f"The following arguments are received or filled with default values:\n{prog_args}"
)

predictdmpnn(fixed_opts, prog_args.configFile)
createLogger("predictgnn.log")
predictdmpnn(predictgnn_opts)

elif prog_args.method == "train":
train_opts = options.Options.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
train_opts,
inputFile=makePathAbsolute(train_opts.inputFile),
outputDir=makePathAbsolute(train_opts.outputDir),
)
createDirectory(fixed_opts.outputDir)
createLogger(path.join(fixed_opts.outputDir, "train.log"))
createDirectory(train_opts.outputDir)
createLogger(path.join(train_opts.outputDir, "train.log"))
logging.info(
f"The following arguments are received or filled with default values:\n{fixed_opts}"
f"The following arguments are received or filled with default values:\n{train_opts}"
)
train(fixed_opts)
train(train_opts)
elif prog_args.method == "predict":
predict_opts = options.Options.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
@@ -298,8 +216,6 @@ def main():
),
ecModelDir=makePathAbsolute(predict_opts.ecModelDir),
fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir),
trainAC=False,
trainFNN=False,
)
createDirectory(fixed_opts.outputDir)
createLogger(path.join(fixed_opts.outputDir, "predict.log"))
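
The `dataclasses.replace` pattern kept in the predict branch returns a modified copy rather than mutating the options object in place. A minimal self-contained illustration:

```python
import dataclasses

@dataclasses.dataclass(frozen=True)
class Opts:
    inputFile: str
    outputDir: str

opts = Opts("data.csv", "out")
fixed = dataclasses.replace(opts, inputFile="/abs/data.csv")
print(opts.inputFile, "->", fixed.inputFile)  # the original is untouched
```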
60 changes: 15 additions & 45 deletions dfpl/autoencoder.py
@@ -1,7 +1,6 @@
import logging
import math
import os.path
from os.path import basename
from typing import Tuple

import matplotlib.pyplot as plt
@@ -32,9 +31,13 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod
"""
input_size = opts.fpSize
encoding_dim = opts.encFPSize
ac_optimizer = optimizers.Adam(
learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay
lr_schedule = optimizers.schedules.ExponentialDecay(
opts.aeLearningRate,
decay_steps=1000,
decay_rate=opts.aeLearningRateDecay,
staircase=True,
)
ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule)
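
For reference, Keras' `ExponentialDecay` computes `lr(step) = initial_lr * decay_rate ** (step / decay_steps)`; with `staircase=True` the exponent is floored, so the rate drops in discrete steps rather than continuously. A worked example with illustrative numbers:

```python
initial_lr, decay_rate, decay_steps = 1e-3, 0.96, 1000

for step in (0, 500, 1000, 2500):
    lr = initial_lr * decay_rate ** (step // decay_steps)  # staircase variant
    print(step, lr)
# 0 0.001
# 500 0.001
# 1000 0.00096
# 2500 0.0009216
```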

if output_bias is not None:
output_bias = initializers.Constant(output_bias)
@@ -104,7 +107,6 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod
)(decoded)

# output layer
# to either 0 or 1 and hence we use sigmoid activation function.
decoded = Dense(
units=input_size, activation="sigmoid", bias_initializer=output_bias
)(decoded)
@@ -145,37 +147,9 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model:
if opts.aeWabTracking and not opts.wabTracking:
wandb.init(project=f"AE_{opts.aeSplitType}")

# Define output files for autoencoder and encoder weights
if opts.ecWeightsFile == "":
# If no encoder weights file is specified, use the input file name to generate a default file name
logging.info("No AE encoder weights file specified")
base_file_name = (
os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType
)
logging.info(
f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5"
)
ac_weights_file = os.path.join(
opts.outputDir, base_file_name + ".autoencoder.weights.hdf5"
)
# ec_weights_file = os.path.join(
# opts.outputDir, base_file_name + ".encoder.weights.hdf5"
# )
else:
# If an encoder weights file is specified, use it as the encoder weights file name
logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}")
base_file_name = (
os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType
)
ac_weights_file = os.path.join(
opts.outputDir, base_file_name + ".autoencoder.weights.hdf5"
)
# ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile)

os.makedirs(opts.ecModelDir, exist_ok=True)
save_path = os.path.join(opts.ecModelDir, "autoencoder_weights.h5")
# Collect the callbacks for training
callback_list = callbacks.autoencoder_callback(
checkpoint_path=ac_weights_file, opts=opts
)

# Select all fingerprints that are valid and turn them into a numpy array
fp_matrix = np.array(
@@ -286,32 +260,29 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model:

# Set up the model of the AC w.r.t. the input size and the dimension of the bottleneck (z)
(autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias)

callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts)
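
`callbacks.autoencoder_callback` lives in dfpl; the sketch below shows one plausible shape for it, under the assumption that it builds a single weight-checkpointing callback (the call site wraps the result in a list, and the monitor key here is illustrative):

```python
from keras.callbacks import ModelCheckpoint

def autoencoder_callback(checkpoint_path: str, opts):
    """Assumed shape of the dfpl helper: checkpoint the best weights."""
    return ModelCheckpoint(
        filepath=checkpoint_path,
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=True,
    )
```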
# Train the autoencoder on the training data
auto_hist = autoencoder.fit(
x_train,
x_train,
callbacks=callback_list,
callbacks=[callback_list],
epochs=opts.aeEpochs,
batch_size=opts.aeBatchSize,
verbose=opts.verbose,
validation_data=(x_test, x_test) if opts.testSize > 0.0 else None,
)
logging.info(f"Autoencoder weights stored in file: {ac_weights_file}")

# Store the autoencoder training history and plot the metrics
ht.store_and_plot_history(
base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"),
base_file_name=save_path,
hist=auto_hist,
)

# Save the autoencoder callback model to disk
save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder")
if opts.testSize > 0.0:
(callback_autoencoder, callback_encoder) = define_ac_model(opts)
callback_encoder.save(filepath=save_path)
else:
encoder.save(filepath=save_path)
autoencoder.load_weights(save_path)
# Save the encoder weights
encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5"))

# Return the encoder model of the trained autoencoder
return encoder, train_indices, test_indices
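
Since only the weights are persisted now, downstream code is expected to rebuild the architecture before loading them. A sketch of the assumed reload path (file name matches the `save_weights` call above; `opts` and `fp_matrix` come from the caller's context):

```python
import os.path

_, encoder = define_ac_model(opts)  # rebuild the same architecture first
encoder.load_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5"))
compressed = encoder.predict(fp_matrix)  # fp_matrix: shape (n, opts.fpSize)
```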

@@ -386,7 +357,6 @@ def visualize_fingerprints(
palette = {"train": "blue", "test": "red"}

# Create the scatter plot
sns.set(style="white")
fig, ax = plt.subplots(figsize=(10, 8))
split = save_as.split("_", 1)
part_after_underscore = split[1]