Skip to content

Commit e73d36c

Browse files
authored
Merge pull request #42 from soulios/galaxy-format
Galaxy format
2 parents 8eae014 + 785eb54 commit e73d36c

File tree

16 files changed

+439 additions, -649 deletions

.github/workflows/pr.yml

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -75,6 +75,20 @@ jobs:
7575
echo "predict result directory missing" >&2
7676
exit 1
7777
fi
78+
rm -rf example/results_predict/
79+
rm -rf example/results_train/
80+
dfpl train -f example/train.json --trainAC TRUE --compressFeatures TRUE --aeType deterministic
81+
if [ ! -d example/results_train/ ]; then
82+
echo "training result directory missing" >&2
83+
exit 1
84+
fi
85+
tree example
86+
87+
dfpl predict -f example/predict.json --compressFeatures TRUE --aeType deterministic
88+
if [ ! -d example/results_predict/ ]; then
89+
echo "predict result directory missing" >&2
90+
exit 1
91+
fi
7892
echo "result lines "$(wc -l example/results_predict/smiles.csv)
7993
if [ "$(cat example/results_predict/smiles.csv | wc -l)" -lt "6" ]; then
8094
echo "predict result should have at least 6 lines. But had only $(cat example/results_predict/smiles.csv | wc -l)" >&2

dfpl/__main__.py

Lines changed: 34 additions & 115 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,10 @@
11
import dataclasses
22
import logging
33
import os.path
4-
import pathlib
54
from argparse import Namespace
65
from os import path
76

87
import chemprop as cp
9-
import pandas as pd
10-
from keras.models import load_model
118

129
from dfpl import autoencoder as ac
1310
from dfpl import feedforwardNN as fNN
@@ -17,108 +14,45 @@
1714
from dfpl import vae as vae
1815
from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute
1916

20-
project_directory = pathlib.Path(".").parent.parent.absolute()
21-
test_train_opts = options.Options(
22-
inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
23-
outputDir=f"{project_directory}/output_data/console_test",
24-
ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5",
25-
ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
26-
type="smiles",
27-
fpType="topological",
28-
epochs=100,
29-
batchSize=1024,
30-
fpSize=2048,
31-
encFPSize=256,
32-
enableMultiLabel=False,
33-
testSize=0.2,
34-
kFolds=2,
35-
verbose=2,
36-
trainAC=False,
37-
trainFNN=True,
38-
compressFeatures=True,
39-
activationFunction="selu",
40-
lossFunction="bce",
41-
optimizer="Adam",
42-
fnnType="FNN",
43-
)
4417

45-
test_pred_opts = options.Options(
46-
inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
47-
outputDir=f"{project_directory}/output_data/console_test",
48-
outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv",
49-
ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
50-
fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model",
51-
type="smiles",
52-
fpType="topological",
53-
)
54-
55-
56-
def traindmpnn(opts: options.GnnOptions):
18+
def traindmpnn(opts: options.GnnOptions) -> None:
5719
"""
5820
Train a D-MPNN model using the given options.
5921
Args:
6022
- opts: options.GnnOptions instance containing the details of the training
6123
Returns:
6224
- None
6325
"""
64-
os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
65-
ignore_elements = ["py/object"]
6626
# Load options from a JSON file and replace the relevant attributes in `opts`
67-
arguments = createArgsFromJson(
68-
opts.configFile, ignore_elements, return_json_object=False
69-
)
27+
arguments = createArgsFromJson(jsonFile=opts.configFile)
7028
opts = cp.args.TrainArgs().parse_args(arguments)
7129
logging.info("Training DMPNN...")
72-
# Train the model and get the mean and standard deviation of AUC score from cross-validation
7330
mean_score, std_score = cp.train.cross_validate(
7431
args=opts, train_func=cp.train.run_training
7532
)
7633
logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}")
7734

7835

79-
def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None:
36+
def predictdmpnn(opts: options.GnnOptions) -> None:
8037
"""
8138
Predict the values using a trained D-MPNN model with the given options.
8239
Args:
8340
- opts: options.GnnOptions instance containing the details of the prediction
84-
- JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction
8541
Returns:
8642
- None
8743
"""
88-
ignore_elements = [
89-
"py/object",
90-
"checkpoint_paths",
91-
"save_dir",
92-
"saving_name",
93-
]
9444
# Load options and additional arguments from a JSON file
95-
arguments, data = createArgsFromJson(
96-
json_arg_path, ignore_elements, return_json_object=True
97-
)
98-
arguments.append("--preds_path")
99-
arguments.append("")
100-
save_dir = data.get("save_dir")
101-
name = data.get("saving_name")
102-
# Replace relevant attributes in `opts` with loaded options
45+
arguments = createArgsFromJson(jsonFile=opts.configFile)
10346
opts = cp.args.PredictArgs().parse_args(arguments)
104-
opts.preds_path = save_dir + "/" + name
105-
df = pd.read_csv(opts.test_path)
106-
smiles = []
107-
for index, rows in df.iterrows():
108-
my_list = [rows.smiles]
109-
smiles.append(my_list)
110-
# Make predictions and return the result
111-
cp.train.make_predictions(args=opts, smiles=smiles)
47+
48+
cp.train.make_predictions(args=opts)
11249

11350

11451
def train(opts: options.Options):
11552
"""
11653
Run the main training procedure
11754
:param opts: Options defining the details of the training
11855
"""
119-
120-
os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
121-
12256
# import data from file and create DataFrame
12357
if "tsv" in opts.inputFile:
12458
df = fp.importDataFile(
@@ -128,7 +62,7 @@ def train(opts: options.Options):
12862
df = fp.importDataFile(
12963
opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize
13064
)
131-
# initialize encoders to None
65+
# initialize (auto)encoders to None
13266
encoder = None
13367
autoencoder = None
13468
if opts.trainAC:
@@ -142,26 +76,27 @@ def train(opts: options.Options):
14276
# if feature compression is enabled
14377
if opts.compressFeatures:
14478
if not opts.trainAC:
145-
if opts.aeType == "deterministic":
146-
(autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
147-
elif opts.aeType == "variational":
79+
if opts.aeType == "variational":
14880
(autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
149-
elif opts.ecWeightsFile == "":
150-
encoder = load_model(opts.ecModelDir)
81+
elif opts.aeType == "deterministic":
82+
(autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
15183
else:
152-
autoencoder.load_weights(
153-
os.path.join(opts.ecModelDir, opts.ecWeightsFile)
154-
)
84+
raise ValueError(f"Unknown autoencoder type: {opts.aeType}")
85+
86+
if opts.ecWeightsFile != "":
87+
encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile))
15588
# compress the fingerprints using the autoencoder
15689
df = ac.compress_fingerprints(df, encoder)
157-
# ac.visualize_fingerprints(
158-
# df,
159-
# before_col="fp",
160-
# after_col="fpcompressed",
161-
# train_indices=train_indices,
162-
# test_indices=test_indices,
163-
# save_as=f"UMAP_{opts.aeSplitType}.png",
164-
# )
90+
if opts.visualizeLatent and opts.trainAC:
91+
logging.info("Visualizing latent space")
92+
ac.visualize_fingerprints(
93+
df,
94+
before_col="fp",
95+
after_col="fpcompressed",
96+
train_indices=train_indices,
97+
test_indices=test_indices,
98+
save_as=f"UMAP_{opts.aeSplitType}.png",
99+
)
165100
# train single label models if requested
166101
if opts.trainFNN and not opts.enableMultiLabel:
167102
sl.train_single_label_models(df=df, opts=opts)
@@ -193,10 +128,10 @@ def predict(opts: options.Options) -> None:
193128
if opts.aeType == "variational":
194129
(autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
195130
# Load trained model for autoencoder
196-
if opts.ecWeightsFile == "":
197-
encoder = load_model(opts.ecModelDir)
198-
else:
131+
if opts.ecWeightsFile != "":
199132
encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile))
133+
else:
134+
raise ValueError("No weights file specified for encoder")
200135
df = ac.compress_fingerprints(df, encoder)
201136

202137
# Run predictions on the compressed fingerprints and store the results in a dataframe
@@ -257,36 +192,22 @@ def main():
257192
raise ValueError("Input directory is not a directory")
258193
elif prog_args.method == "traingnn":
259194
traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
260-
195+
createLogger("traingnn.log")
261196
traindmpnn(traingnn_opts)
262197

263198
elif prog_args.method == "predictgnn":
264199
predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
265-
fixed_opts = dataclasses.replace(
266-
predictgnn_opts,
267-
test_path=makePathAbsolute(predictgnn_opts.test_path),
268-
preds_path=makePathAbsolute(predictgnn_opts.preds_path),
269-
)
270-
271-
logging.info(
272-
f"The following arguments are received or filled with default values:\n{prog_args}"
273-
)
274-
275-
predictdmpnn(fixed_opts, prog_args.configFile)
200+
createLogger("predictgnn.log")
201+
predictdmpnn(predictgnn_opts)
276202

277203
elif prog_args.method == "train":
278204
train_opts = options.Options.fromCmdArgs(prog_args)
279-
fixed_opts = dataclasses.replace(
280-
train_opts,
281-
inputFile=makePathAbsolute(train_opts.inputFile),
282-
outputDir=makePathAbsolute(train_opts.outputDir),
283-
)
284-
createDirectory(fixed_opts.outputDir)
285-
createLogger(path.join(fixed_opts.outputDir, "train.log"))
205+
createDirectory(train_opts.outputDir)
206+
createLogger(path.join(train_opts.outputDir, "train.log"))
286207
logging.info(
287-
f"The following arguments are received or filled with default values:\n{fixed_opts}"
208+
f"The following arguments are received or filled with default values:\n{train_opts}"
288209
)
289-
train(fixed_opts)
210+
train(train_opts)
290211
elif prog_args.method == "predict":
291212
predict_opts = options.Options.fromCmdArgs(prog_args)
292213
fixed_opts = dataclasses.replace(
@@ -298,8 +219,6 @@ def main():
298219
),
299220
ecModelDir=makePathAbsolute(predict_opts.ecModelDir),
300221
fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir),
301-
trainAC=False,
302-
trainFNN=False,
303222
)
304223
createDirectory(fixed_opts.outputDir)
305224
createLogger(path.join(fixed_opts.outputDir, "predict.log"))

dfpl/autoencoder.py

Lines changed: 16 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,13 @@
11
import logging
22
import math
33
import os.path
4-
from os.path import basename
54
from typing import Tuple
65

76
import matplotlib.pyplot as plt
87
import numpy as np
98
import pandas as pd
109
import seaborn as sns
11-
import umap
10+
import umap.umap_ as umap
1211
import wandb
1312
from sklearn.model_selection import train_test_split
1413
from tensorflow.keras import initializers, losses, optimizers
@@ -32,9 +31,13 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod
3231
"""
3332
input_size = opts.fpSize
3433
encoding_dim = opts.encFPSize
35-
ac_optimizer = optimizers.Adam(
36-
learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay
34+
lr_schedule = optimizers.schedules.ExponentialDecay(
35+
opts.aeLearningRate,
36+
decay_steps=1000,
37+
decay_rate=opts.aeLearningRateDecay,
38+
staircase=True,
3739
)
40+
ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule)
3841

3942
if output_bias is not None:
4043
output_bias = initializers.Constant(output_bias)
@@ -104,7 +107,6 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod
104107
)(decoded)
105108

106109
# output layer
107-
# to either 0 or 1 and hence we use sigmoid activation function.
108110
decoded = Dense(
109111
units=input_size, activation="sigmoid", bias_initializer=output_bias
110112
)(decoded)
@@ -145,37 +147,9 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model:
145147
if opts.aeWabTracking and not opts.wabTracking:
146148
wandb.init(project=f"AE_{opts.aeSplitType}")
147149

148-
# Define output files for autoencoder and encoder weights
149-
if opts.ecWeightsFile == "":
150-
# If no encoder weights file is specified, use the input file name to generate a default file name
151-
logging.info("No AE encoder weights file specified")
152-
base_file_name = (
153-
os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType
154-
)
155-
logging.info(
156-
f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5"
157-
)
158-
ac_weights_file = os.path.join(
159-
opts.outputDir, base_file_name + ".autoencoder.weights.hdf5"
160-
)
161-
# ec_weights_file = os.path.join(
162-
# opts.outputDir, base_file_name + ".encoder.weights.hdf5"
163-
# )
164-
else:
165-
# If an encoder weights file is specified, use it as the encoder weights file name
166-
logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}")
167-
base_file_name = (
168-
os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType
169-
)
170-
ac_weights_file = os.path.join(
171-
opts.outputDir, base_file_name + ".autoencoder.weights.hdf5"
172-
)
173-
# ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile)
174-
150+
os.makedirs(opts.ecModelDir, exist_ok=True)
151+
save_path = os.path.join(opts.ecModelDir, "autoencoder_weights.h5")
175152
# Collect the callbacks for training
176-
callback_list = callbacks.autoencoder_callback(
177-
checkpoint_path=ac_weights_file, opts=opts
178-
)
179153

180154
# Select all fingerprints that are valid and turn them into a numpy array
181155
fp_matrix = np.array(
@@ -286,32 +260,29 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model:
286260

287261
# Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!)
288262
(autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias)
289-
263+
callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts)
290264
# Train the autoencoder on the training data
291265
auto_hist = autoencoder.fit(
292266
x_train,
293267
x_train,
294-
callbacks=callback_list,
268+
callbacks=[callback_list],
295269
epochs=opts.aeEpochs,
296270
batch_size=opts.aeBatchSize,
297271
verbose=opts.verbose,
298272
validation_data=(x_test, x_test) if opts.testSize > 0.0 else None,
299273
)
300-
logging.info(f"Autoencoder weights stored in file: {ac_weights_file}")
301274

302275
# Store the autoencoder training history and plot the metrics
303276
ht.store_and_plot_history(
304-
base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"),
277+
base_file_name=save_path,
305278
hist=auto_hist,
306279
)
307280

308281
# Save the autoencoder callback model to disk
309-
save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder")
310-
if opts.testSize > 0.0:
311-
(callback_autoencoder, callback_encoder) = define_ac_model(opts)
312-
callback_encoder.save(filepath=save_path)
313-
else:
314-
encoder.save(filepath=save_path)
282+
autoencoder.load_weights(save_path)
283+
# Save the encoder weights
284+
encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5"))
285+
315286
# Return the encoder model of the trained autoencoder
316287
return encoder, train_indices, test_indices
317288

@@ -386,7 +357,6 @@ def visualize_fingerprints(
386357
palette = {"train": "blue", "test": "red"}
387358

388359
# Create the scatter plot
389-
sns.set(style="white")
390360
fig, ax = plt.subplots(figsize=(10, 8))
391361
split = save_as.split("_", 1)
392362
part_after_underscore = split[1]

0 commit comments

Comments
 (0)