import dataclasses
import logging
import pathlib
import sys
from argparse import Namespace
from os import path

from tensorflow import keras
import wandb

from dfpl.utils import makePathAbsolute, createDirectory
from dfpl import options
from dfpl import fingerprint as fp
from dfpl import autoencoder as ac
from dfpl import feedforwardNN as fNN
from dfpl import predictions
from dfpl import single_label_model as sl
from dfpl.normalization import normalize_acc_values, inverse_transform_predictions
# Note: wandb is initialized inside train() when opts.wabTracking is enabled,
# so there is no need for an unconditional wandb.init() at import time.
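
# The two Options instances below (test_train_opts, test_pred_opts) are hard-coded
# presets for console testing of the regression workflow; main() builds its Options
# from the command line and does not read them.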
project_directory = pathlib.Path(".").parent.parent.absolute()
test_train_opts = options.Options(
    inputFile=f'{project_directory}/input_datasets/toxcast_regression_AR.csv',
    outputDir=f'{project_directory}/output_data/console_test',
    ecWeightsFile=f'{project_directory}/output_data/case_regression_01/AR/ae.encoder.hdf5',
    ecModelDir=f'{project_directory}/output_data/case_regression_01/AR/saved_model',
    type='inchi',
    fpType='topological',
    epochs=100,
    batchSize=1024,
    fpSize=2048,
    encFPSize=256,
    enableMultiLabel=False,
    testSize=0.2,
    kFolds=1,
    verbose=2,
    trainAC=False,
    trainFNN=True,
    compressFeatures=False,
    activationFunction="selu",
    lossFunction='mae',
    optimizer='Adam',
    fnnType='REG',  # todo: replace useRegressionModel with fnnType variable
    wabTarget='AR',
    wabTracking=True,
    normalizeACC=False  # min-max scale the regression target before training (see train())
)

test_pred_opts = options.Options(
    inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
    outputDir=f"{project_directory}/output_data/console_test",
    outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv",
    ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
    fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model",
    type="smiles",
    fpType="topological"
)
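
# Note: this prediction preset does not set scalerFilePath, so predict() below will
# log a warning and skip the inverse transformation of the predicted values.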


def train(opts: options.Options):
    """
    Run the main training procedure.

    :param opts: Options defining the details of the training
    """
    if opts.wabTracking:
        wandb.init(project=f"dfpl-reg-training-{opts.wabTarget}", entity="dfpl_regression", config=vars(opts))
        # In a wandb sweep, the tuned hyperparameters could be read back via wandb.config.

    # df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)
    df = fp.importDataFile(opts.inputFile, import_function=fp.importCSV, fp_size=opts.fpSize)

    # Create the output directory if it doesn't exist. main() already does this, but
    # train() can also be called directly (e.g. with the test_train_opts preset above).
    createDirectory(opts.outputDir)
    encoder = None

    if opts.trainAC:
        # train an autoencoder on the full feature matrix
        encoder = ac.train_full_ac(df, opts)

    if opts.compressFeatures:
        if not opts.trainAC:
            # load the trained autoencoder model
            encoder = keras.models.load_model(opts.ecModelDir)
        # compress the fingerprints using the autoencoder
        df = ac.compress_fingerprints(df, encoder)

    if opts.normalizeACC:
        # min-max scale the regression target before training; normalize_acc_values
        # also persists the fitted scaler so predictions can be mapped back later
        df, scaler_path = normalize_acc_values(df, column_name='AR', output_dir=opts.outputDir)
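        # A minimal sketch of what normalize_acc_values is assumed to do, based on
        # the MinMaxScaler/pickle usage elsewhere in this project (an assumption,
        # not the verbatim dfpl.normalization implementation):
        #
        #   scaler = MinMaxScaler()
        #   df[[column_name]] = scaler.fit_transform(df[[column_name]])
        #   scaler_path = path.join(output_dir, "scaler.pkl")
        #   with open(scaler_path, "wb") as f:
        #       pickle.dump(scaler, f)
        #
        # Note: scaler_path is currently not propagated into the prediction options;
        # predict() expects it as opts.scalerFilePath.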

    if opts.trainFNN:
        # train single-label models
        # fNN.train_single_label_models(df=df, opts=opts)
        sl.train_single_label_models(df=df, opts=opts)

    # train multi-label models
    if opts.enableMultiLabel:
        fNN.train_nn_models_multi(df=df, opts=opts)
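
# Example of running the training preset directly from a Python console (illustrative;
# it assumes the input files referenced above exist and, since wabTracking=True,
# a configured wandb account):
#
#   train(test_train_opts)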


def predict(opts: options.Options) -> None:
    """
    Run prediction given specific options.

    :param opts: Options defining the details of the prediction
    """
    df = fp.importDataFile(opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize)

    # Create the output directory if it doesn't exist
    createDirectory(opts.outputDir)

    if opts.compressFeatures:
        # load the trained autoencoder model
        encoder = keras.models.load_model(opts.ecModelDir)
        # compress the fingerprints using the autoencoder
        df = ac.compress_fingerprints(df, encoder)

    logging.info(f"Input features:\n{df.head()}")
    # predict the target values
    df2 = predictions.predict_values(df=df, opts=opts)
    logging.info(f"Raw predictions:\n{df2.head()}")

    if opts.scalerFilePath:
        # map the predictions back to the original value range using the scaler
        # persisted during training (see normalize_acc_values in train())
        df2['predicted'] = inverse_transform_predictions(df2['predicted'].values, opts.scalerFilePath)
    else:
        logging.warning("No scalerFilePath provided in the options. "
                        "Skipping the inverse transformation of the predictions.")
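
    # For reference, a minimal sketch of what inverse_transform_predictions is
    # assumed to do (an assumption mirroring the normalization sketch above, not
    # the verbatim dfpl.normalization implementation):
    #
    #   with open(scaler_file, "rb") as f:
    #       scaler = pickle.load(f)  # the MinMaxScaler fitted during training
    #   return scaler.inverse_transform(values.reshape(-1, 1)).ravel()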

    names_columns = [c for c in df2.columns if c not in ['fp', 'fpcompressed']]

    output_file = path.join(opts.outputDir, opts.outputFile)
    df2[names_columns].to_csv(path_or_buf=output_file)
    logging.info(f"Prediction successful. Results written to '{output_file}'")


def createLogger(filename: str) -> None:
    """
    Set up a logger for the main function that also saves to a log file.
    """
    # get the root logger and set its level
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    # create a file handler that logs info messages
    fh = logging.FileHandler(filename, mode="w")
    fh.setLevel(logging.INFO)
    # create a console handler
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)
    # create formatters and add them to the handlers
    formatterFile = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    formatterConsole = logging.Formatter('%(levelname)-8s %(message)s')
    fh.setFormatter(formatterFile)
    ch.setFormatter(formatterConsole)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)
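

# main() dispatches on the sub-command parsed by options.createCommandlineParser():
#   convert  - convert all data files in a directory (read from prog_args.f)
#   train    - train the autoencoder / feed-forward models on an input data file
#   predict  - predict values for new compounds using a trained model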
def main():
    """
    Main function that runs training/prediction as defined by the command line arguments.
    """
    parser = options.createCommandlineParser()
    prog_args: Namespace = parser.parse_args()

    try:
        if prog_args.method == "convert":
            directory = makePathAbsolute(prog_args.f)
            if path.isdir(directory):
                createLogger(path.join(directory, "convert.log"))
                logging.info(f"Convert all data files in {directory}")
                fp.convert_all(directory)
            else:
                raise ValueError("Input directory is not a directory")
        elif prog_args.method == "train":
            train_opts = options.Options.fromCmdArgs(prog_args)
            fixed_opts = dataclasses.replace(
                train_opts,
                inputFile=makePathAbsolute(train_opts.inputFile),
                outputDir=makePathAbsolute(train_opts.outputDir)
            )
            createDirectory(fixed_opts.outputDir)
            createLogger(path.join(fixed_opts.outputDir, "train.log"))
            logging.info(f"The following arguments are received or filled with default values:\n{fixed_opts}")
            train(fixed_opts)
            sys.exit(0)
        elif prog_args.method == "predict":
            predict_opts = options.Options.fromCmdArgs(prog_args)
            fixed_opts = dataclasses.replace(
                predict_opts,
                inputFile=makePathAbsolute(predict_opts.inputFile),
                outputDir=makePathAbsolute(predict_opts.outputDir),
                outputFile=makePathAbsolute(path.join(predict_opts.outputDir, predict_opts.outputFile)),
                ecModelDir=makePathAbsolute(predict_opts.ecModelDir),
                fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir),
                trainAC=False,
                trainFNN=False
            )
            createDirectory(fixed_opts.outputDir)
            createLogger(path.join(fixed_opts.outputDir, "predict.log"))
            logging.info(f"The following arguments are received or filled with default values:\n{fixed_opts}")
            predict(fixed_opts)
            sys.exit(0)
    except AttributeError as e:
        print(e)
        parser.print_usage()


if __name__ == '__main__':
    main()