Skip to content

Commit 43fbeb9

Browse files
committed
changes
1 parent 0589051 commit 43fbeb9

File tree

694 files changed

+747442
-422495
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

694 files changed

+747442
-422495
lines changed

.RData

48 Bytes
Binary file not shown.

.Rhistory

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
install.packages("rJava")
2+
q()
3+
install.packages("rJava")
4+
install.packages("rJava", type = "source")
5+
q()

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -143,15 +143,15 @@ you can provide all necessary information as commandline-parameters. Check
143143

144144
```shell script
145145
python -m dfpl --help
146-
python -m dfpl train --help
147-
python -m dfpl predict --help
146+
python -m dfpl train-good1 --help
147+
python -m dfpl predict-good1 --help
148148
```
149149

150150
However, using JSON files that contain all train/predict options is an easy way to preserve what was run, and you can use
151151
them instead of providing multiple commandline arguments.
152152

153153
```shell script
154-
python -m dfpl train -f path/to/file.json
154+
python -m dfpl train-good1 -f path/to/file.json
155155
```
156156

157157
See, for example, the JSON files under `validation/case_XX`. Also, you can use the following to create template
@@ -161,7 +161,7 @@ JSON files for training or prediction
161161
import dfpl.options as opts
162162

163163
train_opts = opts.Options()
164-
train_opts.saveToFile("train.json")
164+
train_opts.saveToFile("train-good1.json")
165165

166166
predict_opts = opts.Options()
167167
predict_opts.saveToFile("predict_bestER03.json")
@@ -176,7 +176,7 @@ of `dfpl.options.TrainingOptions` or
176176
import dfpl.__main__ as main
177177
import dfpl.options as opts
178178

179-
o = opts.Options.fromJson("/path/to/train.json")
179+
o = opts.Options.fromJson("/path/to/train-good1.json")
180180
main.train(o)
181181
```
182182

cases/regression_example/create_regression_data.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,15 @@
88
def get_args():
99
parser = argparse.ArgumentParser()
1010
parser.add_argument("--filename",
11-
help="Path to CSV file containing classification train data which shall be randomly "
11+
help="Path to CSV file containing classification train-good1 data which shall be randomly "
1212
"transformed to regression data")
1313
args = parser.parse_args()
1414
return args
1515

1616

1717
def main():
1818
"""
19-
Reads the 'example/train_data.csv' file containing train data. Generates random values in (0, 2] for active
19+
Reads the 'example/train_data.csv' file containing train-good1 data. Generates random values in (0, 2] for active
2020
compounds (class: 1) and 0 for inactive compounds. Values are replaced. The original column names are kept. The
2121
result is written to a new CSV file 'example/train_data_reg.csv' where '_reg' stands for regression.
2222

dfpl/normalization.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def normalize_acc_values(df, column_name='AR', output_dir='.'):
1616
"""
1717
logging.info("Normalizing ACC values...")
1818
print("Normalizing ACC values...")
19-
scaler = MinMaxScaler(feature_range=(0, 1))
19+
scaler = MinMaxScaler(feature_range=(-1, 1))
2020
acc_values = df[column_name].values.reshape(-1, 1)
2121
scaled_acc_values = scaler.fit_transform(acc_values)
2222
df[column_name] = scaled_acc_values

dfpl/single_label_model.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import math
33
import shutil
44
import sys
5+
import os
56
from os import path
67
from time import time
78

@@ -313,7 +314,7 @@ def acper(y_true, y_pred, t: float = 0.02):
313314
yield False
314315

315316

316-
def evaluate_regression_model(x_test: np.ndarray, y_test: np.ndarray, file_prefix: str, model: Model,
317+
def evaluate_regression_model(x_test: np.ndarray, y_test: np.ndarray,file_prefix: str, model: Model,
317318
target: str, fold: int, threshold: float = 0.05) -> pd.DataFrame:
318319
"""
319320
This function returns the values of performance metrics for the regression model.
@@ -338,7 +339,6 @@ def evaluate_regression_model(x_test: np.ndarray, y_test: np.ndarray, file_prefi
338339

339340
y_predict = model.predict(x_test).flatten()
340341
pd.DataFrame(y_predict).to_csv(path_or_buf=f"{file_prefix}.y_test_predict.csv")
341-
342342
error = np.array(y_predict) - np.array(y_test)
343343
abs_error = abs(error)
344344

@@ -472,6 +472,7 @@ def fit_and_evaluate_model(x_train: np.ndarray, x_test: np.ndarray, y_train: np.
472472
# use callback model for evaluation
473473
callback_model = define_single_label_model(input_size=x_train.shape[1], opts=opts)
474474
callback_model.load_weights(filepath=checkpoint_model_weights_path)
475+
# save_split_data(x_train, x_test, y_train, y_test, fold=fold, target=target,opts=opts)
475476

476477
if opts.fnnType == 'REG':
477478
pl.plot_loss(hist=hist, file=f"{model_file_prefix}.history.jpg")
@@ -485,6 +486,20 @@ def fit_and_evaluate_model(x_train: np.ndarray, x_test: np.ndarray, y_train: np.
485486
target=target, fold=fold)
486487

487488
return performance
489+
#def save_split_data(x_train, x_test, y_train, y_test, fold, target,opts: options.Options):
490+
# """Helper function to save combined x and y data in the same CSV files for train/test splits."""
491+
# # Combine x and y into a single DataFrame for train and test
492+
# train_df = pd.DataFrame(x_train)
493+
# train_df[target] = y_train # Adding y values as a new column to the x data
494+
495+
# test_df = pd.DataFrame(x_test)
496+
# test_df[target] = y_test # Adding y values as a new column to the x data
497+
498+
# Generate file names based on fold_no (0 for single fold) and save CSVs
499+
# train_df.to_csv(os.path.join(opts.outputDir, f"train_fold_{fold}_{target}.csv"), index=True)
500+
# test_df.to_csv(os.path.join(opts.outputDir, f"test_fold_{fold}_{target}.csv"), index=True)
501+
502+
488503

489504

490505
def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None:
@@ -542,24 +557,36 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None:
542557
trained_model.load_weights(path.join(opts.outputDir, f"{target}_single-labeled_Fold-0.model.weights.hdf5"))
543558
trained_model.save(filepath=path.join(opts.outputDir, f"{target}_saved_model"))
544559

560+
545561
elif 1 < opts.kFolds < int(x.shape[0] / 100):
546562
# do a k-fold cross-validation
547563
if opts.fnnType != 'REG':
548564
kfold_c_validator = StratifiedKFold(n_splits=opts.kFolds, shuffle=True, random_state=42)
549565
else:
550566
kfold_c_validator = KFold(n_splits=opts.kFolds, shuffle=True, random_state=42)
567+
568+
569+
551570
fold_no = 1
552571
# split the data
553572
for train, test in kfold_c_validator.split(x, y):
554573
# for testing use one of the splits:
555574
# kf = kfold_c_validator.split(x, y)
556575
# train, test = next(kf)
576+
train_indices_list = pd.DataFrame(train)
577+
test_indices_list = pd.DataFrame(test)
578+
557579
performance = fit_and_evaluate_model(x_train=x[train], x_test=x[test],
558580
y_train=y[train], y_test=y[test],
559581
fold=fold_no, target=target, opts=opts)
560582
performance_list.append(performance)
561-
fold_no += 1
583+
584+
# fold_no += 1
562585
# now next fold
586+
train_indices_list.to_csv(os.path.join(opts.outputDir, f"train_fold_{fold_no}.csv"),index=False,header=["Train Index"])
587+
test_indices_list.to_csv(os.path.join(opts.outputDir, f"test_fold_{fold_no}.csv"),index=False,header=["Test Index"])
588+
fold_no += 1
589+
563590

564591
# select and copy best model - how to define the best model?
565592
if opts.fnnType == 'REG':
@@ -579,6 +606,10 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None:
579606
ignore_index=True)['fold'][0]
580607
)
581608

609+
610+
611+
612+
582613
# copy checkpoint model weights
583614
shutil.copy(
584615
src=path.join(opts.outputDir, f"{target}_single-labeled_Fold-{best_fold}.model.weights.hdf5"),
Binary file not shown.

0 commit comments

Comments
 (0)