diff --git a/.gitignore b/.gitignore
index 72d98b0f..c5cfd1e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,18 @@ __pycache__
 *.egg*
 .DS_Store
+
+wandb
+
+
+legacy/test_automl/data
+legacy/test_automl/test.py
+*.pkl
+*.pt
+openfe-singlecell
+examples/tuning/cta_svm/test
+examples/tuning/cta_svm/train
+examples/tuning/cta_svm/map
+test
+train
+map
diff --git a/dance/legacy/automl_config/__init__.py b/dance/legacy/automl_config/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dance/legacy/automl_config/fun2code.py b/dance/legacy/automl_config/fun2code.py
new file mode 100644
index 00000000..25f76b65
--- /dev/null
+++ b/dance/legacy/automl_config/fun2code.py
@@ -0,0 +1,30 @@
+import scanpy as sc
+
+from dance.transforms.cell_feature import CellPCA, CellSVD, WeightedFeaturePCA
+from dance.transforms.filter import FilterGenesPercentile, FilterGenesRegression
+from dance.transforms.gene_holdout import GeneHoldout
+from dance.transforms.interface import AnnDataTransform
+from dance.transforms.mask import CellwiseMaskData, MaskData
+from dance.transforms.misc import SaveRaw
+from dance.transforms.normalize import ScaleFeature, ScTransformR
+
+# TODO: register more functions
+fun2code_dict = {
+    "normalize_total": AnnDataTransform(sc.pp.normalize_total, target_sum=1e4, key_added="n_counts"),
+    "log1p": AnnDataTransform(sc.pp.log1p, base=2),
+    "scaleFeature": ScaleFeature(split_names="ALL", mode="standardize"),
+    "scTransform": ScTransformR(mirror_index=1),
+    "filter_gene_by_count": AnnDataTransform(sc.pp.filter_genes, min_cells=1),
+    "filter_gene_by_percentile": FilterGenesPercentile(min_val=1, max_val=99, mode="sum"),
+    "highly_variable_genes": AnnDataTransform(sc.pp.highly_variable_genes),
+    "regress_out": AnnDataTransform(sc.pp.regress_out),
+    "Filter_gene_by_regress_score": FilterGenesRegression("enclasc"),
+    "cell_svd": CellSVD(),
+    "cell_weighted_pca": WeightedFeaturePCA(split_name="train"),
+    "cell_pca": CellPCA(),
+    "filter_cell_by_count": AnnDataTransform(sc.pp.filter_cells, min_genes=1),
+    "save_raw": SaveRaw(),
+    "cell_wise_mask_data": CellwiseMaskData(),
+    "mask_data": MaskData(),
+    "gene_hold_out": GeneHoldout()
+}  # function-to-code mapping: exposes transform objects under plain string names
diff --git a/dance/legacy/automl_config/readme.txt b/dance/legacy/automl_config/readme.txt
new file mode 100644
index 00000000..6c8af9b9
--- /dev/null
+++ b/dance/legacy/automl_config/readme.txt
@@ -0,0 +1,3 @@
+If you need to register a new function, first add it to the fun2code file.
+If you use step 2, declare the step 2 pipeline in step2_config.
+If you use step 3, register a new optimization function in step3_config.
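For reference, registering an additional preprocessing function amounts to adding one more entry to fun2code_dict; a minimal sketch, where the "scale" key and its arguments are illustrative and not part of this patch:

import scanpy as sc

from dance.legacy.automl_config.fun2code import fun2code_dict
from dance.transforms.interface import AnnDataTransform

# Map a plain string name to a configured transform instance; step2/step3
# configs can then refer to the transform by this name.
fun2code_dict["scale"] = AnnDataTransform(sc.pp.scale, max_value=10)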
diff --git a/dance/legacy/automl_config/step2_config.py b/dance/legacy/automl_config/step2_config.py
new file mode 100644
index 00000000..b2ead8b0
--- /dev/null
+++ b/dance/legacy/automl_config/step2_config.py
@@ -0,0 +1,155 @@
+import functools
+import itertools
+
+import wandb
+
+from dance import logger
+from dance.legacy.automl_config.fun2code import fun2code_dict
+from dance.transforms.misc import SetConfig
+
+# TODO: register more functions and add more examples
+pipline2fun_dict = {
+    "normalize": {
+        "values": ["normalize_total", "log1p", "scaleFeature", "scTransform"]
+    },
+    "gene_filter": {
+        "values":
+        ["filter_gene_by_count", "filter_gene_by_percentile", "highly_variable_genes", "Filter_gene_by_regress_score"]
+    },
+    "gene_dim_reduction": {
+        "values": ["cell_svd", "cell_weighted_pca", "cell_pca"]
+    },
+    "cell_filter": {
+        "values": ["filter_cell_by_count"]
+    },
+    "mask_name": {
+        "values": ["cell_wise_mask_data", "mask_data"]
+    },
+    "gene_hold_out_name": {
+        "values": ["gene_hold_out"]
+    }
+}  # functions registered for each preprocessing step
+
+
+def generate_combinations_with_required_elements(elements, required_elements=[]):
+    """
+    Parameters
+    ----------
+    elements
+        Optional steps in step 2.
+    required_elements
+        Required steps in step 2.
+    """
+    optional_elements = [x for x in elements if x not in required_elements]
+
+    # Sort optional elements in the same order as in the `elements` list
+    optional_elements.sort(key=lambda x: elements.index(x))
+
+    # Generate all possible combinations of optional elements
+    optional_combinations = []
+    for i in range(len(optional_elements) + 1):
+        optional_combinations += list(itertools.combinations(optional_elements, i))
+
+    # Combine required elements with optional combinations to get all possible combinations
+    all_combinations = []
+    for optional_combination in optional_combinations:
+        all_combinations.append([x for x in elements if x in required_elements or x in optional_combination])
+    return all_combinations
+
+
+def getFunConfig(selected_keys=None):
+    """Get the config that needs to be optimized and the number of sweep runs."""
+    global pipline2fun_dict
+    pipline2fun_dict_subset = {key: pipline2fun_dict[key] for key in selected_keys}
+    print(pipline2fun_dict_subset)
+    count = 1
+    for _, pipline_values in pipline2fun_dict_subset.items():
+        count *= len(pipline_values['values'])
+    return pipline2fun_dict_subset, count
+
+
+def get_transforms(config=None, set_data_config=True, save_raw=False):
+    """Build the Compose of preprocessing transforms according to the selected
+    preprocessing steps."""
+    if ("normalize" not in config.keys() or config.normalize != "log1p") and (
+            "gene_filter" in config.keys() and config.gene_filter == "highly_variable_genes"):
+        logger.warning(
+            "highly_variable_genes expects logarithmized data, except when flavor='seurat_v3', in which count data is expected."
+        )
+        return None
+
+    transforms = []
+    for key in config.keys():
+        if save_raw and key == "normalize":
+            transforms.append(fun2code_dict["save_raw"])
+        print(key, config[key])
+        if key in pipline2fun_dict.keys():
+            transforms.append(fun2code_dict[config[key]])
+    if save_raw and "normalize" not in config.keys():
+        transforms.append(fun2code_dict["save_raw"])
+    if set_data_config:
+        data_config = {"label_channel": "cell_type"}
+        if "gene_dim_reduction" in config.keys():
+            data_config.update({"feature_channel": fun2code_dict[config.gene_dim_reduction].name})
+        transforms.append(SetConfig(data_config))
+    return transforms
+
+
+def sweepDecorator(selected_keys=None, project="pytorch-cell_type_annotation_ACTINN"):
+    """Decorator for preprocessing configuration functions."""
+
+    def decorator(func):
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            pipline2fun_dict, count = getFunConfig(selected_keys)
+            parameters_dict = pipline2fun_dict
+            try:
+                sweep_config, train = func(parameters_dict)
+                sweep_id = wandb.sweep(sweep_config, project=project)
+                wandb.agent(sweep_id, train, count=count)
+            except Exception as e:  # the try/except only logs the failing call before re-raising
+                print(f"{func.__name__}{args}\n==> {e}")
+                raise e

+        return wrapper
+
+    return decorator
+
+
+def setStep2(func=None, original_list=None, required_elements=[]):
+    """Generate a decorated sweep function for each preprocessing combination."""
+    # all_combinations = [
+    #     combo for i in range(len(original_list) + 1)
+    #     for combo in generate_combinations_with_required_elements(original_list, i, required_elements=required_elements)
+    # ]
+    all_combinations = generate_combinations_with_required_elements(elements=original_list,
+                                                                    required_elements=required_elements)
+    generated_functions = []
+    for s_key in all_combinations:
+        s_list = list(s_key)
+        print(s_list)
+        decorator = sweepDecorator(selected_keys=s_list)
+        generated_functions.append(decorator(func))
+    return generated_functions
+
+
+def log_in_wandb(config):
+    """Decorator that wraps a train function with wandb initialization and logging."""
+
+    def decorator(func):
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            try:
+                with wandb.init(config=config):
+                    config_s = wandb.config
+                    result = func(config_s, *args, **kwargs)
+                    wandb.log(result)
+            except Exception as e:
+                print(f"{func.__name__}{args}\n==> {e}")
+                raise e
+
+        return wrapper
+
+    return decorator
diff --git a/dance/legacy/automl_config/step3_config.py b/dance/legacy/automl_config/step3_config.py
new file mode 100644
index 00000000..989206fb
--- /dev/null
+++ b/dance/legacy/automl_config/step3_config.py
@@ -0,0 +1,223 @@
+import functools
+
+import optuna
+import scanpy as sc
+import wandb
+from optuna.integration.wandb import WeightsAndBiasesCallback
+
+from dance import logger
+from dance.legacy.automl_config.fun2code import fun2code_dict
+from dance.legacy.automl_config.step2_config import pipline2fun_dict
+from dance.transforms.cell_feature import CellPCA, CellSVD, WeightedFeaturePCA
+from dance.transforms.filter import FilterGenesPercentile, FilterGenesRegression
+from dance.transforms.gene_holdout import GeneHoldout
+from dance.transforms.interface import AnnDataTransform
+from dance.transforms.mask import CellwiseMaskData, MaskData
+from dance.transforms.misc import SetConfig
+from dance.transforms.normalize import ScaleFeature, ScTransformR
+
+
+def set_method_name(func):
+    """Prefix trial parameter names with the wrapped method's name."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            method_name = 
func.__name__ + "_"
+            result = func(method_name, *args, **kwargs)
+            return result
+        except Exception as e:
+            print(f"{func.__name__}{args}\n==> {e}")
+            raise e
+
+    return wrapper
+
+
+@set_method_name
+def cell_pca(method_name: str, trial: optuna.Trial):
+    return CellPCA(n_components=trial.suggest_int(method_name + "n_components", 200, 5000))
+
+
+@set_method_name
+def cell_weighted_pca(method_name: str, trial: optuna.Trial):
+    return WeightedFeaturePCA(n_components=trial.suggest_int(method_name + "n_components", 200, 5000))
+
+
+@set_method_name
+def cell_svd(method_name: str, trial: optuna.Trial):
+    return CellSVD(n_components=trial.suggest_int(method_name + "n_components", 200, 5000))
+
+
+@set_method_name
+def Filter_gene_by_regress_score(method_name: str, trial: optuna.Trial):
+    return FilterGenesRegression(
+        method=trial.suggest_categorical(method_name + "method", ["enclasc", "seurat3", "scmap"]),
+        num_genes=trial.suggest_int(method_name + "num_genes", 5000, 6000))
+
+
+@set_method_name
+def highly_variable_genes(method_name: str, trial: optuna.Trial):
+    return AnnDataTransform(sc.pp.highly_variable_genes,
+                            min_mean=trial.suggest_float(method_name + "min_mean", 0.0025, 0.03),
+                            max_mean=trial.suggest_float(method_name + "max_mean", 1.5, 4.5),
+                            min_disp=trial.suggest_float(method_name + "min_disp", 0.25, 0.75),
+                            span=trial.suggest_float(method_name + "span", 0.2, 1.0),
+                            n_bins=trial.suggest_int(method_name + "n_bins", 10, 30),
+                            flavor=trial.suggest_categorical(method_name + "flavor", ['seurat', 'cell_ranger']))
+
+
+@set_method_name
+def filter_gene_by_percentile(method_name: str, trial: optuna.Trial):
+    return FilterGenesPercentile(min_val=trial.suggest_int(method_name + "min_val", 1, 10),
+                                 max_val=trial.suggest_int(method_name + "max_val", 90, 99),
+                                 mode=trial.suggest_categorical(method_name + "mode", ["sum", "var", "cv", "rv"]))
+
+
+@set_method_name
+def filter_gene_by_count(method_name: str, trial: optuna.Trial):
+    method = trial.suggest_categorical(method_name + "method", ['min_counts', 'min_cells', 'max_counts', 'max_cells'])
+    if method == "min_counts":
+        num = trial.suggest_int(method_name + "num", 2, 10)
+    if method == "min_cells":
+        num = trial.suggest_int(method_name + "num", 2, 10)
+    if method == "max_counts":
+        num = trial.suggest_int(method_name + "num", 500, 1000)
+    if method == "max_cells":
+        num = trial.suggest_int(method_name + "num", 500, 1000)
+    return AnnDataTransform(sc.pp.filter_genes, **{method: num})
+
+
+@set_method_name
+def log1p(method_name: str, trial: optuna.Trial):
+    return AnnDataTransform(sc.pp.log1p, base=trial.suggest_int(method_name + "base", 2, 10))
+
+
+@set_method_name
+def scTransform(method_name: str, trial: optuna.Trial):
+    return ScTransformR(min_cells=trial.suggest_int(method_name + "min_cells", 1, 10))
+
+
+@set_method_name
+def scaleFeature(method_name: str, trial: optuna.Trial):  # eps is not tuned
+    return ScaleFeature(
+        mode=trial.suggest_categorical(method_name + "mode", ["normalize", "standardize", "minmax", "l2"]))
+
+
+@set_method_name
+def normalize_total(method_name: str, trial: optuna.Trial):
+    exclude_highly_expressed = trial.suggest_categorical(method_name + "exclude_highly_expressed", [False, True])
+    if exclude_highly_expressed:
+        max_fraction = trial.suggest_float(method_name + "max_fraction", 0.08, 0.1)
+        return AnnDataTransform(sc.pp.normalize_total,
+                                target_sum=trial.suggest_categorical(method_name + "target_sum", [1e4, 1e5, 1e6]),
+                                
exclude_highly_expressed=exclude_highly_expressed, max_fraction=max_fraction,
+                                key_added="n_counts")
+    else:
+        return AnnDataTransform(sc.pp.normalize_total,
+                                target_sum=trial.suggest_categorical(method_name + "target_sum", [1e4, 1e5, 1e6]),
+                                exclude_highly_expressed=exclude_highly_expressed, key_added="n_counts")
+
+
+@set_method_name
+def filter_cell_by_count(method_name: str, trial: optuna.Trial):
+    method = trial.suggest_categorical(method_name + "method", ['min_counts', 'min_genes', 'max_counts', 'max_genes'])
+    if method == "min_counts":
+        num = trial.suggest_int(method_name + "num", 1, 10)
+    if method == "min_genes":
+        num = trial.suggest_int(method_name + "num", 1, 10)
+    if method == "max_counts":
+        num = trial.suggest_int(method_name + "num", 500, 1000)
+    if method == "max_genes":
+        num = trial.suggest_int(method_name + "num", 500, 1000)
+    return AnnDataTransform(sc.pp.filter_cells, **{method: num})
+
+
+@set_method_name
+def cell_wise_mask_data(method_name: str, trial: optuna.Trial):
+    return CellwiseMaskData(distr=trial.suggest_categorical(method_name + "distr", ['exp', 'uniform']),
+                            mask_rate=trial.suggest_float(method_name + "mask_rate", 0.01, 0.5),
+                            min_gene_counts=trial.suggest_int(method_name + "min_gene_counts", 1, 10))
+
+
+@set_method_name
+def mask_data(method_name: str, trial: optuna.Trial):
+    return MaskData(mask_rate=trial.suggest_float(method_name + "mask_rate", 0.01, 0.5))
+
+
+@set_method_name
+def gene_hold_out(method_name: str, trial: optuna.Trial):
+    return GeneHoldout(n_top=trial.suggest_int(method_name + "n_top", 1, 10),
+                       batch_size=trial.suggest_categorical(method_name + "batch_size", [256, 512, 1024]))
+
+
+# # Collect all functions defined in the current module
+# functions = [(name, obj) for name, obj in inspect.getmembers(
+#     sys.modules[__name__]) if inspect.isfunction(obj)]
+
+# print(functions)
+# # Iterate over and decorate each function
+# for name, function in functions:
+#     if name != "set_method_name":  # exclude the decorator function itself
+#         print(function)
+#         setattr(__name__, name, set_method_name(function))
+
+
+def get_transforms(trial, fun_list, set_data_config=True, save_raw=False):
+    """Build the Compose of preprocessing transforms from the given list of
+    preprocessing function names."""
+    transforms = []
+    for f_str in fun_list:
+        if f_str in pipline2fun_dict['normalize']['values'] and save_raw:
+            transforms.append(fun2code_dict["save_raw"])
+        fun_i = eval(f_str)
+        transforms.append(fun_i(trial))
+    if "highly_variable_genes" in fun_list and "log1p" not in fun_list[:fun_list.index("highly_variable_genes")]:
+        logger.warning(
+            "highly_variable_genes expects logarithmized data, except when flavor='seurat_v3', in which count data is expected."
+        )
+
+        # Whether highly_variable_genes really requires log1p first depends on the flavor
+        # parameter; this check needs further refinement.
+        return None
+    if set_data_config:
+        data_config = {"label_channel": "cell_type"}
+        feature_name = {"cell_svd", "cell_weighted_pca", "cell_pca"} & set(fun_list)
+        if feature_name:
+            data_config.update({"feature_channel": fun2code_dict[feature_name.pop()].name})
+        transforms.append(SetConfig(data_config))
+    return transforms
+
+
+def log_in_wandb(wandbc=None):
+    """Decorate optimization functions with wandb tracking and logging."""
+
+    def decorator(func):
+
+        def wrapper(*args, **kwargs):
+            wandb_decorator = wandbc.track_in_wandb()
+            decorator_function = wandb_decorator(func)
+            result = decorator_function(*args, **kwargs)
+            wandb.log(result)
+            values = list(result.values())
+            if len(values) == 1:
+                return values[0]
+            else:
+                return tuple(values)
+
+        return wrapper
+
+    return decorator
+
+
+def get_optimizer(project, objective, n_trials=2, direction="maximize"):
+    """Create an optuna study and return a callable that runs it."""
+    wandb_kwargs = {"project": project}
+    wandbc = WeightsAndBiasesCallback(wandb_kwargs=wandb_kwargs, as_multirun=True)
+    decorator = log_in_wandb(wandbc)
+    decorator_function = decorator(objective)
+    study = optuna.create_study(direction=direction)
+
+    def wrapper():
+        study.optimize(decorator_function, n_trials=n_trials, callbacks=[wandbc])
+
+    return wrapper
diff --git a/dance/modules/single_modality/clustering/scdcc.py b/dance/modules/single_modality/clustering/scdcc.py
index 8fbe0f6f..02c4ab78 100644
--- a/dance/modules/single_modality/clustering/scdcc.py
+++ b/dance/modules/single_modality/clustering/scdcc.py
@@ -521,7 +521,7 @@ def fit(
                           float(ml_loss.cpu()) + float(cl_loss.cpu()), ml_loss.cpu(), cl_loss.cpu())
             index = update_interval * np.argmax(aris)
-            self.q = Q[f"epoch{index}"]
+            self.q = Q[f"epoch{int(index)}"]
 
     def predict_proba(self, x: Optional[Any] = None) -> np.ndarray:
         """Get the predicted propabilities for each cell.
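The step3 helpers above compose as follows; a condensed sketch based on the step3 examples added later in this patch (the project name and fun_list values are illustrative, and data loading plus model training are elided):

import optuna

from dance.legacy.automl_config.step3_config import get_optimizer, get_transforms
from dance.transforms.misc import Compose

fun_list = ["log1p", "filter_gene_by_count"]  # preprocessing functions to tune


def objective(trial: optuna.Trial):
    # Sample one configuration of each preprocessing function for this trial
    transforms = get_transforms(trial=trial, fun_list=fun_list)
    if transforms is None:  # an incompatible combination was sampled
        return {"scores": 0}
    preprocessing_pipeline = Compose(*transforms, log_level="INFO")
    # ... load data, apply preprocessing_pipeline, train and score a model ...
    return {"scores": 0.0}  # placeholder for the real evaluation metric


start_optimizer = get_optimizer(project="step3-demo", objective=objective, n_trials=10)
start_optimizer()  # runs the optuna study with wandb logging via log_in_wandb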
diff --git a/dance/pipeline.py b/dance/pipeline.py
index 63c52649..c30a769d 100644
--- a/dance/pipeline.py
+++ b/dance/pipeline.py
@@ -243,6 +243,7 @@ def to_config(self) -> Config:
 class PipelinePlaner(Pipeline):
     TUNE_MODE_KEY = "tune_mode"
+    TUNING_PARAMS_KEY = "params_to_tune"
     DEFAULT_PARAMS_KEY = "default_params"
     PELEM_INCLUDE_KEY = "include"
     PELEM_EXCLUDE_KEY = "exclude"
@@ -371,9 +372,17 @@ def config(self, cfg: ConfigLike):
         elif self.tune_mode == "params":
             self._candidate_params = [None] * pipeline_length
             for i in range(pipeline_length):
-                self._default_params[i] = pipeline_config[i].get(self.DEFAULT_PARAMS_KEY)
-                if val := self[i].params:
-                    self._candidate_params[i] = val
+                if self.DEFAULT_PARAMS_KEY in pipeline_config[i]:
+                    logger.warning(f"params tuning mode ignores {self.DEFAULT_PARAMS_KEY!r}, which is "
+                                   f"currently specified in pipeline element #{i}:\n\t{pipeline_config[i]}")
+
+                # Set default params (auto set key to the current target)
+                if val := pipeline_config[i].get(self.PARAMS_KEY):
+                    self._default_params[i] = {self[i].target: val}
+
+                # Set tuning params
+                if val := pipeline_config[i].get(self.TUNING_PARAMS_KEY):
+                    self._candidate_params[i] = OmegaConf.to_container(val)
 
         # Make sure targets are set
         missed_target_idx = [
@@ -417,7 +426,7 @@ def _sanitize_pipeline(
             raise ValueError(f"Expecting {pipeline_length} targets specifications, "
                              f"but only got {len(pipeline)}: {pipeline}")
 
-        logger.info(f"Pipeline plane:\n{Color('green')(pformat(pipeline))}")
+        logger.info(f"Pipeline plan:\n{Color('green')(pformat(pipeline))}")
 
         return pipeline
 
@@ -434,7 +443,7 @@ def _sanitize_params(
             params_dict = params
             params = [None] * pipeline_length
             for i, j in params_dict.items():
-                idx, key = i.split(f"{Pipeline.PIPELINE_KEY}.", 1)[1].split(".", 1)
+                idx, key = i.split(f"{Pipeline.PARAMS_KEY}.", 1)[1].split(".", 1)
                 idx = int(idx)
 
                 logger.debug(f"Setting {key!r} for pipeline element {idx} to {j}")
@@ -450,7 +459,7 @@ def _sanitize_params(
             raise ValueError(f"Expecting {pipeline_length} targets specifications, "
                              f"but only got {len(params)}: {params}")
 
-        logger.info(f"Params plane:\n{Color('green')(pformat(params))}")
+        logger.info(f"Params plan:\n{Color('green')(pformat(params))}")
 
         return params
 
@@ -684,6 +693,9 @@ def search_space(self) -> Dict[str, Any]:
                     "type": "feature.cell",
                     "target": "WeightedFeaturePCA",
                     "params": {
+                        "out": "feature.cell",
+                    },
+                    "params_to_tune": {
                         "n_components": {
                             "values": [128, 256, 512, 1024],
                         },
diff --git a/dance/transforms/cell_feature.py b/dance/transforms/cell_feature.py
index 313b0364..af23a9a5 100644
--- a/dance/transforms/cell_feature.py
+++ b/dance/transforms/cell_feature.py
@@ -21,6 +21,9 @@ class WeightedFeaturePCA(BaseTransform):
         Number of PCs to use.
     split_name
         Which split to use to compute the gene PCA. If not set, use all data.
+    feat_norm_mode
+        Feature normalization mode, see :func:`dance.utils.matrix.normalize`. If set to `None`, no feature
+        normalization is performed before the reduction.
""" diff --git a/dance/transforms/graph/resept_graph.py b/dance/transforms/graph/resept_graph.py index 9ff8fb7a..21652daf 100644 --- a/dance/transforms/graph/resept_graph.py +++ b/dance/transforms/graph/resept_graph.py @@ -19,10 +19,12 @@ class RESEPTGraph(BaseTransform): """ - def __init__(self, fiducial_diameter_fullres=144.56835055243283, tissue_hires_scalef=0.150015, **kwargs): + def __init__(self, fiducial_diameter_fullres=144.56835055243283, tissue_hires_scalef=0.150015, + n_neighbors: int = 15, **kwargs): super().__init__(**kwargs) self.fiducial_diameter_fullres = fiducial_diameter_fullres self.tissue_hires_scalef = tissue_hires_scalef + self.n_neighbors = n_neighbors def scale_to_RGB(self, channel, truncated_percent): truncated_down = np.percentile(channel, truncated_percent) @@ -34,6 +36,7 @@ def scale_to_RGB(self, channel, truncated_percent): def __call__(self, data: Data) -> Data: xy_pixel = data.get_feature(return_type="numpy", channel="spatial_pixel", channel_type="obsm") + sc.pp.neighbors(data.data, n_neighbors=self.n_neighbors) sc.tl.umap(data.data, n_components=3) X_transform = data.get_feature(return_type="numpy", channel="X_umap", channel_type="obsm") X_transform[:, 0] = self.scale_to_RGB(X_transform[:, 0], 100) diff --git a/dance/transforms/normalize.py b/dance/transforms/normalize.py index 507da66b..296fe306 100644 --- a/dance/transforms/normalize.py +++ b/dance/transforms/normalize.py @@ -4,6 +4,7 @@ import anndata as ad import numpy as np import pandas as pd +import scanpy as sc import scipy.sparse as sp import statsmodels.discrete.discrete_model import statsmodels.nonparametric.kernel_regression @@ -13,6 +14,7 @@ from dance.data.base import Data from dance.registry import register_preprocessor from dance.transforms.base import BaseTransform +from dance.transforms.interface import AnnDataTransform from dance.typing import Dict, List, Literal, NormMode, Optional, Union from dance.utils.matrix import normalize @@ -483,3 +485,17 @@ def info(n, th, mu, y, w): t0 = max(t0, 0) return t0 + + +@register_preprocessor("normalize") +class Log1P(AnnDataTransform): + + def __init__(self, **kwargs): + super().__init__(sc.pp.log1p, **kwargs) + + +@register_preprocessor("normalize") +class NormalizeTotal(AnnDataTransform): + + def __init__(self, **kwargs): + super().__init__(sc.pp.normalize_total, **kwargs) diff --git a/dance/transforms/sc3_feature.py b/dance/transforms/sc3_feature.py index ebda74ee..bf9d88d0 100644 --- a/dance/transforms/sc3_feature.py +++ b/dance/transforms/sc3_feature.py @@ -86,5 +86,5 @@ def __call__(self, data): sim_matrix_all = np.array(sim_matrix_all) sim_matrix_mean = np.mean(sim_matrix_all, axis=0) - data.data.uns[self.out] = sim_matrix_mean + data.data.obsm[self.out] = sim_matrix_mean return data diff --git a/dance/utils/matrix.py b/dance/utils/matrix.py index 424e651c..4176b6b1 100644 --- a/dance/utils/matrix.py +++ b/dance/utils/matrix.py @@ -1,5 +1,3 @@ -import sys - import numba import numpy as np import torch diff --git a/examples/tuning/all_params_config.yaml b/examples/tuning/all_params_config.yaml new file mode 100644 index 00000000..e69de29b diff --git a/examples/tuning/all_pipline_config.yaml b/examples/tuning/all_pipline_config.yaml new file mode 100644 index 00000000..3e23617c --- /dev/null +++ b/examples/tuning/all_pipline_config.yaml @@ -0,0 +1,18 @@ +type: preprocessor +tune_mode: pipeline +pipeline: + - type: normalize + include: + - ScaleFeature + - ScTransform + - Log1P + - NormalizeTotal + - type: filter.gene + include: + - 
FilterGenesTopK
+      - FilterGenesPercentile
+      - FilterGenesMarker
+      - FilterGenesRegression
+      - FilterGenesMarkerGini
+      - FilterGenesCommon
+      - FilterGenesMatch
diff --git a/examples/tuning/cta_svm/main.py b/examples/tuning/cta_svm/main.py
index df9fb2bd..f7a1a5a7 100644
--- a/examples/tuning/cta_svm/main.py
+++ b/examples/tuning/cta_svm/main.py
@@ -28,7 +28,7 @@
     logger.setLevel(args.log_level)
     logger.info(f"\n{pprint.pformat(vars(args))}")
 
-    pipeline_planer = PipelinePlaner.from_config_file("tuning_config.yaml")
+    pipeline_planer = PipelinePlaner.from_config_file("examples/tuning/cta_svm/tuning_config_step3.yaml")
 
     def evaluate_pipeline():
         wandb.init()
@@ -39,9 +39,9 @@ def evaluate_pipeline():
         # Load raw data
         data = CellTypeAnnotationDataset(train_dataset=args.train_dataset, test_dataset=args.test_dataset,
                                          species=args.species, tissue=args.tissue).load_data()
-
-        # Prepare preprocessing pipeline and apply it to data
-        preprocessing_pipeline = pipeline_planer.generate(pipeline=dict(wandb.config))
+        preprocessing_pipeline = pipeline_planer.generate(pipeline=None, params=dict(wandb.config))
+        print(f"Pipeline config:\n{preprocessing_pipeline.to_yaml()}")
         preprocessing_pipeline(data)
diff --git a/examples/tuning/cta_svm/tuning_config.yaml b/examples/tuning/cta_svm/pipeline_tuning_config.yaml
similarity index 59%
rename from examples/tuning/cta_svm/tuning_config.yaml
rename to examples/tuning/cta_svm/pipeline_tuning_config.yaml
index b73c86db..7a43bf6a 100644
--- a/examples/tuning/cta_svm/tuning_config.yaml
+++ b/examples/tuning/cta_svm/pipeline_tuning_config.yaml
@@ -1,16 +1,28 @@
 type: preprocessor
 tune_mode: pipeline
 pipeline:
+  - type: filter.gene
+    include:
+      - FilterGenesTopK
+      - FilterGenesPercentile
+    default_params:
+      FilterGenesTopK:
+        num_genes: 2000
   - type: feature.cell
     include:
       - WeightedFeaturePCA
       - CellPCA
       - CellSVD
     params:
-      n_components: 400
+      # suggestion: rename this block to common_params
       out: feature.cell
     default_params:
+      CellSVD:
+        n_components: 400
+      CellPCA:
+        n_components: 400
       WeightedFeaturePCA:
+        n_components: 400
         split_name: train
   - type: misc
     target: SetConfig
@@ -21,7 +33,7 @@ pipeline:
 wandb:
   entity: danceteam
   project: dance-dev
-  method: bayes
+  method: bayes  # grid may also be worth trying
   metric:
     name: acc  # val/acc
     goal: maximize
diff --git a/examples/tuning/cta_svm/tuning_config_step3.yaml b/examples/tuning/cta_svm/tuning_config_step3.yaml
new file mode 100644
index 00000000..3b729da3
--- /dev/null
+++ b/examples/tuning/cta_svm/tuning_config_step3.yaml
@@ -0,0 +1,31 @@
+type: preprocessor
+tune_mode: params
+pipeline:
+  - type: filter.gene
+    target: FilterGenesTopK
+    params_to_tune:
+      num_genes:
+        min: 2000
+        max: 4000
+  - type: feature.cell
+    target: WeightedFeaturePCA
+    params_to_tune:
+      n_components:
+        min: 200
+        max: 400
+    params:
+      out: feature.cell
+      split_name: train
+  - type: misc
+    target: SetConfig
+    params:
+      config_dict:
+        feature_channel: feature.cell
+        label_channel: cell_type
+wandb:
+  entity: danceteam
+  project: dance-dev
+  method: bayes  # grid may also be worth trying
+  metric:
+    name: acc  # val/acc
+    goal: maximize
diff --git a/examples/tuning/variable.yaml b/examples/tuning/variable.yaml
new file mode 100644
index 00000000..e53823b1
--- /dev/null
+++ b/examples/tuning/variable.yaml
@@ -0,0 +1,121 @@
+filter_gene_sequence:
+  - 0:
+      - min_counts
+      - min_cells
+      - max_counts
+      - max_cells
+  - 1:
+      - min_counts
+      - min_cells
+      - max_cells
+      - max_counts
+  - 2:
+      - min_counts
+      - max_counts
+      - min_cells
+      - max_cells
+  - 3:
+      - min_counts
+      - max_counts
+      - max_cells
+      - min_cells
+  - 
4: + - min_counts + - max_cells + - min_cells + - max_counts + - 5: + - min_counts + - max_cells + - max_counts + - min_cells + - 6: + - min_cells + - min_counts + - max_counts + - max_cells + - 7: + - min_cells + - min_counts + - max_cells + - max_counts + - 8: + - min_cells + - max_counts + - min_counts + - max_cells + - 9: + - min_cells + - max_counts + - max_cells + - min_counts + - 10: + - min_cells + - max_cells + - min_counts + - max_counts + - 11: + - min_cells + - max_cells + - max_counts + - min_counts + - 12: + - max_counts + - min_counts + - min_cells + - max_cells + - 13: + - max_counts + - min_counts + - max_cells + - min_cells + - 14: + - max_counts + - min_cells + - min_counts + - max_cells + - 15: + - max_counts + - min_cells + - max_cells + - min_counts + - 16: + - max_counts + - max_cells + - min_counts + - min_cells + - 17: + - max_counts + - max_cells + - min_cells + - min_counts + - 18: + - max_cells + - min_counts + - min_cells + - max_counts + - 19: + - max_cells + - min_counts + - max_counts + - min_cells + - 20: + - max_cells + - min_cells + - min_counts + - max_counts + - 21: + - max_cells + - min_cells + - max_counts + - min_counts + - 22: + - max_cells + - max_counts + - min_counts + - min_cells + - 23: + - max_cells + - max_counts + - min_cells + - min_counts diff --git a/examples/variable b/examples/variable new file mode 100644 index 00000000..e69de29b diff --git a/legacy/test_automl/step2_examples/step2_cell_type_annotation_actinn_example.py b/legacy/test_automl/step2_examples/step2_cell_type_annotation_actinn_example.py new file mode 100644 index 00000000..62e81e17 --- /dev/null +++ b/legacy/test_automl/step2_examples/step2_cell_type_annotation_actinn_example.py @@ -0,0 +1,84 @@ +from typing import Any, Callable, Dict, Tuple + +import numpy as np +import torch + +from dance import logger +from dance.datasets.singlemodality import CellTypeAnnotationDataset +from dance.legacy.automl_config.step2_config import get_transforms, log_in_wandb, setStep2 +from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN +from dance.transforms.misc import Compose +from dance.utils import set_seed + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +@log_in_wandb(config=None) +def train(config): + + model = ACTINN(hidden_dims=config.hidden_dims, lambd=config.lambd, device=device) + transforms = get_transforms(config=config) + if transforms is None: + logger.warning("skip transforms") + return {"scores": 0} + preprocessing_pipeline = Compose(*transforms, log_level="INFO") + train_dataset = [753, 3285] + test_dataset = [2695] + tissue = "Brain" + species = "mouse" + dataloader = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, tissue=tissue, + species=species, data_dir="./test_automl/data") + data = dataloader.load_data(transform=preprocessing_pipeline, cache=False) + + # Obtain training and testing data + x_train, y_train = data.get_train_data(return_type="torch") + x_test, y_test = data.get_test_data(return_type="torch") + x_train, y_train, x_test, y_test = x_train.float(), y_train.float(), x_test.float(), y_test.float() + # Train and evaluate models for several rounds + scores = [] + for seed in range(config.seed, config.seed + config.num_runs): + set_seed(seed) + + model.fit(x_train, y_train, seed=seed, lr=config.learning_rate, num_epochs=config.num_epochs, + batch_size=config.batch_size, print_cost=False) + scores.append(score := model.score(x_test, y_test)) + return {"scores": np.mean(scores)} + 
+
+
+def startSweep(parameters_dict) -> Tuple[Dict[str, Any], Callable[..., Any]]:
+    parameters_dict.update({
+        'batch_size': {
+            'value': 128
+        },
+        "hidden_dims": {
+            'value': [2000]
+        },
+        'lambd': {
+            'value': 0.005
+        },
+        'num_epochs': {
+            'value': 50
+        },
+        'seed': {
+            'value': 0
+        },
+        'num_runs': {
+            'value': 1
+        },
+        'learning_rate': {
+            'value': 0.0001
+        }
+    })
+    sweep_config = {'method': 'grid'}
+    sweep_config['parameters'] = parameters_dict
+    metric = {'name': 'scores', 'goal': 'maximize'}
+
+    sweep_config['metric'] = metric
+    return sweep_config, train  # return the sweep configuration and the training function
+
+
+if __name__ == "__main__":
+    # Get function combinations.
+    function_list = setStep2(startSweep, original_list=["normalize", "gene_filter", "gene_dim_reduction"])
+    for func in function_list:
+        func()
diff --git a/legacy/test_automl/step2_examples/step2_clustering_scdcc.py b/legacy/test_automl/step2_examples/step2_clustering_scdcc.py
new file mode 100644
index 00000000..e8e6383b
--- /dev/null
+++ b/legacy/test_automl/step2_examples/step2_clustering_scdcc.py
@@ -0,0 +1,167 @@
+"""normalize_total is required for ScDCC because it needs n_counts."""
+import os
+from typing import Any, Callable, Dict, Tuple
+
+import numpy as np
+import torch
+
+from dance import logger
+from dance.datasets.singlemodality import ClusteringDataset
+from dance.legacy.automl_config.step2_config import get_transforms, log_in_wandb, setStep2
+from dance.modules.single_modality.clustering.scdcc import ScDCC
+from dance.transforms.misc import Compose, SetConfig
+from dance.transforms.preprocess import generate_random_pair
+from dance.utils import set_seed
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+@log_in_wandb(config=None)
+def train(config):
+    """Clustering."""
+    aris = []
+    for seed in range(config.seed, config.seed + config.num_runs):
+        set_seed(seed)
+        dataset = "10X_PBMC"
+        # Load data and perform necessary preprocessing
+        dataloader = ClusteringDataset("./test_automl/data", dataset=dataset)
+
+        transforms = get_transforms(config=config, set_data_config=False, save_raw=True)
+        if ("normalize" not in config.keys() or config.normalize != "normalize_total") or transforms is None:
+            logger.warning("skip transforms")
+            return {"scores": 0}
+        transforms.append(
+            SetConfig({
+                "feature_channel": [None, None, "n_counts"],
+                "feature_channel_type": ["X", "raw_X", "obs"],
+                "label_channel": "Group"
+            }))
+        preprocessing_pipeline = Compose(*transforms, log_level="INFO")
+        data = dataloader.load_data(transform=preprocessing_pipeline, cache=config.cache)
+
+        # inputs: x, x_raw, n_counts
+        inputs, y = data.get_train_data()
+        n_clusters = len(np.unique(y))
+        in_dim = inputs[0].shape[1]
+
+        # Generate random pairs
+        if not os.path.exists(config.label_cells_files):
+            indx = np.arange(len(y))
+            np.random.shuffle(indx)
+            label_cell_indx = indx[0:int(np.ceil(config.label_cells * len(y)))]
+        else:
+            label_cell_indx = np.loadtxt(config.label_cells_files, dtype=int)
+
+        if config.n_pairwise > 0:
+            ml_ind1, ml_ind2, cl_ind1, cl_ind2, error_num = generate_random_pair(y, label_cell_indx, config.n_pairwise,
+                                                                                 config.n_pairwise_error)
+            print("Must link pairs: %d" % ml_ind1.shape[0])
+            print("Cannot link pairs: %d" % cl_ind1.shape[0])
+            print("Number of error pairs: %d" % error_num)
+        else:
+            ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([])
+
+        # Build and train model
+        model = ScDCC(input_dim=in_dim, z_dim=config.z_dim, 
n_clusters=n_clusters, encodeLayer=config.encodeLayer,
+                      decodeLayer=config.encodeLayer[::-1], sigma=config.sigma, gamma=config.gamma,
+                      ml_weight=config.ml_weight, cl_weight=config.cl_weight, device=device,
+                      pretrain_path=f"scdcc_{dataset}_pre.pkl")
+        model.fit(inputs, y, lr=config.lr, batch_size=config.batch_size, epochs=config.epochs, ml_ind1=ml_ind1,
+                  ml_ind2=ml_ind2, cl_ind1=cl_ind1, cl_ind2=cl_ind2, update_interval=config.update_interval,
+                  tol=config.tol, pt_batch_size=config.batch_size, pt_lr=config.pretrain_lr,
+                  pt_epochs=config.pretrain_epochs)
+
+        # Evaluate model predictions
+        score = model.score(None, y)
+        print(f"{score=:.4f}")
+        aris.append(score)
+
+    print('scdcc')
+    print(f'aris: {aris}')
+    print(f'aris: {np.mean(aris)} +/- {np.std(aris)}')
+    return ({"scores": np.mean(aris)})
+
+
+def startSweep(parameters_dict) -> Tuple[Dict[str, Any], Callable[..., Any]]:
+    parameters_dict.update({
+        'seed': {
+            'value': 0
+        },
+        'num_runs': {
+            'value': 1
+        },
+        'cache': {
+            'value': True
+        },
+        'label_cells_files': {
+            'value': 'label_10X_PBMC.txt'
+        },
+        'label_cells': {
+            'value': 0.1
+        },
+        'n_pairwise': {
+            'value': 0
+        },
+        'n_pairwise_error': {
+            'value': 0
+        },
+        'z_dim': {
+            'value': 32
+        },
+        'encodeLayer': {
+            'value': [256, 64]
+        },
+        'sigma': {
+            'value': 2.5
+        },
+        'gamma': {
+            'value': 1.0
+        },
+        'lr': {
+            'value': 0.01
+        },
+        'pretrain_lr': {
+            'value': 0.001
+        },
+        'ml_weight': {
+            'value': 1.0
+        },
+        'cl_weight': {
+            'value': 1.0
+        },
+        'update_interval': {
+            'value': 1.0
+        },
+        'tol': {
+            'value': 0.00001
+        },
+        'ae_weights': {
+            'value': None
+        },
+        'ae_weight_file': {
+            'value': "AE_weights.pth.tar"
+        },
+        'pretrain_epochs': {
+            'value': 50
+        },
+        'epochs': {
+            'value': 500
+        },
+        'batch_size': {
+            'value': 256
+        }
+    })
+
+    sweep_config = {'method': 'grid'}
+    sweep_config['parameters'] = parameters_dict
+    metric = {'name': 'scores', 'goal': 'maximize'}
+
+    sweep_config['metric'] = metric
+    return sweep_config, train  # return the sweep configuration and the training function
+
+
+if __name__ == "__main__":
+    # Get function combinations.
+    function_list = setStep2(startSweep, original_list=["gene_filter", "cell_filter", "normalize"])
+    for func in function_list:
+        func()
diff --git a/legacy/test_automl/step2_examples/step2_imputation_deepimpute.py b/legacy/test_automl/step2_examples/step2_imputation_deepimpute.py
new file mode 100644
index 00000000..df4b0ed3
--- /dev/null
+++ b/legacy/test_automl/step2_examples/step2_imputation_deepimpute.py
@@ -0,0 +1,124 @@
+from typing import Any, Callable, Dict, Tuple
+
+import numpy as np
+import torch
+
+from dance import logger
+from dance.datasets.singlemodality import ImputationDataset
+from dance.legacy.automl_config.step2_config import get_transforms, log_in_wandb, setStep2
+from dance.modules.single_modality.imputation.deepimpute import DeepImpute
+from dance.transforms.misc import Compose, SetConfig
+from dance.utils import set_seed
+
+device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
+
+
+@log_in_wandb(config=None)
+def train(config):
+    """Imputation."""
+    rmses = []
+    for seed in range(config.seed, config.seed + config.num_runs):
+        set_seed(seed)
+        dataset = "mouse_brain_data"
+        data_dir = "./test_automl/data"
+        dataloader = ImputationDataset(data_dir=data_dir, dataset=dataset, train_size=config.train_size)
+        transforms = get_transforms(config=config, set_data_config=False, save_raw=True)
+        if transforms is None:
+            logger.warning("skip transforms")
+            return {"scores": 0}
+        transforms.append(
SetConfig({
+                "feature_channel": [None, None, "targets", "predictors", "train_mask"],
+                "feature_channel_type": ["X", "raw_X", "uns", "uns", "layers"],
+                "label_channel": [None, None],
+                "label_channel_type": ["X", "raw_X"],
+            }))
+        preprocessing_pipeline = Compose(*transforms, log_level="INFO")
+        data = dataloader.load_data(transform=preprocessing_pipeline, cache=config.cache)
+
+        if config.mask:
+            X, X_raw, targets, predictors, mask = data.get_x(return_type="default")
+        else:
+            mask = None
+            X, X_raw, targets, predictors = data.get_x(return_type="default")
+        X = torch.tensor(X.toarray()).float()
+        X_raw = torch.tensor(X_raw.toarray()).float()
+        X_train = X * mask
+        model = DeepImpute(predictors, targets, dataset, config.sub_outputdim, config.hidden_dim, config.dropout, seed,
+                           2)
+
+        model.fit(X_train, X_train, mask, config.batch_size, config.lr, config.n_epochs, config.patience)
+        imputed_data = model.predict(X_train, mask)
+        score = model.score(X, imputed_data, mask, metric='RMSE')
+        print("RMSE: %.4f" % score)
+        rmses.append(score)
+
+    print('deepimpute')
+    print(f'rmses: {rmses}')
+    print(f'rmses: {np.mean(rmses)} +/- {np.std(rmses)}')
+    return ({"scores": np.mean(rmses)})
+
+
+def startSweep(parameters_dict) -> Tuple[Dict[str, Any], Callable[..., Any]]:
+    parameters_dict.update({
+        'dropout': {
+            'value': 0.1
+        },
+        'lr': {
+            'value': 1e-5
+        },
+        'n_epochs': {
+            'value': 5
+        },
+        'batch_size': {
+            'value': 64
+        },
+        'sub_outputdim': {
+            'value': 512
+        },
+        'hidden_dim': {
+            'value': 256
+        },
+        'patience': {
+            'value': 20
+        },
+        'min_cells': {
+            'value': 0.05
+        },
+        "n_top": {
+            'value': 5
+        },
+        "train_size": {
+            "value": 0.9
+        },
+        "mask_rate": {
+            "value": 0.1
+        },
+        "cache": {
+            "value": False
+        },
+        "mask": {  # avoid overlap with the hyperparameter pipeline; usually not an issue
+            "value": True
+        },
+        "seed": {
+            "value": 0
+        },
+        "num_runs": {
+            "value": 1
+        }
+    })
+    sweep_config = {'method': 'grid'}
+    sweep_config['parameters'] = parameters_dict
+    metric = {'name': 'scores', 'goal': 'minimize'}
+
+    sweep_config['metric'] = metric
+    return sweep_config, train  # return the sweep configuration and the training function
+
+
+if __name__ == "__main__":
+    # Get function combinations.
+    function_list = setStep2(
+        startSweep, original_list=["gene_filter", "cell_filter", "normalize", "gene_hold_out_name", "mask_name"],
+        required_elements=["gene_hold_out_name", "mask_name"])
+    for func in function_list:
+        func()
diff --git a/legacy/test_automl/step3_examples/step3_cell_type_annotation_actinn_example.py b/legacy/test_automl/step3_examples/step3_cell_type_annotation_actinn_example.py
new file mode 100644
index 00000000..7914d0f2
--- /dev/null
+++ b/legacy/test_automl/step3_examples/step3_cell_type_annotation_actinn_example.py
@@ -0,0 +1,61 @@
+import numpy as np
+import torch
+
+from dance import logger
+from dance.datasets.singlemodality import CellTypeAnnotationDataset
+from dance.legacy.automl_config.step3_config import get_optimizer, get_transforms
+from dance.modules.single_modality.cell_type_annotation.actinn import ACTINN
+from dance.transforms.misc import Compose
+from dance.utils import set_seed
+
+fun_list = ["log1p", "filter_gene_by_count"]
+
+device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
+
+
+def objective(trial):
+    """Optimization function."""
+    parameters_dict = {
+        'batch_size': 128,
+        "hidden_dims": [2000],
+        'lambd': 0.005,
+        'num_epochs': 10,
+        'seed': 0,
+        'num_runs': 1,
+        'learning_rate': 0.0001
+    }
+
+    train_dataset = [753, 3285]
+    test_dataset = [2695]
+    tissue = "Brain"
+    species = 
"mouse" + dataloader = CellTypeAnnotationDataset(train_dataset=train_dataset, test_dataset=test_dataset, tissue=tissue, + species=species, data_dir="./test_automl/data") + transforms = get_transforms(trial=trial, fun_list=fun_list) + if transforms is None: + logger.warning("skip transforms") + return {"scores": 0} + preprocessing_pipeline = Compose(*transforms, log_level="INFO") + data = dataloader.load_data(transform=preprocessing_pipeline, cache=True) + + # Obtain training and testing data + x_train, y_train = data.get_train_data(return_type="torch") + x_test, y_test = data.get_test_data(return_type="torch") + x_train, y_train, x_test, y_test = x_train.float(), y_train.float(), x_test.float(), y_test.float() + # Train and evaluate models for several rounds + scores = [] + parameter_config = {} + parameter_config.update(parameters_dict) + model = ACTINN(hidden_dims=parameter_config.get('hidden_dims'), lambd=parameter_config.get('lambd'), device=device) + for seed in range(parameter_config.get('seed'), parameter_config.get('seed') + parameter_config.get('num_runs')): + set_seed(seed) + model.fit(x_train, y_train, seed=seed, lr=parameter_config.get('learning_rate'), + num_epochs=parameter_config.get('num_epochs'), batch_size=parameter_config.get('batch_size'), + print_cost=False) + scores.append(model.score(x_test, y_test)) + return {"scores": np.mean(scores)} + + +if __name__ == "__main__": + start_optimizer = get_optimizer(project="step3-project", objective=objective) + start_optimizer() diff --git a/legacy/test_automl/step3_examples/step3_clustering_scdcc.py b/legacy/test_automl/step3_examples/step3_clustering_scdcc.py new file mode 100644 index 00000000..29266bfa --- /dev/null +++ b/legacy/test_automl/step3_examples/step3_clustering_scdcc.py @@ -0,0 +1,119 @@ +import os + +import numpy as np +import torch + +from dance.datasets.singlemodality import ClusteringDataset +from dance.legacy.automl_config.step3_config import get_optimizer, get_transforms +from dance.modules.single_modality.clustering.scdcc import ScDCC +from dance.registry import DotDict # Optional +from dance.transforms.misc import Compose, SetConfig +from dance.transforms.preprocess import generate_random_pair +from dance.utils import set_seed + +fun_list = ["filter_cell_by_count", "filter_gene_by_count", "normalize_total"] + +device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") + + +def objective(trial): + """Optimization function.""" + parameters_dict = { + 'seed': 0, + 'num_runs': 1, + 'cache': True, + 'label_cells_files': 'label_10X_PBMC.txt', + 'label_cells': 0.1, + 'n_pairwise': 0, + 'n_pairwise_error': 0, + 'z_dim': 32, + 'encodeLayer': [256, 64], + 'sigma': 2.5, + 'gamma': 1.0, + 'lr': 0.01, + 'pretrain_lr': 0.001, + 'ml_weight': 1.0, + 'cl_weight': 1.0, + 'update_interval': 1.0, + 'tol': 0.00001, + 'ae_weights': None, + 'ae_weight_file': "AE_weights.pth.tar", + 'pretrain_epochs': 50, + 'epochs': 500, + 'batch_size': 256 + } + transforms = get_transforms(trial=trial, fun_list=fun_list, set_data_config=False, save_raw=True) + transforms.append( + SetConfig({ + "feature_channel": [None, None, "n_counts"], + "feature_channel_type": ["X", "raw_X", "obs"], + "label_channel": "Group" + })) + preprocessing_pipeline = Compose(*transforms, log_level="INFO") + parameters_config = {} + parameters_config.update(parameters_dict) + parameters_config = DotDict(parameters_config) + aris = [] + for seed in range(parameters_config.seed, parameters_config.seed + parameters_config.num_runs): + set_seed(seed) + 
dataset = "10X_PBMC" + # Load data and perform necessary preprocessing + dataloader = ClusteringDataset("./test_automl/data", dataset=dataset) + data = dataloader.load_data(transform=preprocessing_pipeline, cache=parameters_config.cache) + + # inputs: x, x_raw, n_counts + inputs, y = data.get_train_data() + n_clusters = len(np.unique(y)) + in_dim = inputs[0].shape[1] + + # Generate random pairs + if not os.path.exists(parameters_config.label_cells_files): + indx = np.arange(len(y)) + np.random.shuffle(indx) + label_cell_indx = indx[0:int(np.ceil(parameters_config.label_cells * len(y)))] + else: + label_cell_indx = np.loadtxt(parameters_config.label_cells_files, dtype=np.int) + + if parameters_config.n_pairwise > 0: + ml_ind1, ml_ind2, cl_ind1, cl_ind2, error_num = generate_random_pair(y, label_cell_indx, + parameters_config.n_pairwise, + parameters_config.n_pairwise_error) + print("Must link paris: %d" % ml_ind1.shape[0]) + print("Cannot link paris: %d" % cl_ind1.shape[0]) + print("Number of error pairs: %d" % error_num) + else: + ml_ind1, ml_ind2, cl_ind1, cl_ind2 = np.array([]), np.array([]), np.array([]), np.array([]) + + # Build and train moodel + model = ScDCC(input_dim=in_dim, z_dim=parameters_config.z_dim, n_clusters=n_clusters, + encodeLayer=parameters_config.encodeLayer, decodeLayer=parameters_config.encodeLayer[::-1], + sigma=parameters_config.sigma, gamma=parameters_config.gamma, + ml_weight=parameters_config.ml_weight, cl_weight=parameters_config.ml_weight, device=device, + pretrain_path=f"scdcc_{dataset}_pre.pkl") + try: + model.fit(inputs, y, lr=parameters_config.lr, batch_size=parameters_config.batch_size, + epochs=parameters_config.epochs, ml_ind1=ml_ind1, ml_ind2=ml_ind2, cl_ind1=cl_ind1, + cl_ind2=cl_ind2, update_interval=parameters_config.update_interval, tol=parameters_config.tol, + pt_batch_size=parameters_config.batch_size, pt_lr=parameters_config.pretrain_lr, + pt_epochs=parameters_config.pretrain_epochs) + except Exception as e: + """If don't skip the error, then all hyperparameter combinations will + inevitably have problems when facing all datasets and models, and need to + ensure the effectiveness of automatic machine learning and ignore some + hyperparameters.""" + print(e) + return ({"scores": 0}) + # Evaluate model predictions + score = model.score(None, y) + print(f"{score=:.4f}") + aris.append(score) + + print('scdcc') + print(f'aris: {aris}') + print(f'aris: {np.mean(aris)} +/- {np.std(aris)}') + return ({"scores": np.mean(aris)}) + + +if __name__ == "__main__": + start_optimizer = get_optimizer(project="step3-cluster-scdcc-project", objective=objective, n_trials=10) + start_optimizer() diff --git a/legacy/test_automl/step3_examples/step3_imputation_deepimpute.py b/legacy/test_automl/step3_examples/step3_imputation_deepimpute.py new file mode 100644 index 00000000..77fdfbb9 --- /dev/null +++ b/legacy/test_automl/step3_examples/step3_imputation_deepimpute.py @@ -0,0 +1,88 @@ +import torch + +from dance import logger +from dance.datasets.singlemodality import ImputationDataset +from dance.legacy.automl_config.step3_config import get_optimizer, get_transforms +from dance.modules.single_modality.imputation.deepimpute import DeepImpute +from dance.registry import DotDict +from dance.transforms.misc import Compose, SetConfig +from dance.utils import set_seed + +fun_list = ["filter_gene_by_count", "filter_cell_by_count", "log1p", "gene_hold_out", "cell_wise_mask_data"] +device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu") +import numpy as np + + 
+
+def objective(trial):
+    parameters_dict = {
+        'dropout': 0.1,
+        'lr': 1e-5,
+        'n_epochs': 5,
+        'batch_size': 64,
+        'sub_outputdim': 512,
+        'hidden_dim': 256,
+        'patience': 20,
+        'min_cells': 0.05,
+        "n_top": 5,
+        "train_size": 0.9,
+        "mask_rate": 0.1,
+        "cache": False,
+        "mask": True,  # avoid duplication with the hyperparameter pipeline; usually not an issue
+        "seed": 0,
+        "num_runs": 1,
+        "gpu": 3
+    }
+    parameters_config = {}
+    parameters_config.update(parameters_dict)
+    parameters_config = DotDict(parameters_config)
+    rmses = []
+    for seed in range(parameters_config.seed, parameters_config.seed + parameters_config.num_runs):
+        set_seed(seed)
+        dataset = "mouse_brain_data"
+        data_dir = "./test_automl/data"
+        dataloader = ImputationDataset(data_dir=data_dir, dataset=dataset, train_size=parameters_config.train_size)
+        # preprocessing_pipeline = DeepImpute.preprocessing_pipeline(min_cells=parameters_config.min_cells, n_top=parameters_config.n_top,
+        #                                                            sub_outputdim=parameters_config.sub_outputdim, mask=parameters_config.mask,
+        #                                                            seed=seed, mask_rate=parameters_config.mask_rate)
+        transforms = get_transforms(trial=trial, fun_list=fun_list, set_data_config=False, save_raw=True)
+        if transforms is None:
+            logger.warning("skip transforms")
+            return {"scores": 0}
+        transforms.append(
+            SetConfig({
+                "feature_channel": [None, None, "targets", "predictors", "train_mask"],
+                "feature_channel_type": ["X", "raw_X", "uns", "uns", "layers"],
+                "label_channel": [None, None],
+                "label_channel_type": ["X", "raw_X"],
+            }))
+        preprocessing_pipeline = Compose(*transforms, log_level="INFO")
+        data = dataloader.load_data(transform=preprocessing_pipeline, cache=parameters_config.cache)
+
+        if parameters_config.mask:
+            X, X_raw, targets, predictors, mask = data.get_x(return_type="default")
+        else:
+            mask = None
+            X, X_raw, targets, predictors = data.get_x(return_type="default")
+        X = torch.tensor(X.toarray()).float()
+        X_raw = torch.tensor(X_raw.toarray()).float()
+        X_train = X * mask
+        model = DeepImpute(predictors, targets, dataset, parameters_config.sub_outputdim, parameters_config.hidden_dim,
+                           parameters_config.dropout, seed, parameters_config.gpu)
+
+        model.fit(X_train, X_train, mask, parameters_config.batch_size, parameters_config.lr,
+                  parameters_config.n_epochs, parameters_config.patience)
+        imputed_data = model.predict(X_train, mask)
+        score = model.score(X, imputed_data, mask, metric='RMSE')
+        print("RMSE: %.4f" % score)
+        rmses.append(score)
+
+    print('deepimpute')
+    print(f'rmses: {rmses}')
+    print(f'rmses: {np.mean(rmses)} +/- {np.std(rmses)}')
+    return ({"scores": np.mean(rmses)})
+
+
+if __name__ == "__main__":
+    start_optimizer = get_optimizer(project="step3-imputation-deepimpute-project", objective=objective, n_trials=10,
+                                    direction="minimize")
+    start_optimizer()
diff --git a/legacy/test_automl/tests/step2_test.py b/legacy/test_automl/tests/step2_test.py
new file mode 100644
index 00000000..6cac0ed0
--- /dev/null
+++ b/legacy/test_automl/tests/step2_test.py
@@ -0,0 +1,2 @@
+def test_get_preprocessing_pipeline():
+    pass  # not strictly necessary, since these are mostly decorator functions
diff --git a/legacy/test_automl/tests/step3_test.py b/legacy/test_automl/tests/step3_test.py
new file mode 100644
index 00000000..6cac0ed0
--- /dev/null
+++ b/legacy/test_automl/tests/step3_test.py
@@ -0,0 +1,2 @@
+def test_get_preprocessing_pipeline():
+    pass  # not strictly necessary, since these are mostly decorator functions
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 612823cb..9f2a80bd 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -352,7 +352,7 @@ def 
test_pipeline_planer_construction(subtests, planer_toy_registry): { "type": "b", "target": "func_b1", - "params": { + "params_to_tune": { "x": { "values": ["x1", "x2", "x3"] }, @@ -402,7 +402,7 @@ def test_pipeline_planer_construction(subtests, planer_toy_registry): { "type": "b", "target": "func_b1", - "params": { + "params_to_tune": { "x": { "values": ["x1", "x2", "x3"] }, @@ -414,7 +414,7 @@ def test_pipeline_planer_construction(subtests, planer_toy_registry): { "type": "c", "target": "func_c1", - "params": { + "params_to_tune": { "z": { "min": 0, "max": 1 @@ -424,7 +424,7 @@ def test_pipeline_planer_construction(subtests, planer_toy_registry): { "type": "c", "target": "func_c1", - "params": { + "params_to_tune": { "z": { "min": -10., "max": 10. @@ -562,7 +562,7 @@ def test_pipeline_planer_generation(subtests, planer_toy_registry): { "type": "b", "target": "func_b1", - "params": { + "params_to_tune": { "x": { "values": ["x1", "x2", "x3"] }, @@ -605,9 +605,7 @@ def test_pipeline_planer_generation(subtests, planer_toy_registry): ] } - assert dict(p.generate_config(params=[{ - "x": "x1" - }, None])) == { + ans = { "type": "a", "pipeline": [ { @@ -623,6 +621,10 @@ def test_pipeline_planer_generation(subtests, planer_toy_registry): }, ], } + # Option 1: list of param dict + assert dict(p.generate_config(params=[{"x": "x1"}, None])) == ans + # Option 2: wandb type config + assert dict(p.generate_config(params={"params.0.x": "x1"})) == ans with pytest.raises(ValueError): # Unknown param key 'y' @@ -639,7 +641,7 @@ def test_pipeline_planer_generation(subtests, planer_toy_registry): "pipeline": [{ "type": "b", # "target": "func_b1", # this must be set - "params": { + "params_to_tune": { "x": { "values": ["x1", "x2", "x3"] }, @@ -725,10 +727,8 @@ def test_pipeline_planer_generation(subtests, planer_toy_registry): { "type": "b", "target": "func_b1", - "default_params": { - "func_b1": { - "x": "b1" - }, + "params": { + "x": "b1" }, }, { diff --git a/tests/transforms/test_RESPETGraph.py b/tests/transforms/test_RESPETGraph.py new file mode 100644 index 00000000..86d9442d --- /dev/null +++ b/tests/transforms/test_RESPETGraph.py @@ -0,0 +1,23 @@ +import numpy as np +import pandas as pd +from anndata import AnnData + +from dance.data import Data +from dance.transforms.graph import RESEPTGraph + +SEED = 123 + + +def test_RESPET_GRAPH(): + num_cells = 100 + num_genes = 500 + gene_expression = np.random.default_rng(seed=SEED).random((num_cells, num_genes)) + adata = AnnData(X=gene_expression) + random_df = pd.DataFrame( + np.random.default_rng(seed=SEED).integers(1, 10000, size=(num_cells, 2)), columns=["x_pixel", "y_pixel"], + index=adata.obs_names) + adata.obsm['spatial_pixel'] = random_df + data = Data(adata.copy()) + RESEPTgraph = RESEPTGraph() + RESEPTgraph(data) + assert data.data.uns['RESEPTGraph'].shape == (2000, 2000, 3) diff --git a/tests/transforms/test_SC3Feature.py b/tests/transforms/test_SC3Feature.py new file mode 100644 index 00000000..5e0be6ca --- /dev/null +++ b/tests/transforms/test_SC3Feature.py @@ -0,0 +1,24 @@ +import numpy as np +import pytest +from anndata import AnnData + +from dance.data import Data +from dance.transforms import SC3Feature + +SEED = 123 + + +@pytest.fixture +def toy_data(): + x = np.random.default_rng(SEED).random((50, 30)) + adata = AnnData(X=x, dtype=np.float32) + data = Data(adata.copy()) + return adata, data + + +def test_sc3_feature(toy_data): + adata, data = toy_data + sc3feature = SC3Feature() + data = sc3feature(data) + sc3_feature = 
data.get_feature(return_type="numpy", channel="SC3Feature", channel_type="obsm")
+    assert sc3_feature.shape[0] == data.shape[0]
diff --git a/tests/transforms/test_TangramFeature.py b/tests/transforms/test_TangramFeature.py
new file mode 100644
index 00000000..4be8968e
--- /dev/null
+++ b/tests/transforms/test_TangramFeature.py
@@ -0,0 +1,24 @@
+import numpy as np
+import pytest
+from anndata import AnnData
+
+from dance.data import Data
+from dance.transforms import TangramFeature
+
+SEED = 123
+
+
+@pytest.fixture
+def toy_data():
+    x = np.random.default_rng(SEED).random((5, 3))
+    adata = AnnData(X=x, dtype=np.float32)
+    data = Data(adata.copy())
+    return adata, data
+
+
+def test_tangram_feature(toy_data):
+    adata, data = toy_data
+    tangramFeature = TangramFeature()
+    data = tangramFeature(data)
+    tangram_feature = data.get_feature(return_type="numpy", channel="TangramFeature", channel_type="obs")
+    assert np.isclose(np.sum(tangram_feature), 1)
diff --git a/tests/transforms/test_celltypeNums.py b/tests/transforms/test_celltypeNums.py
new file mode 100644
index 00000000..ec2b557d
--- /dev/null
+++ b/tests/transforms/test_celltypeNums.py
@@ -0,0 +1,20 @@
+import numpy as np
+from anndata import AnnData
+
+from dance.data import Data
+from dance.transforms import CellTypeNums
+
+SEED = 123
+
+
+def test_cell_type_nums():
+    np.random.seed(SEED)
+    num_cells = 100
+    num_genes = 500
+    gene_expression = np.random.default_rng(seed=SEED).random((num_cells, num_genes))
+    cell_types = np.random.default_rng(seed=SEED).choice(['Type_A', 'Type_B', 'Type_C'], num_cells)
+    adata = AnnData(X=gene_expression, obs={'cellType': cell_types})
+    data = Data(adata.copy())
+    data = CellTypeNums()(data)
+    cell_type_nums = data.get_feature(return_type="numpy", channel="CellTypeNums", channel_type="uns")
+    assert cell_type_nums.shape[0] == len(np.unique(cell_types))
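Putting the params tuning mode together, the updated examples/tuning/cta_svm/main.py drives a sweep roughly as below; a condensed sketch with data loading and model training elided (sweep keys take the "params.<idx>.<name>" form handled by _sanitize_params, and the concrete key shown is illustrative):

import wandb

from dance.pipeline import PipelinePlaner

pipeline_planer = PipelinePlaner.from_config_file("examples/tuning/cta_svm/tuning_config_step3.yaml")


def evaluate_pipeline():
    wandb.init()
    # In params mode the pipeline targets are fixed by the config file, so only
    # the sampled parameter values (e.g. {"params.0.num_genes": 3000}) vary
    # between runs; pipeline=None leaves the target selection untouched.
    preprocessing_pipeline = pipeline_planer.generate(pipeline=None, params=dict(wandb.config))
    # ... apply the pipeline to data, train a model, then wandb.log({"acc": ...}) ...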