1+ import pandas as pd # NOqa: F821
2+
13try :
24 from ingest_files import IngestFiles
35 from monitor import log_exception
@@ -14,13 +16,20 @@ def __init__(self, file_path, study_file_id, study_id, **kwargs):
1416 IngestFiles .__init__ (
1517 self , file_path , allowed_file_types = self .ALLOWED_FILE_TYPES
1618 )
17- pass
19+ # If performing cluster extraction, set obsm_keys
20+ extract_cluster = kwargs .get ("extract_cluster" )
21+ if extract_cluster :
22+ self .obsm_keys = kwargs ["obsm_keys" ]
23+ else :
24+ pass
1825
def obtain_adata(self):
    """
    Open the file at self.file_path and return its first opened element.

    The returned value is element [0] of whatever self.open_file() yields
    (presumably the AnnData object -- confirm against IngestFiles.open_file).

    Raises:
        ValueError: re-raised, wrapping any ValueError hit while opening
            or reporting on the file.
    """
    try:
        opened = self.open_file(self.file_path)
        annotated = opened[0]
        # for faster dev, print adata info to screen, may want to remove in future
        print(annotated)
        IngestFiles.dev_logger.info(str(annotated))
    except ValueError as err:
        raise ValueError(err)
    else:
        return annotated
2635
@@ -35,3 +44,69 @@ def validate(self):
3544 except ValueError :
3645 return False
3746
@staticmethod
def generate_cluster_header(adata, clustering_name):
    """
    Based on clustering dimensions, write clustering NAME line to file

    Creates (or truncates) "<clustering_name>.cluster.anndata_segment.tsv"
    in the current working directory and writes a single tab-separated
    header row: NAME, X, Y for 2 dimensions, plus Z for 3 dimensions.

    Args:
        adata: object exposing ``obsm``; ``adata.obsm[clustering_name]``
            must have a ``shape`` whose second entry is the number of
            clustering dimensions.
        clustering_name: key into ``adata.obsm``; also used as the
            output file name prefix.

    Raises:
        ValueError: if the clustering has fewer than 2 or more than 3
            dimensions. No file is written in that case.
    """
    clustering_dimension = adata.obsm[clustering_name].shape[1]
    # Validate before touching the filesystem so a bad clustering never
    # leaves a partial segment file behind.
    if clustering_dimension > 3:
        msg = f"Too many dimensions for visualization in obsm \"{clustering_name}\", found {clustering_dimension}, expected 2 or 3."
        raise ValueError(msg)
    if clustering_dimension < 2:
        msg = f"Too few dimensions for visualization in obsm \"{clustering_name}\", found {clustering_dimension}, expected 2 or 3."
        raise ValueError(msg)
    headers = ['NAME', 'X', 'Y']
    if clustering_dimension == 3:
        # list.append mutates in place and returns None, so its result
        # must not be assigned to the header list.
        headers.append('Z')
    with open(f"{clustering_name}.cluster.anndata_segment.tsv", "w") as f:
        f.write('\t'.join(headers) + '\n')
66+
@staticmethod
def generate_cluster_type_declaration(adata, clustering_name):
    """
    Append the clustering TYPE line to the cluster segment file.

    The row is "TYPE" followed by one "numeric" entry per clustering
    dimension (second entry of ``adata.obsm[clustering_name].shape``),
    tab-separated, appended to
    "<clustering_name>.cluster.anndata_segment.tsv".
    """
    n_dims = adata.obsm[clustering_name].shape[1]
    type_row = '\t'.join(["TYPE"] + ["numeric"] * n_dims) + '\n'
    segment_path = f"{clustering_name}.cluster.anndata_segment.tsv"
    # Append mode: the file may already hold rows written earlier.
    with open(segment_path, "a") as segment_file:
        segment_file.write(type_row)
76+
@staticmethod
def generate_cluster_body(adata, clustering_name):
    """
    Append clustering data rows to the cluster segment file.

    Each row is the cell name from ``adata.obs_names`` followed by that
    row of ``adata.obsm[clustering_name]``, tab-separated, with no header
    and no index column. Rows are appended to
    "<clustering_name>.cluster.anndata_segment.tsv".
    """
    segment_path = f"{clustering_name}.cluster.anndata_segment.tsv"
    cell_names = pd.DataFrame(adata.obs_names)
    coordinates = pd.DataFrame(adata.obsm[clustering_name])
    # Side-by-side join: names in the first column, coordinates after.
    rows = pd.concat([cell_names, coordinates], axis=1)
    rows.to_csv(segment_path, sep="\t", mode="a", header=None, index=False)
93+
@staticmethod
def files_to_delocalize(arguments):
    """
    List the cluster segment file names to delocalize.

    Args:
        arguments: mapping with an "obsm_keys" entry holding the
            clustering names.

    Returns:
        One "<name>.cluster.anndata_segment.tsv" file name per obsm key,
        in the same order -- the name the cluster generate_* methods in
        this class write to.
    """
    # ToDo - check if names using obsm_keys need sanitization
    return [
        f"{name}.cluster.anndata_segment.tsv" for name in arguments["obsm_keys"]
    ]
99+
@staticmethod
def delocalize_cluster_files(file_path, study_file_id, files_to_delocalize):
    """Copy cluster files to study bucket

    Calls IngestFiles.delocalize_file once per entry of
    files_to_delocalize, giving each file a destination under
    "_scp_internal/anndata_ingest/".
    """
    for cluster_file in files_to_delocalize:
        bucket_destination = f"_scp_internal/anndata_ingest/{cluster_file}"
        # NOTE(review): the second positional argument is passed as None,
        # as in the original call -- confirm its meaning against
        # IngestFiles.delocalize_file.
        IngestFiles.delocalize_file(
            study_file_id, None, file_path, cluster_file, bucket_destination
        )
0 commit comments