Skip to content

Commit 3f67f18

Browse files
authored
Merge pull request #146 from broadinstitute/development
Release 1.6.2
2 parents 745cd1c + 6d55525 commit 3f67f18

21 files changed

+17494
-16053
lines changed

ingest/expression_files/dense_ingestor.py

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -249,27 +249,19 @@ def transform(self):
249249
raise ValueError(f"Duplicate gene: {gene}")
250250
self.gene_names[gene] = True
251251

252-
if len(exp_scores) > 0:
253-
data_arrays, gene_models, num_processed = self.create_models(
254-
exp_cells,
255-
exp_scores,
256-
gene,
257-
None,
258-
gene_models,
259-
data_arrays,
260-
num_processed,
261-
False,
262-
)
263-
# load any remaining models (this is necessary here since there isn't
264-
# an easy way to detect the last line of the file in the iteration above
265-
if len(gene_models) > 0:
266-
self.create_models(
252+
data_arrays, gene_models, num_processed = self.create_models(
267253
exp_cells,
268254
exp_scores,
269255
gene,
270256
None,
271257
gene_models,
272258
data_arrays,
273259
num_processed,
274-
True,
260+
False,
261+
)
262+
# Load any remaining models. This is necessary because the number of
263+
# models may be less than the batch size.
264+
if len(gene_models) > 0:
265+
self.create_models(
266+
[], [], None, None, gene_models, data_arrays, num_processed, True
275267
)

ingest/expression_files/expression_files.py

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -203,19 +203,22 @@ def create_models(
203203
"""
204204
current_data_arrays = []
205205
start_time = datetime.datetime.now()
206-
207206
model_id = ObjectId()
208-
gene_models.append(
209-
GeneExpression.create_gene_model(
210-
name=gene,
211-
study_file_id=self.study_file_id,
212-
study_id=self.study_id,
213-
gene_id=gene_id,
214-
_id=model_id,
207+
208+
GeneExpression.dev_logger.debug(f"Creating models for {gene}")
209+
if gene:
210+
gene_models.append(
211+
GeneExpression.create_gene_model(
212+
name=gene,
213+
study_file_id=self.study_file_id,
214+
study_id=self.study_id,
215+
gene_id=gene_id,
216+
_id=model_id,
217+
)
215218
)
216-
)
217-
if len(data_arrays) > 0:
218-
# Data arrays for cells
219+
# Make data array models for genes with expression data
220+
if len(exp_scores) > 0:
221+
# Data array model for cells
219222
for cell_data_array in GeneExpression.create_data_arrays(
220223
name=f"{gene} Cells",
221224
array_type="cells",
@@ -225,7 +228,7 @@ def create_models(
225228
**self.data_array_kwargs,
226229
):
227230
current_data_arrays.append(cell_data_array)
228-
# Data arrays for expression values
231+
# Data array model for expression values
229232
for exp_value_data_array in GeneExpression.create_data_arrays(
230233
name=f"{gene} Expression",
231234
array_type="expression",
@@ -236,12 +239,18 @@ def create_models(
236239
):
237240
current_data_arrays.append(exp_value_data_array)
238241
this_batch_size = len(data_arrays) + len(current_data_arrays)
239-
# Determine if models should be batched
242+
# Determine if models should be batched/loaded
240243
if this_batch_size >= GeneExpression.DATA_ARRAY_BATCH_SIZE or force:
241-
self.load(gene_models, GeneExpression.COLLECTION_NAME)
242-
self.load(data_arrays, DataArray.COLLECTION_NAME)
244+
if force:
245+
# Add new data arrays
246+
data_arrays += current_data_arrays
247+
current_data_arrays.clear()
248+
if len(data_arrays) > 0:
249+
self.load(data_arrays, DataArray.COLLECTION_NAME)
250+
if len(gene_models) > 0:
251+
self.load(gene_models, GeneExpression.COLLECTION_NAME)
243252
num_processed += len(gene_models)
244-
print(
253+
GeneExpression.dev_logger.info(
245254
f"Processed {num_processed} genes. "
246255
f"{str(datetime.datetime.now() - start_time)} "
247256
f"elapsed"

ingest/expression_files/mtx.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,19 @@ def get_mtx_dimensions(file_handler) -> List:
143143
raise e
144144
raise ValueError("MTX file did not contain data")
145145

146+
@staticmethod
147+
def get_features(feature_row: str):
148+
"""Determines gene id and gene name from a given row:str in a feature
149+
file
150+
"""
151+
feature_data = feature_row.split("\t")
152+
gene_id = feature_data[0]
153+
gene_name = feature_data[0]
154+
if len(feature_data) >= 2:
155+
# gene_name field is present
156+
gene_name = feature_data[1]
157+
return gene_id, gene_name
158+
146159
def execute_ingest(self):
147160
"""Parses MTX files"""
148161
self.extract_feature_barcode_matrices()
@@ -190,11 +203,16 @@ def transform(self):
190203
if current_idx != prev_idx:
191204
if not MTXIngestor.is_sorted(current_idx, visited_expression_idx):
192205
raise ValueError("MTX file must be sorted")
206+
GeneExpression.dev_logger.debug(
207+
f"Processing {self.genes[prev_idx - 1]}"
208+
)
193209
visited_expression_idx.append(current_idx)
194210
if prev_idx != 0:
195211
# Expressed cells and scores are associated with prior gene
196-
prev_gene_id, prev_gene = self.genes[prev_idx - 1].split("\t")
197-
# Ff the previous gene exists, load its models
212+
prev_gene_id, prev_gene = MTXIngestor.get_features(
213+
self.genes[prev_idx - 1]
214+
)
215+
# If the previous gene exists, load its models
198216
data_arrays, gene_models, num_processed = self.create_models(
199217
exp_cells,
200218
exp_scores,
@@ -213,7 +231,9 @@ def transform(self):
213231
exp_cells.append(exp_cell)
214232
exp_scores.append(exp_score)
215233
# Create data array for last row
216-
current_gene_id, current_gene = self.genes[prev_idx - 1].split("\t")
234+
current_gene_id, current_gene = MTXIngestor.get_features(
235+
self.genes[prev_idx - 1]
236+
)
217237
self.create_models(
218238
exp_cells,
219239
exp_scores,
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
GENE CELL_0001 CELL_0002 CELL_0003 CELL_0004
2+
Itm2a 0 0 0 0
3+
Sergef 0 0 0 0
4+
Chil5 0 0 0 0
5+
Fam109a 0 0 0 0

0 commit comments

Comments
 (0)