Skip to content

Commit 0b6cb89

Browse files
authored
Merge pull request #45 from broadinstitute/jlc_gzip_ingest
Add support for gzip files [SCP-1884]
2 parents 8f55d65 + 1a2e511 commit 0b6cb89

File tree

4 files changed

+50
-3
lines changed

4 files changed

+50
-3
lines changed

ingest/ingest_files.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import mimetypes
99
import os
1010
import re
11+
import gzip
1112

1213
import pandas as pd
1314
from google.cloud import storage
@@ -19,6 +20,7 @@ def __init__(self, file_path, allowed_file_types, *, open_as=None):
1920
# File is remote (in GCS bucket) when running via PAPI,
2021
# and typically local when developing
2122
self.is_remote_file = file_path[:5] == "gs://"
23+
self.is_gzip_file = self.get_file_type(file_path)[1] == 'gzip'
2224

2325
self.verify_file_exists(file_path)
2426

@@ -67,9 +69,13 @@ def resolve_path(self, file_path):
6769
"""
6870
if self.is_remote_file:
6971
file_path = self.download_from_bucket(file_path)
70-
7172
# Remove BOM with encoding='utf-8-sig'
72-
return open(file_path, encoding="utf-8-sig")
73+
if self.is_gzip_file:
74+
open_file = gzip.open(file_path, 'rt', encoding='utf-8-sig')
75+
else:
76+
open_file = open(file_path, encoding="utf-8-sig")
77+
78+
return open_file
7379

7480
def reset_file(self, start_point, open_as=None):
7581
"""Restart file reader at point that's equal to start_point.

ingest/ingest_pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cluster --cluster-file ../tests/data/test_1k_cluster_Data.csv --ingest-cluster --name cluster1 --domain-ranges "{'x':[-1, 1], 'y':[-1, 1], 'z':[-1, 1]}"
1717
1818
# Ingest Cell Metadata file
19-
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cell_metadata --cell-metadata-file ../tests/data/metadata_valid.tsv --ingest-cell-metadata
19+
python ingest_pipeline.py --study-accession SCP1 --file-id 123abc ingest_cell_metadata --cell-metadata-file ../tests/data/valid_v1.1.1.tsv --ingest-cell-metadata
2020
2121
# Ingest Cell Metadata file against convention
2222
!! Please note that you must have permission to access the SCP bucket
930 KB
Binary file not shown.

tests/test_ingest.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,47 @@ def test_ingest_local_dense_matrix(self):
218218

219219
self.assertEqual(model, expected_model)
220220

221+
def test_ingest_local_compressed_dense_matrix(self):
222+
"""Ingest Pipeline should extract and transform local dense matrices
223+
from a compressed file in the same manner as from an uncompressed file
224+
"""
225+
226+
args = [
227+
'--study-accession',
228+
'SCP1',
229+
'--file-id',
230+
'1234abc',
231+
'ingest_expression',
232+
'--taxon-name',
233+
'Homo sapiens',
234+
'--taxon-common-name',
235+
'human',
236+
'--ncbi-taxid',
237+
'9606',
238+
'--genome-assembly-accession',
239+
'GCA_000001405.15',
240+
'--genome-annotation',
241+
'Ensembl 94',
242+
'--matrix-file',
243+
'../tests/data/dense_matrix_19_genes_100k_cells.txt.gz',
244+
'--matrix-file-type',
245+
'dense',
246+
]
247+
ingest = self.setup_ingest(args)
248+
249+
models = ingest.load_expression_data_args[0]
250+
251+
# Verify that 19 gene models were passed into load method
252+
num_models = len(models)
253+
expected_num_models = 19
254+
self.assertEqual(num_models, expected_num_models)
255+
256+
# Verify that the first gene model looks as expected
257+
mock_dir = 'dense_matrix_19_genes_100k_cells_txt'
258+
model, expected_model = get_nth_gene_models(0, models, mock_dir)
259+
260+
self.assertEqual(model, expected_model)
261+
221262
def test_ingest_missing_local_file(self):
222263
"""Ingest Pipeline should throw error for missing local file
223264
"""

0 commit comments

Comments
 (0)