Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 53 additions & 51 deletions add-clinical-header/insert_clinical_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
NULL_VALUES = ['NA', None]

def process_datum(datum):
try:
try:
dfixed = datum.strip()
except AttributeError:
dfixed='NA'
Expand All @@ -27,19 +27,21 @@ def process_datum(datum):
else:
return dfixed


def get_header(filename):
    """
    Returns the file header.

    Lines starting with '#' are ignored. The first remaining line is split on
    tabs and every column name is stripped of surrounding whitespace.
    Returns the column names as a list.
    """
    # context manager guarantees the file handle is closed once it has been read
    with open(filename) as f:
        filedata = [x for x in f.read().split('\n') if not x.startswith('#')]
    # materialize the map: under Python 3 a bare map() is a one-shot iterator
    header = list(map(str.strip, filedata[0].split('\t')))
    return header


def load_clinical_attribute_metadata():
""" Loads clinical attribute metadata. """
metadata_header = get_header(CLINICAL_ATTRIBUTE_METADATA_FILENAME)

# read file and load clinical attribute metadata
metadata_file = open(CLINICAL_ATTRIBUTE_METADATA_FILENAME, 'rU')
metadata_file = open(CLINICAL_ATTRIBUTE_METADATA_FILENAME, 'r')
metadata_reader = csv.DictReader(metadata_file, dialect='excel-tab')
for line in metadata_reader:
column = line['NORMALIZED_COLUMN_HEADER']
Expand Down Expand Up @@ -74,49 +76,49 @@ def get_clinical_header(clinical_filename):


def get_clinical_header_metadata(header, clinical_filename):
    """
    Returns the clinical header metadata.
    The order of the clinical header metadata goes:
        1. display name
        2. descriptions
        3. datatype (STRING, NUMBER, BOOLEAN)
        4. attribute type (PATIENT, SAMPLE)
        5. priority

    The attribute type line is only included when the clinical file is named
    'data_clinical.txt' (i.e. it mixes patient and sample attributes).
    Each returned line is a '#'-prefixed, tab-joined string with one value per
    column of `header`. Exits with status 2 if a column in `header` has no
    entry in CLINICAL_ATTRIBUTE_METADATA.
    """
    display_names = []
    descriptions = []
    datatypes = []
    attribute_types = []
    priorities = []

    is_mixed_attributes = ('data_clinical.txt' == os.path.basename(clinical_filename))
    for column in header:
        if column not in CLINICAL_ATTRIBUTE_METADATA:
            print(f'Clinical attribute not known: {column}', file=ERROR_FILE)
            print('Please add clinical attribute metadata before continuing. Exiting...', file=ERROR_FILE)
            sys.exit(2)
        # look the attribute up once instead of once per field
        attribute = CLINICAL_ATTRIBUTE_METADATA[column]
        display_names.append(attribute['DISPLAY_NAME'])
        descriptions.append(attribute['DESCRIPTION'])
        datatypes.append(attribute['DATATYPE'])
        priorities.append(attribute['PRIORITY'])
        # add attribute type only if clinical file contains mixed attributes
        if is_mixed_attributes:
            attribute_types.append(attribute['ATTRIBUTE_TYPE'])

    display_names = '#' + '\t'.join(display_names)
    descriptions = '#' + '\t'.join(descriptions)
    datatypes = '#' + '\t'.join(datatypes)
    priorities = '#' + '\t'.join(priorities)
    attribute_types = '#' + '\t'.join(attribute_types)

    if is_mixed_attributes:
        metadata = [display_names, descriptions, datatypes, attribute_types, priorities]
    else:
        metadata = [display_names, descriptions, datatypes, priorities]

    return metadata



def write_clinical_metadata(clinical_header, clinical_filename):
Expand All @@ -126,7 +128,7 @@ def write_clinical_metadata(clinical_header, clinical_filename):
clinical_metadata = get_clinical_header_metadata(clinical_header, clinical_filename)

# read the clinical data file and filter data by given header
clinical_file = open(clinical_filename, 'rU')
clinical_file = open(clinical_filename, 'r')
clinical_reader = csv.DictReader(clinical_file, dialect='excel-tab')
filtered_clinical_data = ['\t'.join(clinical_header)]
for line in clinical_reader:
Expand All @@ -147,13 +149,12 @@ def write_clinical_metadata(clinical_header, clinical_filename):
output_file.write('\n'.join(output_data))
output_file.close()

print 'Clinical file with metadata written to:', output_filename
print(f'Clinical file with metadata written to:{output_filename}')


def insert_clinical_metadata_main(directory):
""" Writes clinical data to separate clinical patient and clinical sample files. """
clinical_files = find_clinical_files(directory)

for clinical_filename in clinical_files:
# get the patient and sample clinical file headers
clinical_header = get_clinical_header(clinical_filename)
Expand All @@ -172,8 +173,8 @@ def find_clinical_files(directory):


def usage():
    """ Prints the command line usage to OUTPUT_FILE and exits with status 2. """
    print('insert_clinical_metadata.py --directory cancer/study/path', file=OUTPUT_FILE)
    sys.exit(2)

def main():
# get command line arguments
Expand All @@ -185,7 +186,7 @@ def main():

# exit if clinical file does not exist
if not os.path.exists(directory):
print 'No such directory:', directory
print(f'No such directory:{directory}')
sys.exit(2)

# load clinical attribute metadata
Expand All @@ -198,3 +199,4 @@ def main():

if __name__ == '__main__':
main()

10 changes: 5 additions & 5 deletions generate-case-lists/clinicalfile_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,12 @@ def get_all_metadata_mappings(file):
return all_metadata_mapping

def write_metadata_headers(metadata_lines,clinical_filename):
    """
    Prints the clinical metadata header lines to standard output.

    For each metadata key, the values in metadata_lines[key] are joined with
    tabs and any embedded newlines are removed before printing. Lines are
    printed in the order DISPLAY_NAME, DESCRIPTION, DATATYPE, then
    ATTRIBUTE_TYPE (only when is_old_format(clinical_filename) is true), then
    PRIORITY.
    """
    print('\t'.join(metadata_lines["DISPLAY_NAME"]).replace('\n', ''))
    print('\t'.join(metadata_lines["DESCRIPTION"]).replace('\n', ''))
    print('\t'.join(metadata_lines["DATATYPE"]).replace('\n', ''))
    # NOTE(review): only the ATTRIBUTE_TYPE line is assumed to be conditional;
    # PRIORITY is printed for every file - confirm against the original indentation
    if is_old_format(clinical_filename):
        print('\t'.join(metadata_lines["ATTRIBUTE_TYPE"]).replace('\n', ''))
    print('\t'.join(metadata_lines["PRIORITY"]).replace('\n', ''))

def write_header_line(line, output_file):
os.write(output_file, '#')
Expand Down
49 changes: 25 additions & 24 deletions generate-case-lists/generate_case_lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def generate_case_lists(case_list_config_filename, case_list_dir, study_dir, stu
# check full header matches what we expect
for column in CASE_LIST_CONFIG_HEADER_COLUMNS:
if column not in header:
print >> sys.stderr, "ERROR: column '%s' is not in '%s'" % (column, case_list_config_filename)
print(f"ERROR: column '{column}' is not in '{case_list_config_filename}'", file=sys.stderr)
sys.exit(2)

for line in case_list_config_file:
Expand All @@ -85,7 +85,7 @@ def generate_case_lists(case_list_config_filename, case_list_dir, study_dir, stu
case_list_file_full_path = os.path.join(case_list_dir, case_list_filename)
if os.path.isfile(case_list_file_full_path) and not overwrite:
if verbose:
print "LOG: generate_case_lists(), '%s' exists and overwrite is false, skipping caselist..." % (case_list_filename)
print(f"LOG: generate_case_lists(), '{case_list_filename}' exists and overwrite is false, skipping caselist...")
continue

# might be single staging file
Expand All @@ -97,7 +97,7 @@ def generate_case_lists(case_list_config_filename, case_list_dir, study_dir, stu
delimiter = CASE_LIST_UNION_DELIMITER if union_case_list else CASE_LIST_INTERSECTION_DELIMITER
staging_filenames = staging_filename_list.split(delimiter)
if verbose:
print "LOG: generate_case_lists(), staging filenames: %s" % (",".join(staging_filenames))
print(f"LOG: generate_case_lists(), staging filenames: {','.join(staging_filenames)}")

# if this is intersection all staging files must exist
if intersection_case_list and \
Expand All @@ -111,14 +111,14 @@ def generate_case_lists(case_list_config_filename, case_list_dir, study_dir, stu
num_staging_files_processed = 0
for staging_filename in staging_filenames:
if verbose:
print "LOG: generate_case_lists(), processing staging file '%s'" % (staging_filename)
print(f"LOG: generate_case_lists(), processing staging file '{staging_filename}'")
# compute the case set
case_list = []
case_list = get_case_list_from_staging_file(study_dir, staging_filename, verbose)

if len(case_list) == 0:
if verbose:
print "LOG: generate_case_lists(), no cases in '%s', skipping..." % (staging_filename)
print(f"LOG: generate_case_lists(), no cases in '{staging_filename}', skipping...")
continue

if intersection_case_list:
Expand All @@ -136,20 +136,20 @@ def generate_case_lists(case_list_config_filename, case_list_dir, study_dir, stu
# write case list file (don't make empty case lists)
if len(case_set) > 0:
if verbose:
print "LOG: generate_case_lists(), calling write_case_list_file()..."
print("LOG: generate_case_lists(), calling write_case_list_file()...")

# do not write out complete cases file unless we've processed all the files required
if intersection_case_list and num_staging_files_processed != len(staging_filenames):
if verbose:
print "LOG: generate_case_lists(), number of staging files processed (%d) != number of staging files required (%d) for '%s', skipping call to write_case_list_file()..." % (num_staging_files_processed, len(staging_filenames), case_list_filename)
print(f"LOG: generate_case_lists(), number of staging files processed ({num_staging_files_processed}) != number of staging files required ({len(staging_filenames)}) for '{case_list_filename}', skipping call to write_case_list_file()...")
else:
write_case_list_file(header, config_fields, study_id, case_list_file_full_path, case_set, verbose)
elif verbose:
print "LOG: generate_case_lists(), case_set.size() == 0, skipping call to write_case_list_file()..."
print("LOG: generate_case_lists(), case_set.size() == 0, skipping call to write_case_list_file()...")

def get_case_list_from_staging_file(study_dir, staging_filename, verbose):
if verbose:
print "LOG: get_case_list_from_staging_file(), '%s'" % (staging_filename)
print(f"LOG: get_case_list_from_staging_file(), '{staging_filename}'")

case_set = set([])

Expand All @@ -158,7 +158,7 @@ def get_case_list_from_staging_file(study_dir, staging_filename, verbose):
sequenced_samples_full_path = os.path.join(study_dir, SEQUENCED_SAMPLES_FILENAME)
if os.path.isfile(sequenced_samples_full_path):
if verbose:
print "LOG: get_case_list_from_staging_file(), '%s' exists, calling get_case_list_from_sequenced_samples_file()" % (SEQUENCED_SAMPLES_FILENAME)
print(f"LOG: get_case_list_from_staging_file(), '{SEQUENCED_SAMPLES_FILENAME}' exists, using sequenced_samples_file()")
return get_case_list_from_sequenced_samples_file(sequenced_samples_full_path, verbose)

staging_file_full_path = os.path.join(study_dir, staging_filename)
Expand All @@ -185,7 +185,7 @@ def get_case_list_from_staging_file(study_dir, staging_filename, verbose):
# we are assuming the header contains the case ids because SAMPLE_ID_COLUMN_HEADER is missing
if MUTATION_CASE_ID_COLUMN_HEADER not in values and SAMPLE_ID_COLUMN_HEADER not in [x.upper() for x in values]:
if verbose:
print "LOG: get_case_list_from_staging_file(), this is not a MAF header but has no '%s' column, we assume it contains sample ids..." % (SAMPLE_ID_COLUMN_HEADER)
print(f"LOG: get_case_list_from_staging_file(), this is not a MAF header but has no '{SAMPLE_ID_COLUMN_HEADER}' column, we assume it contains sample ids...")
for potential_case_id in values:
# check to filter out column headers other than sample ids
if potential_case_id.upper() in NON_CASE_IDS:
Expand All @@ -196,7 +196,7 @@ def get_case_list_from_staging_file(study_dir, staging_filename, verbose):
# we know at this point one of these columns exists, so no fear of ValueError from index method
id_column_index = values.index(MUTATION_CASE_ID_COLUMN_HEADER) if MUTATION_CASE_ID_COLUMN_HEADER in values else [x.upper() for x in values].index(SAMPLE_ID_COLUMN_HEADER)
if verbose:
print "LOG: get_case_list_from_staging_file(), this is a MAF or clinical file, samples ids in column with index: %d" % (id_column_index)
print(f"LOG: get_case_list_from_staging_file(), this is a MAF or clinical file, samples ids in column with index: {id_column_index}")
process_header = False
continue # done with header, move on to next line
case_set.add(values[id_column_index])
Expand All @@ -205,21 +205,21 @@ def get_case_list_from_staging_file(study_dir, staging_filename, verbose):

def get_case_list_from_sequenced_samples_file(sequenced_samples_full_path, verbose):
    """
    Returns the distinct case ids found in the sequenced samples file.

    Every line of the file is treated as one case id (the trailing newline is
    removed). Duplicates are collapsed, so the order of the returned list is
    not guaranteed. When `verbose` is true, progress is logged to standard
    output.
    """
    if verbose:
        print(f"LOG: get_case_list_from_sequenced_samples_file, '{sequenced_samples_full_path}'")

    with open(sequenced_samples_full_path, 'r') as sequenced_samples_file:
        case_set = {line.rstrip('\n') for line in sequenced_samples_file}

    if verbose:
        print(f"LOG: get_case_list_from_sequenced_samples_file, case set size: {len(case_set)}")

    return list(case_set)

def write_case_list_file(case_list_config_header, case_list_config_fields, study_id, case_list_full_path, case_set, verbose):
if verbose:
print "LOG: write_case_list_file(), '%s'" % (case_list_full_path)
print(f"LOG: write_case_list_file(), '{case_list_full_path}'")
with open(case_list_full_path, 'w') as case_list_file:
case_list_file.write("cancer_study_identifier: " + study_id + "\n")
stable_id = case_list_config_fields[case_list_config_header.index("META_STABLE_ID")].replace(CANCER_STUDY_TAG, study_id)
Expand Down Expand Up @@ -248,25 +248,26 @@ def main():
verbose = args.verbose

if verbose:
print "LOG: case_list_config_file='%s'" % (case_list_config_filename)
print "LOG: case_list_dir='%s'" % (case_list_dir)
print "LOG: study_dir='%s'" % (study_dir)
print "LOG: study_id='%s'" % (study_id)
print "LOG: overwrite='%s'" % (overwrite)
print "LOG: verbose='%s'" % (verbose)
print(f"LOG: case_list_config_file='{case_list_config_filename}'")
print(f"LOG: case_list_dir='{case_list_dir}'")
print(f"LOG: study_dir='{study_dir}'")
print(f"LOG: study_id='{study_id}'")
print(f"LOG: overwrite='{overwrite}'")
print(f"LOG: verbose='{verbose}'")

if not os.path.isfile(case_list_config_filename):
print >> sys.stderr, "ERROR: case list configuration file '%s' does not exist or is not a file" % (case_list_config_filename)
print(f"ERROR: case list configuration file '{case_list_config_filename}' does not exist or is not a file", file=sys.stderr)
sys.exit(2)
parser.print_help()
sys.exit(2)

if not os.path.isdir(case_list_dir):
print >> sys.stderr, "ERROR: case list file directory '%s' does not exist or is not a directory" % (case_list_dir)
print(f"ERROR: case list file directory '{case_list_dir}' does not exist or is not a directory", file=sys.stderr)
parser.print_help()
sys.exit(2)

if not os.path.isdir(study_dir):
print >> sys.stderr, "ERROR: study directory '%s' does not exist or is not a directory" % (study_dir)
print(f"ERROR: study directory '{study_dir}' does not exist or is not a directory", file=sys.stderr)
parser.print_help()
sys.exit(2)

Expand Down
2 changes: 1 addition & 1 deletion generate-meta-files/generate_meta_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,4 @@
file_path = f"{args.directory}/meta_study.txt"
with open(file_path, 'w') as meta_study:
meta_study.writelines(f"cancer_study_identifier: {args.study_id}\n")
for val in metadata: meta_study.write(val)
for val in metadata: meta_study.write(val)
10 changes: 5 additions & 5 deletions oncotree-code-converter/clinicalfile_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,12 @@ def get_all_metadata_mappings(file):
return all_metadata_mapping

def write_metadata_headers(metadata_lines,clinical_filename):
    """
    Prints the clinical metadata header lines to standard output.

    For each metadata key, the values in metadata_lines[key] are joined with
    tabs and any embedded newlines are removed before printing. Lines are
    printed in the order DISPLAY_NAME, DESCRIPTION, DATATYPE, then
    ATTRIBUTE_TYPE (only when is_old_format(clinical_filename) is true), then
    PRIORITY.
    """
    print('\t'.join(metadata_lines["DISPLAY_NAME"]).replace('\n', ''))
    print('\t'.join(metadata_lines["DESCRIPTION"]).replace('\n', ''))
    print('\t'.join(metadata_lines["DATATYPE"]).replace('\n', ''))
    # NOTE(review): only the ATTRIBUTE_TYPE line is assumed to be conditional;
    # PRIORITY is printed for every file - confirm against the original indentation
    if is_old_format(clinical_filename):
        print('\t'.join(metadata_lines["ATTRIBUTE_TYPE"]).replace('\n', ''))
    print('\t'.join(metadata_lines["PRIORITY"]).replace('\n', ''))

def write_header_line(line, output_file):
os.write(output_file, '#')
Expand Down
Loading