|
1 | 1 | #!/usr/bin/env python3 |
2 | | -# -*- coding: utf-8 -*- |
3 | 2 |
|
4 | 3 | """ |
5 | | - Copyright (C) 2021, Ontario Institute for Cancer Research |
6 | | - |
7 | | - This program is free software: you can redistribute it and/or modify |
8 | | - it under the terms of the GNU Affero General Public License as published by |
9 | | - the Free Software Foundation, either version 3 of the License, or |
10 | | - (at your option) any later version. |
11 | | - |
12 | | - This program is distributed in the hope that it will be useful, |
13 | | - but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | - GNU Affero General Public License for more details. |
16 | | - |
17 | | - You should have received a copy of the GNU Affero General Public License |
18 | | - along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | | -
|
20 | | - Authors: |
21 | | - Junjun Zhang |
22 | | -""" |
| 4 | + Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR). |
| 5 | +
|
| 6 | + This program is free software: you can redistribute it and/or modify |
| 7 | + it under the terms of the GNU Affero General Public License as published |
| 8 | + by the Free Software Foundation, either version 3 of the License, or |
| 9 | + (at your option) any later version. |
| 10 | +
|
| 11 | + This program is distributed in the hope that it will be useful, |
| 12 | + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 13 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 14 | + GNU Affero General Public License for more details. |
| 15 | +
|
| 16 | + You should have received a copy of the GNU Affero General Public License |
| 17 | + along with this program. If not, see <https://www.gnu.org/licenses/>. |
| 18 | +
|
| 19 | + Authors: |
| 20 | + Linda Xiang <linda.xiang@oicr.on.ca> |
| 21 | + Junjun Zhang <junjun.zhang@oicr.on.ca> |
| 22 | + """ |
| 23 | + |
23 | 24 |
|
24 | | -import os |
25 | 25 | import sys |
26 | | -import argparse |
27 | | -import subprocess |
| 26 | +import uuid |
| 27 | +import json |
| 28 | +import csv |
| 29 | +import textwrap |
| 30 | +from argparse import ArgumentParser |
28 | 31 |
|
29 | 32 |
|
30 | | -def main(): |
31 | | - """ |
32 | | - Python implementation of tool: payload-gen-seq-experiment |
# Expected column headers for each of the three input TSV types.
# tsv_confomity_check() validates that each TSV's header contains exactly
# these fields (any order) and that every data row has this many values.
TSV_FIELDS = {
    'experiment': [
        'type', 'program_id', 'submitter_sequencing_experiment_id', 'submitter_donor_id', 'gender',
        'submitter_specimen_id', 'tumour_normal_designation', 'specimen_type', 'specimen_tissue_source', 'submitter_sample_id',
        'sample_type', 'submitter_matched_normal_sample_id', 'sequencing_center', 'platform', 'platform_model',
        'experimental_strategy', 'sequencing_date', 'read_group_count'
    ],
    'read_group': [
        'type', 'submitter_read_group_id', 'read_group_id_in_bam', 'submitter_sequencing_experiment_id', 'platform_unit',
        'is_paired_end', 'file_r1', 'file_r2', 'read_length_r1', 'read_length_r2', 'insert_size', 'sample_barcode', 'library_name'
    ],
    'file': [
        'type', 'name', 'size', 'md5sum', 'path', 'format'
    ]
}
33 | 48 |
|
34 | | - This is auto-generated Python code, please update as needed! |
35 | | - """ |
36 | 49 |
|
37 | | - parser = argparse.ArgumentParser(description='Tool: payload-gen-seq-experiment') |
38 | | - parser.add_argument('-i', '--input-file', dest='input_file', type=str, |
39 | | - help='Input file', required=True) |
40 | | - parser.add_argument('-o', '--output-dir', dest='output_dir', type=str, |
41 | | - help='Output directory', required=True) |
42 | | - args = parser.parse_args() |
def empty_str_to_null(metadata):
    """Normalize placeholder values in *metadata* in place.

    Any string value equal to '' or '_NULL_' is replaced with None. The
    dicts nested in the 'read_groups' and 'files' lists are normalized
    recursively the same way.
    """
    for key, value in list(metadata.items()):
        if key in ('read_groups', 'files'):
            # recurse into each nested record dict
            for record in metadata[key]:
                empty_str_to_null(record)
        if isinstance(value, str) and value in ("", "_NULL_"):
            metadata[key] = None
| 57 | + |
| 58 | + |
def tsv_confomity_check(ftype, tsv):
    """Validate that the TSV at *tsv* conforms to TSV_FIELDS[*ftype*].

    Exits the program with an error message when the header has duplicated,
    missing or unexpected fields, when a data row's value count differs from
    the expected field count, or when a data row is repeated verbatim.
    Later steps perform the more sophisticated content checks.
    """
    expected_fields = TSV_FIELDS[ftype]
    expected_set = set(expected_fields)

    with open(tsv, 'r') as t:
        seen_rows = set()
        saw_header = False
        for line in t:
            # remove trailing newline, remove windows `\r` (just in case)
            line = line.rstrip('\n').rstrip('\r')

            if not saw_header:
                # first line is the header
                fields = line.split('\t')
                if len(fields) != len(set(fields)):
                    sys.exit("Error found: Field duplicated in input TSV: %s, offending header: %s\n" % (tsv, line))

                missed_fields = expected_set - set(fields)
                if missed_fields:  # missing fields
                    sys.exit("Error found: Field missing in input TSV: %s, offending header: %s. Missed field(s): %s\n" % \
                        (tsv, line, ', '.join(missed_fields)))

                unexpected_fields = set(fields) - expected_set
                if unexpected_fields:  # unexpected fields
                    sys.exit("Error found: Unexpected field in input TSV: %s, offending header: %s. Unexpected field(s): %s\n" % \
                        (tsv, line, ', '.join(unexpected_fields)))

                saw_header = True
                continue

            # data row: only check value count and row uniqueness here
            values = line.split('\t')
            if len(values) != len(expected_fields):
                sys.exit("Error found: number of fields: %s does not match expected: %s, offending data row: %s\n" % \
                    (len(values), len(expected_fields), line))

            if line in seen_rows:
                sys.exit("Error found: data row repeated in file: %s, offending data row: %s\n" % (tsv, line))
            seen_rows.add(line)
| 97 | + |
def load_all_tsvs(exp_tsv, rg_tsv, file_tsv):
    """Load the experiment, read-group and file TSVs into one metadata dict.

    Args:
        exp_tsv: path to the experiment TSV; must contain exactly one data row.
        rg_tsv: path to the read-group TSV; at least one data row required.
        file_tsv: path to the file TSV; at least one data row required.

    Returns:
        A dict with the experiment columns at the top level plus
        'read_groups' and 'files' lists of per-row dicts. Numeric columns
        are converted to int (empty values become None) and
        'is_paired_end' is converted to True/False/None.

    Exits the program with an error message on row-count violations.
    """
    metadata_dict = {}
    with open(exp_tsv, 'r') as f:
        rows = list(csv.DictReader(f, delimiter='\t'))
        if len(rows) != 1:
            sys.exit("Error found: experiment TSV expects exactly one data row, offending file: %s has %s row(s)\n" % \
                (exp_tsv, len(rows)))
        rows[0]['read_group_count'] = int(rows[0]['read_group_count'])
        metadata_dict.update(rows[0])

    with open(rg_tsv, 'r') as f:
        metadata_dict['read_groups'] = []
        for rg in csv.DictReader(f, delimiter='\t'):
            # normalize the textual boolean; anything other than
            # 'true'/'false' (case-insensitive) becomes None
            paired = rg['is_paired_end'].lower()
            if paired == 'true':
                rg['is_paired_end'] = True
            elif paired == 'false':
                rg['is_paired_end'] = False
            else:
                rg['is_paired_end'] = None

            for field in ('read_length_r1', 'read_length_r2', 'insert_size'):
                rg[field] = int(rg[field]) if rg[field] else None

            metadata_dict['read_groups'].append(rg)

    if len(metadata_dict['read_groups']) == 0:
        sys.exit("Error found: read group TSV does not contain any read group information\n")

    with open(file_tsv, 'r') as f:
        metadata_dict['files'] = []
        # BUGFIX: the loop variable used to be named 'f', shadowing the open
        # file handle it was iterating over; renamed to 'file_info'
        for file_info in csv.DictReader(f, delimiter='\t'):
            file_info['size'] = int(file_info['size']) if file_info['size'] else None
            metadata_dict['files'].append(file_info)

    if len(metadata_dict['files']) == 0:
        sys.exit("Error found: file TSV does not contain any file information\n")

    return metadata_dict
| 143 | + |
| 144 | + |
def validate_args(args):
    """Check mutually exclusive input modes and exit with usage on violation.

    Valid combinations: '-m' alone, or all three of '-x', '-r', '-f'.
    Returns True when the arguments are valid; otherwise exits the program.
    """
    has_json = bool(args.metadata_json)
    any_tsv = bool(args.experiment_info_tsv or args.read_group_info_tsv or args.file_info_tsv)
    all_tsvs = bool(args.experiment_info_tsv and args.read_group_info_tsv and args.file_info_tsv)

    if has_json and not any_tsv:
        return True
    if not has_json and all_tsvs:
        return True

    sys.exit(textwrap.dedent(
        """
        Usage:
        When '-m' is provided, no other arguments can be used
        When '-m' is not provided, please provide all of these arguments: -x, -r and -f
        """
    ))
| 160 | + |
| 161 | + |
def main(metadata):
    """Build a 'sequencing_experiment' payload from *metadata* and write it.

    Writes '<uuid4>.sequencing_experiment.payload.json' to the current
    working directory. *metadata* is mutated: placeholder strings are
    nulled via empty_str_to_null() and bookkeeping fields are stripped
    from each read group.
    """
    empty_str_to_null(metadata)

    payload = {
        'analysisType': {
            'name': 'sequencing_experiment'
        },
        'studyId': metadata.get('program_id'),
        'experiment': {
            'submitter_sequencing_experiment_id': metadata.get('submitter_sequencing_experiment_id'),
            'sequencing_center': metadata.get('sequencing_center'),
            'platform': metadata.get('platform'),
            'platform_model': metadata.get('platform_model'),
            'experimental_strategy': metadata.get('experimental_strategy'),
            'sequencing_date': metadata.get('sequencing_date')
        },
        'read_group_count': metadata.get('read_group_count'),
        'read_groups': [],
        'samples': [],
        'files': []
    }

    # get sample of the payload
    sample = {
        'submitterSampleId': metadata.get('submitter_sample_id'),
        'matchedNormalSubmitterSampleId': metadata.get('submitter_matched_normal_sample_id'),
        'sampleType': metadata.get('sample_type'),
        'specimen': {
            'submitterSpecimenId': metadata.get('submitter_specimen_id'),
            'tumourNormalDesignation': metadata.get('tumour_normal_designation'),
            'specimenTissueSource': metadata.get('specimen_tissue_source'),
            'specimenType': metadata.get('specimen_type')
        },
        'donor': {
            'submitterDonorId': metadata.get('submitter_donor_id'),
            'gender': metadata.get('gender')
        }
    }

    payload['samples'].append(sample)

    # get file of the payload
    for input_file in metadata.get("files"):
        payload['files'].append(
            {
                'fileName': input_file.get('name'),
                'fileSize': input_file.get('size'),
                'fileMd5sum': input_file.get('md5sum'),
                'fileType': input_file.get('format'),
                'fileAccess': 'controlled',
                'dataType': 'Submitted Reads',
                'info': {
                    'data_category': 'Sequencing Reads'
                }
            }
        )

    for rg in metadata.get("read_groups"):
        # BUGFIX: these bookkeeping fields are guaranteed only on the TSV
        # input path; metadata loaded from a JSON file ('-m') may omit them,
        # so pop with a default instead of raising KeyError
        rg.pop('type', None)
        rg.pop('submitter_sequencing_experiment_id', None)
        payload['read_groups'].append(rg)

    with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
        f.write(json.dumps(payload, indent=2))
51 | 226 |
|
52 | 227 |
|
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata-json",
                        help="json file containing experiment, read_group and file information submitted from user")
    parser.add_argument("-x", "--experiment-info-tsv",
                        help="tsv file containing experiment information submitted from user")
    parser.add_argument("-r", "--read-group-info-tsv",
                        help="tsv file containing read_group information submitted from user")
    parser.add_argument("-f", "--file-info-tsv",
                        help="tsv file containing file information submitted from user")
    args = parser.parse_args()

    # exits with a usage message unless exactly one input mode was given
    validate_args(args)

    if args.metadata_json:
        with open(args.metadata_json, 'r') as f:
            metadata = json.load(f)
    else:
        # firstly TSV format conformity check, if not well-formed no point to continue
        tsv_confomity_check('experiment', args.experiment_info_tsv)
        tsv_confomity_check('read_group', args.read_group_info_tsv)
        tsv_confomity_check('file', args.file_info_tsv)

        # all TSV are well-formed, let's load them
        # BUGFIX: a duplicated second load_all_tsvs() call (merge artifact)
        # was removed here; it re-read all three TSVs unconditionally and
        # would crash the '-m' JSON path with open(None)
        metadata = load_all_tsvs(
            args.experiment_info_tsv,
            args.read_group_info_tsv,
            args.file_info_tsv
        )

    main(metadata)
0 commit comments