|
3 | 3 |
|
4 | 4 | """ |
5 | 5 | Copyright (C) 2021, Ontario Institute for Cancer Research |
6 | | -
|
| 6 | + |
7 | 7 | This program is free software: you can redistribute it and/or modify |
8 | 8 | it under the terms of the GNU Affero General Public License as published by |
9 | 9 | the Free Software Foundation, either version 3 of the License, or |
10 | 10 | (at your option) any later version. |
11 | | -
|
| 11 | + |
12 | 12 | This program is distributed in the hope that it will be useful, |
13 | 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | 15 | GNU Affero General Public License for more details. |
16 | | -
|
| 16 | + |
17 | 17 | You should have received a copy of the GNU Affero General Public License |
18 | 18 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | 19 |
|
|
22 | 22 | """ |
23 | 23 |
|
24 | 24 | import os |
25 | | -import sys |
26 | 25 | import argparse |
27 | | -import subprocess |
28 | | - |
| 26 | +import json |
| 27 | +import uuid |
| 28 | +import hashlib |
| 29 | +import copy |
| 30 | + |
# Lookup table keyed by variant type ('snv' / 'indel'). Positional schema:
#   [0] dataCategory, [1] dataType,
#   [2] analysis_tools for sanger-wgs/sanger-wxs input workflows,
#   [3] analysis_tools for the gatk-mutect2 input workflow.
variant_type_to_data_type_etc = {
    'snv': ['Simple Nucleotide Variation', 'Raw SNV Calls', ['CaVEMan', 'bcftools'], ['GATK-Mutect2', 'bcftools']],  # dataCategory, dataType, analysis_tools
    'indel': ['Simple Nucleotide Variation', 'Raw InDel Calls', ['Pindel', 'bcftools'], ['GATK-Mutect2', 'bcftools']]
}

# Maps the workflow name passed on the command line (-w) to the
# human-readable full name recorded in the payload's workflow section.
workflow_full_name = {
    'open-access-variant-filtering': 'Open Access Variant Filtering'
}
| 39 | + |
def calculate_size(file_path):
    """Return the size of *file_path* in bytes."""
    return os.path.getsize(file_path)
| 42 | + |
| 43 | + |
def calculate_md5(file_path):
    """Return the hex MD5 digest of *file_path*, streamed in 1 MiB chunks."""
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        while True:
            block = stream.read(1024 * 1024)
            if not block:  # EOF
                break
            digest.update(block)
    return digest.hexdigest()
| 50 | + |
| 51 | + |
def get_files_info(file_to_upload, args):
    """
    Build the SONG file-metadata entry for one output file and stage the file
    under ./out (relative to the CWD) as a symlink carrying its new name.

    The basename is expected to be dot-delimited with at least 8 fields, where
    field 5 is the upstream workflow short name and field 7 the variant type
    ('snv' or 'indel') — TODO confirm against the producing workflow.

    :param file_to_upload: path to a .vcf.gz or .vcf.gz.tbi file
    :param args: parsed CLI namespace (uses process_name, wf_short_name, controlled)
    :return: dict with fileName/fileType/fileSize/fileMd5sum/fileAccess/dataType/info
    """
    fname = os.path.basename(file_to_upload).split(".")
    input_wf = fname[5]       # upstream caller workflow short name
    variant_type = fname[7]   # 'snv' or 'indel'
    process_name = args.process_name if args.process_name else args.wf_short_name

    # Rebuild the name as: <first 8 fields>.<process>.vcf.gz[.tbi]
    suffix = ['tbi'] if file_to_upload.endswith('.tbi') else []
    new_fname = ".".join(fname[0:8] + [process_name, 'vcf.gz'] + suffix)

    file_info = {
        'fileName': new_fname,
        'fileType': 'VCF' if new_fname.endswith('.vcf.gz') else new_fname.split(".")[-1].upper(),
        'fileSize': calculate_size(file_to_upload),
        'fileMd5sum': calculate_md5(file_to_upload),
        'fileAccess': 'controlled' if args.controlled else 'open',
        'info': {
            'data_category': variant_type_to_data_type_etc[variant_type][0]
        }
    }

    # Note: '.vcf.gz.tbi' does not match the first branch, so order is safe.
    if file_to_upload.endswith('.vcf.gz'):
        file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1]
    elif file_to_upload.endswith('.vcf.gz.tbi'):
        file_info['dataType'] = 'VCF Index'

    # Pick the analysis-tool list matching the upstream caller workflow.
    if input_wf in ('sanger-wgs', 'sanger-wxs'):  # fix: tuple membership, not list-in-parens
        file_info['info']['analysis_tools'] = variant_type_to_data_type_etc[variant_type][2]
    elif input_wf == 'gatk-mutect2':
        file_info['info']['analysis_tools'] = variant_type_to_data_type_etc[variant_type][3]

    # Stage the renamed file in ./out as a symlink to the original.
    new_dir = 'out'
    os.makedirs(new_dir, exist_ok=True)  # fix: idiomatic replacement for mkdir + except FileExistsError
    dst = os.path.join(os.getcwd(), new_dir, new_fname)
    if os.path.islink(dst):
        os.remove(dst)  # fix: tolerate re-runs instead of crashing with FileExistsError
    print(dst)
    os.symlink(os.path.abspath(file_to_upload), dst)

    return file_info
| 92 | + |
def get_sample_info(sample_list):
    """
    Return a deep copy of *sample_list* with SONG-managed identifier fields
    removed from each sample and from its nested 'specimen' and 'donor'
    records. The input list is left untouched.
    """
    def _scrub(record):
        # Drop server-assigned identifiers; missing keys are ignored.
        for key in ('info', 'sampleId', 'specimenId', 'donorId', 'studyId'):
            record.pop(key, None)

    duplicates = copy.deepcopy(sample_list)
    for dup in duplicates:
        _scrub(dup)
        _scrub(dup['specimen'])
        _scrub(dup['donor'])

    return duplicates
29 | 102 |
|
def main():
    """
    Tool: payload-gen-variant-processing

    Read an upstream SONG analysis JSON, assemble a new analysis payload for
    the given workflow run, stage/describe each file to upload, and write the
    payload to '<uuid4>.<analysis_type>.payload.json' in the CWD.
    """

    parser = argparse.ArgumentParser(description='Tool: payload-gen-variant-processing')
    parser.add_argument("-a", dest="analysis", required=True,
                        help="json file containing sequencing_alignment SONG analysis for tumour sample")
    parser.add_argument("-f", dest="files_to_upload", type=str, nargs="+", help="Files to be uploaded", required=True)
    parser.add_argument("-w", dest="wf_name", type=str, help="workflow full name", required=True)
    parser.add_argument("-s", dest="wf_short_name", type=str, help="workflow short name", required=True)
    parser.add_argument("-v", dest="wf_version", type=str, required=True, help="workflow version")
    parser.add_argument("-r", dest="wf_run", type=str, required=True, help="workflow run ID")
    parser.add_argument("-j", dest="wf_session", type=str, required=True, help="workflow session ID")
    parser.add_argument("-c", dest="controlled", action='store_true', help="set file to be controlled access")
    parser.add_argument("-p", dest="process_name", type=str, help="variant process name")
    parser.add_argument("-t", dest="analysis_type", type=str, default="variant_processing", help="specify output song analysis type")
    args = parser.parse_args()

    # fix: dedicated handle name — the original reused 'f' for both file
    # handles and the upload-loop variable.
    with open(args.analysis, 'r') as analysis_file:
        analysis = json.load(analysis_file)

    # fix: default to {} so a document without 'analysisType' yields None
    # instead of raising AttributeError.
    input_analysis_type = analysis.get('analysisType', {}).get('name')

    payload = {
        'analysisType': {
            'name': args.analysis_type
        },
        'studyId': analysis.get('studyId'),
        'experiment': analysis.get('experiment'),
        'samples': get_sample_info(analysis.get('samples')),
        'files': [],
        'workflow': {
            'workflow_name': workflow_full_name.get(args.wf_name),
            'workflow_short_name': args.wf_short_name,
            'workflow_version': args.wf_version,
            'run_id': args.wf_run,
            'session_id': args.wf_session,
            'inputs': [
                {
                    'input_analysis_id': analysis.get('analysisId'),
                    'analysis_type': input_analysis_type
                }
            ],
            'genome_build': 'GRCh38_hla_decoy_ebv'
        },
        'variant_class': analysis.get('variant_class')
    }

    # Describe (and stage under ./out) every file to be uploaded.
    for upload in args.files_to_upload:
        payload['files'].append(get_files_info(upload, args))

    payload_fname = "%s.%s.payload.json" % (str(uuid.uuid4()), args.analysis_type)
    with open(payload_fname, 'w') as payload_file:
        json.dump(payload, payload_file, indent=2)  # fix: stream directly instead of dumps+write
if __name__ == "__main__":
    # fix: the guard body (the actual entry-point call) was missing.
    main()