Skip to content

Commit 7665e4f

Browse files
committed
migrated payload-gen-seq-experiment to wfpm package
1 parent 99fd818 commit 7665e4f

26 files changed

Lines changed: 769 additions & 76 deletions

payload-gen-seq-experiment/main.nf

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
along with this program. If not, see <http://www.gnu.org/licenses/>.
1818
1919
Authors:
20+
Linda Xiang
2021
Junjun Zhang
2122
*/
2223

@@ -44,8 +45,10 @@ params.publish_dir = "" // set to empty string will disable publishDir
4445

4546

4647
// tool specific params go here, add / change as needed.
// The "NO_FILE*" sentinels mark an optional input as absent; the process
// below only passes a file to main.py when its name does not start with
// "NO_FILE". The numeric suffixes keep the four staged placeholder file
// names distinct inside the work directory.
params.metadata_json = "NO_FILE1"
params.experiment_info_tsv = "NO_FILE2"
params.read_group_info_tsv = "NO_FILE3"
params.file_info_tsv = "NO_FILE4"
4952

5053

5154
process payloadGenSeqExperiment {
    // NOTE(review): this is reconstructed from a diff hunk that starts
    // mid-process — directives preceding `cpus` (e.g. container/publishDir)
    // are not visible here; confirm against the full file.
    cpus params.cpus
    memory "${params.mem} GB"

    input:  // each input may be a "NO_FILE*" placeholder (see params above)
    path metadata_json
    path experiment_info_tsv
    path read_group_info_tsv
    path file_info_tsv

    output:  // JSON payload named <uuid>.sequencing_experiment.payload.json by main.py
    path "*.sequencing_experiment.payload.json", emit: payload

    script:
    // build a CLI flag for each input only when a real file was provided;
    // placeholders (names starting with "NO_FILE") yield an empty string
    args_metadata_json = !metadata_json.name.startsWith("NO_FILE") ? "-m ${metadata_json}" : ""
    args_experiment_info_tsv = !experiment_info_tsv.name.startsWith("NO_FILE") ? "-x ${experiment_info_tsv}" : ""
    args_read_group_info_tsv = !read_group_info_tsv.name.startsWith("NO_FILE") ? "-r ${read_group_info_tsv}" : ""
    args_file_info_tsv = !file_info_tsv.name.startsWith("NO_FILE") ? "-f ${file_info_tsv}" : ""

    """
    main.py \
      ${args_metadata_json} \
      ${args_experiment_info_tsv} \
      ${args_read_group_info_tsv} \
      ${args_file_info_tsv}
    """
}
7684

@@ -79,6 +87,9 @@ process payloadGenSeqExperiment {
7987
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
8088
// Entry workflow: forward the four (possibly placeholder) inputs to the
// payload-generation process. Run with:
//   nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
workflow {
    payloadGenSeqExperiment(
        file(params.metadata_json),
        file(params.experiment_info_tsv),
        file(params.read_group_info_tsv),
        file(params.file_info_tsv)
    )
}

payload-gen-seq-experiment/main.py

Lines changed: 245 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,55 +1,261 @@
11
#!/usr/bin/env python3
2-
# -*- coding: utf-8 -*-
32

43
"""
5-
Copyright (C) 2021, Ontario Institute for Cancer Research
6-
7-
This program is free software: you can redistribute it and/or modify
8-
it under the terms of the GNU Affero General Public License as published by
9-
the Free Software Foundation, either version 3 of the License, or
10-
(at your option) any later version.
11-
12-
This program is distributed in the hope that it will be useful,
13-
but WITHOUT ANY WARRANTY; without even the implied warranty of
14-
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15-
GNU Affero General Public License for more details.
16-
17-
You should have received a copy of the GNU Affero General Public License
18-
along with this program. If not, see <http://www.gnu.org/licenses/>.
19-
20-
Authors:
21-
Junjun Zhang
22-
"""
4+
Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
5+
6+
This program is free software: you can redistribute it and/or modify
7+
it under the terms of the GNU Affero General Public License as published
8+
by the Free Software Foundation, either version 3 of the License, or
9+
(at your option) any later version.
10+
11+
This program is distributed in the hope that it will be useful,
12+
but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
GNU Affero General Public License for more details.
15+
16+
You should have received a copy of the GNU Affero General Public License
17+
along with this program. If not, see <https://www.gnu.org/licenses/>.
18+
19+
Authors:
20+
Linda Xiang <linda.xiang@oicr.on.ca>
21+
Junjun Zhang <junjun.zhang@oicr.on.ca>
22+
"""
23+
2324

24-
import os
2525
import sys
26-
import argparse
27-
import subprocess
26+
import uuid
27+
import json
28+
import csv
29+
import textwrap
30+
from argparse import ArgumentParser
2831

2932

30-
def main():
31-
"""
32-
Python implementation of tool: payload-gen-seq-experiment
33+
# Expected column headers for each of the three input TSV types.
# tsv_confomity_check() validates uploaded sheets against these lists:
# no duplicate, missing, or unexpected columns are tolerated.
TSV_FIELDS = {
    'experiment': [
        'type', 'program_id', 'submitter_sequencing_experiment_id', 'submitter_donor_id', 'gender',
        'submitter_specimen_id', 'tumour_normal_designation', 'specimen_type', 'specimen_tissue_source', 'submitter_sample_id',
        'sample_type', 'submitter_matched_normal_sample_id', 'sequencing_center', 'platform', 'platform_model',
        'experimental_strategy', 'sequencing_date', 'read_group_count',
    ],
    'read_group': [
        'type', 'submitter_read_group_id', 'read_group_id_in_bam', 'submitter_sequencing_experiment_id', 'platform_unit',
        'is_paired_end', 'file_r1', 'file_r2', 'read_length_r1', 'read_length_r2', 'insert_size', 'sample_barcode', 'library_name',
    ],
    'file': [
        'type', 'name', 'size', 'md5sum', 'path', 'format',
    ],
}
3348

34-
This is auto-generated Python code, please update as needed!
35-
"""
3649

37-
parser = argparse.ArgumentParser(description='Tool: payload-gen-seq-experiment')
38-
parser.add_argument('-i', '--input-file', dest='input_file', type=str,
39-
help='Input file', required=True)
40-
parser.add_argument('-o', '--output-dir', dest='output_dir', type=str,
41-
help='Output directory', required=True)
42-
args = parser.parse_args()
50+
def empty_str_to_null(metadata):
    """Replace empty-string and "_NULL_" placeholder values with None, in place.

    Recurses into the 'read_groups' and 'files' lists, whose elements are
    dicts of the same shape. Mutates *metadata*; returns nothing.
    """
    for key, value in metadata.items():
        if key in ('read_groups', 'files'):
            for entry in value:
                empty_str_to_null(entry)
        if isinstance(value, str) and value in ("", "_NULL_"):
            metadata[key] = None
57+
58+
59+
def tsv_confomity_check(ftype, tsv):
    """Validate that *tsv* is a well-formed sheet of type *ftype*.

    The header row is checked against TSV_FIELDS[ftype]: no duplicated,
    missing, or unexpected columns. Every data row must carry exactly the
    expected number of tab-separated values and must be unique within the
    file. The process exits with an error message on the first violation;
    detailed content validation happens in later steps.
    """
    expected_fields = TSV_FIELDS[ftype]

    header_processed = False
    seen_rows = set()
    with open(tsv, 'r') as t:
        for l in t:
            # drop trailing newline, plus a stray '\r' from Windows files (just in case)
            l = l.rstrip('\n').rstrip('\r')

            if not header_processed:  # first line is the header
                fields = l.split('\t')
                if len(fields) != len(set(fields)):
                    sys.exit("Error found: Field duplicated in input TSV: %s, offending header: %s\n" % (tsv, l))

                missed_fields = set(expected_fields) - set(fields)
                if missed_fields:
                    sys.exit("Error found: Field missing in input TSV: %s, offending header: %s. Missed field(s): %s\n" % \
                        (tsv, l, ', '.join(missed_fields)))

                unexpected_fields = set(fields) - set(expected_fields)
                if unexpected_fields:
                    sys.exit("Error found: Unexpected field in input TSV: %s, offending header: %s. Unexpected field(s): %s\n" % \
                        (tsv, l, ', '.join(unexpected_fields)))

                header_processed = True
                continue

            # data row: only column count and uniqueness are checked here
            values = l.split('\t')
            if len(expected_fields) != len(values):
                sys.exit("Error found: number of fields: %s does not match expected: %s, offending data row: %s\n" % \
                    (len(values), len(expected_fields), l))

            if l in seen_rows:
                sys.exit("Error found: data row repeated in file: %s, offending data row: %s\n" % (tsv, l))
            seen_rows.add(l)
96+
97+
98+
def load_all_tsvs(exp_tsv, rg_tsv, file_tsv):
    """Assemble a single metadata dict from the three input TSV files.

    Args:
        exp_tsv: experiment TSV; must contain exactly one data row, which
            becomes the top level of the returned dict ('read_group_count'
            is converted to int).
        rg_tsv: read-group TSV; rows become the 'read_groups' list, with
            'is_paired_end' normalized to a bool (None if unrecognized) and
            the numeric fields converted to int (None when empty).
        file_tsv: file TSV; rows become the 'files' list, with 'size'
            converted to int (None when empty).

    Exits the process with an error message when the experiment TSV does not
    have exactly one row, or when either list would be empty.
    """
    metadata_dict = {}

    with open(exp_tsv, 'r') as fh:
        rows = list(csv.DictReader(fh, delimiter='\t'))
        if len(rows) != 1:
            sys.exit("Error found: experiment TSV expects exactly one data row, offending file: %s has %s row(s)\n" % \
                (exp_tsv, len(rows)))
        rows[0]['read_group_count'] = int(rows[0]['read_group_count'])
        metadata_dict.update(rows[0])

    with open(rg_tsv, 'r') as fh:
        metadata_dict['read_groups'] = []
        for rg in csv.DictReader(fh, delimiter='\t'):
            # normalize the TSV string to a real boolean; anything other
            # than 'true'/'false' (case-insensitive) becomes None
            if rg['is_paired_end'].lower() == 'true':
                rg['is_paired_end'] = True
            elif rg['is_paired_end'].lower() == 'false':
                rg['is_paired_end'] = False
            else:
                rg['is_paired_end'] = None

            # numeric fields: empty string means "not provided"
            for field in ('read_length_r1', 'read_length_r2', 'insert_size'):
                rg[field] = int(rg[field]) if rg[field] else None

            metadata_dict['read_groups'].append(rg)

    if len(metadata_dict['read_groups']) == 0:
        sys.exit("Error found: read group TSV does not contain any read group information\n")

    # NOTE: the original loop shadowed the open file handle with the row
    # variable ('for f in csv.DictReader(f, ...)') — renamed for clarity.
    with open(file_tsv, 'r') as fh:
        metadata_dict['files'] = []
        for file_info in csv.DictReader(fh, delimiter='\t'):
            file_info['size'] = int(file_info['size']) if file_info['size'] else None
            metadata_dict['files'].append(file_info)

    if len(metadata_dict['files']) == 0:
        sys.exit("Error found: file TSV does not contain any file information\n")

    return metadata_dict
143+
144+
145+
def validate_args(args):
    """Enforce the two mutually exclusive input modes.

    Either '-m' (metadata JSON) is given alone, or all three of '-x', '-r'
    and '-f' (TSV sheets) are given together. Returns True when the
    combination is valid; otherwise exits with a usage message.
    """
    tsv_args = (args.experiment_info_tsv, args.read_group_info_tsv, args.file_info_tsv)

    if args.metadata_json and not any(tsv_args):
        return True
    if not args.metadata_json and all(tsv_args):
        return True

    sys.exit(textwrap.dedent(
        """
        Usage:
        When '-m' is provided, no other arguments can be used
        When '-m' is not provided, please provide all of these arguments: -x, -r and -f
        """
    ))
160+
161+
162+
def main(metadata):
    """Build a 'sequencing_experiment' payload from *metadata* and write it out.

    *metadata* is the dict produced either by load_all_tsvs() or loaded from
    the user-supplied metadata JSON. The payload is written to
    '<uuid4>.sequencing_experiment.payload.json' in the current working
    directory (the Nextflow process captures it via that glob pattern).

    Note: mutates *metadata* — placeholder strings are nulled in place and
    bookkeeping fields are popped from each read group.
    """
    empty_str_to_null(metadata)

    payload = {
        'analysisType': {
            'name': 'sequencing_experiment'
        },
        'studyId': metadata.get('program_id'),
        'experiment': {
            'submitter_sequencing_experiment_id': metadata.get('submitter_sequencing_experiment_id'),
            'sequencing_center': metadata.get('sequencing_center'),
            'platform': metadata.get('platform'),
            'platform_model': metadata.get('platform_model'),
            'experimental_strategy': metadata.get('experimental_strategy'),
            'sequencing_date': metadata.get('sequencing_date')
        },
        'read_group_count': metadata.get('read_group_count'),
        'read_groups': [],
        'samples': [],
        'files': []
    }

    # sample section (exactly one sample, with nested specimen/donor)
    sample = {
        'submitterSampleId': metadata.get('submitter_sample_id'),
        'matchedNormalSubmitterSampleId': metadata.get('submitter_matched_normal_sample_id'),
        'sampleType': metadata.get('sample_type'),
        'specimen': {
            'submitterSpecimenId': metadata.get('submitter_specimen_id'),
            'tumourNormalDesignation': metadata.get('tumour_normal_designation'),
            'specimenTissueSource': metadata.get('specimen_tissue_source'),
            'specimenType': metadata.get('specimen_type')
        },
        'donor': {
            'submitterDonorId': metadata.get('submitter_donor_id'),
            'gender': metadata.get('gender')
        }
    }

    payload['samples'].append(sample)

    # file section
    for input_file in metadata.get("files"):
        payload['files'].append(
            {
                'fileName': input_file.get('name'),
                'fileSize': input_file.get('size'),
                'fileMd5sum': input_file.get('md5sum'),
                'fileType': input_file.get('format'),
                'fileAccess': 'controlled',
                'dataType': 'Submitted Reads',
                'info': {
                    'data_category': 'Sequencing Reads'
                }
            }
        )

    # read-group section: drop TSV bookkeeping columns before embedding.
    # pop with a default so JSON-mode metadata that omits these keys
    # does not raise KeyError (the original pops crashed in that case).
    for rg in metadata.get("read_groups"):
        rg.pop('type', None)
        rg.pop('submitter_sequencing_experiment_id', None)
        payload['read_groups'].append(rg)

    with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
        json.dump(payload, f, indent=2)
51226

52227

53228
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata-json",
                        help="json file containing experiment, read_group and file information submitted from user")
    parser.add_argument("-x", "--experiment-info-tsv",
                        help="tsv file containing experiment information submitted from user")
    parser.add_argument("-r", "--read-group-info-tsv",
                        help="tsv file containing read_group information submitted from user")
    parser.add_argument("-f", "--file-info-tsv",
                        help="tsv file containing file information submitted from user")
    args = parser.parse_args()

    # exits with a usage message when the -m / (-x,-r,-f) combination is invalid
    validate_args(args)

    if args.metadata_json:
        with open(args.metadata_json, 'r') as f:
            metadata = json.load(f)
    else:
        # firstly TSV format conformity check, if not well-formed no point to continue
        tsv_confomity_check('experiment', args.experiment_info_tsv)
        tsv_confomity_check('read_group', args.read_group_info_tsv)
        tsv_confomity_check('file', args.file_info_tsv)

        # all TSV are well-formed, let's load them
        # (a second, duplicated load_all_tsvs() call was removed here)
        metadata = load_all_tsvs(
            args.experiment_info_tsv,
            args.read_group_info_tsv,
            args.file_info_tsv
        )

    main(metadata)

0 commit comments

Comments
 (0)