Skip to content

Commit 673fd65

Browse files
authored
Merge pull request #102 from icgc-argo/payload-gen-seq-experiment.0.2.0.0
update payload-gen-seq-experiment
2 parents 3b7f120 + cf288ba commit 673fd65

13 files changed

Lines changed: 216 additions & 42 deletions

tests/data/experiment-fq.v2.tsv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
type program_id submitter_sequencing_experiment_id submitter_donor_id gender submitter_specimen_id tumour_normal_designation specimen_type specimen_tissue_source submitter_sample_id sample_type submitter_matched_normal_sample_id sequencing_center platform platform_model experimental_strategy sequencing_date read_group_count
2+
sequencing_experiment TEST-PRO TEST_EXP HCC1143 Female HCC1143_FASTQ_INPUT Tumour Cell line - derived from tumour Blood derived HCC1143_FASTQ_INPUT DNA HCC1143_BAM_INPUT EXT ILLUMINA HiSeq 2000 WGS 2014-12-12 3

tests/data/experiment.v2.tsv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
type program_id submitter_sequencing_experiment_id submitter_donor_id gender submitter_specimen_id tumour_normal_designation specimen_type specimen_tissue_source submitter_sample_id sample_type submitter_matched_normal_sample_id sequencing_center platform platform_model experimental_strategy sequencing_date read_group_count
2+
sequencing_experiment TEST-PRO TEST_EXP HCC1143 Female HCC1143_BAM_INPUT Normal Cell line - derived from normal Blood derived HCC1143_BAM_INPUT DNA EXT ILLUMINA HiSeq 2000 WGS 2014-12-12 3

tests/data/file-fq.v2.tsv

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
type name format size md5sum path
2+
file C0HVY.2_r1.fq FASTQ 5381 6584ebb05edfbd6f59be6307556bd871 C0HVY.2_r1.fq
3+
file C0HVY.2_r2.fq FASTQ 5381 d649696f346eb75b95b68e8ecd45f44f C0HVY.2_r2.fq
4+
file D0RE2.1_r1.fq FASTQ 6148 058510bbdab2dccc14a2d7402a21248d D0RE2.1_r1.fq
5+
file D0RE2.1_r2.fq FASTQ 6148 5fb35647198666b6a06e8539074282fd D0RE2.1_r2.fq
6+
file D0RH0.2_r1.fq FASTQ 4784 eb109b8774df03084024bc09a86721be D0RH0.2_r1.fq
7+
file D0RH0.2_r2.fq FASTQ 4784 ed209fda15a5928087104253a6bfb42d D0RH0.2_r2.fq

tests/data/file.v2.tsv

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
type name format size md5sum path
2+
file test_rg_3.bam BAM 14911 178f97f7b1ca8bfc28fd5586bdd56799 test_rg_3.bam

tests/data/read_group-fq.v2.tsv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
type submitter_read_group_id read_group_id_in_bam submitter_sequencing_experiment_id platform_unit is_paired_end file_r1 file_r2 read_length_r1 read_length_r2 insert_size sample_barcode library_name
2+
read_group C0HVY.2 TEST_EXP 74_8a true C0HVY.2_r1.fq C0HVY.2_r2.fq 150 150 298 Pond-147580
3+
read_group D0RE2.1 TEST_EXP 74_8b true D0RE2.1_r1.fq D0RE2.1_r2.fq 150 150 298 Pond-147580
4+
read_group D0RH0.2 TEST_EXP 74_8c true D0RH0.2_r1.fq D0RH0.2_r2.fq 150 150 298 Pond-147580

tests/data/read_group.v2.tsv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
type submitter_read_group_id read_group_id_in_bam submitter_sequencing_experiment_id platform_unit is_paired_end file_r1 file_r2 read_length_r1 read_length_r2 insert_size sample_barcode library_name
2+
read_group C0HVY.2 TEST_EXP 74_8a true test_rg_3.bam test_rg_3.bam 150 150 298 Pond-147580
3+
read_group D0RE2.1 TEST_EXP 74_8b true test_rg_3.bam test_rg_3.bam 150 150 298 Pond-147580
4+
read_group D0RH0.2 TEST_EXP 74_8c true test_rg_3.bam test_rg_3.bam 150 150 298 Pond-147580

tools/payload-gen-seq-experiment/payload-gen-seq-experiment.nf

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,34 +22,38 @@
2222
*/
2323

2424
nextflow.enable.dsl=2
25-
version = '0.1.3.0'
25+
version = '0.2.0.0'
26+
27+
params.metadata_json = "NO_FILE1"
28+
params.experiment_info_tsv = "NO_FILE2"
29+
params.read_group_info_tsv = "NO_FILE3"
30+
params.file_info_tsv = "NO_FILE4"
2631

27-
params.user_submit_metadata = ""
28-
params.wf_name = ""
29-
params.wf_short_name = ""
30-
params.wf_version = ""
3132
params.container_version = ''
3233

3334
// Generate a sequencing-experiment payload JSON from either a user-submitted
// metadata JSON or the three TSVs (experiment / read group / file).
// Placeholder inputs named "NO_FILE*" are treated as absent and their
// corresponding command-line flags are omitted.
process payloadGenSeqExperiment {
  container "quay.io/icgc-argo/payload-gen-seq-experiment:payload-gen-seq-experiment.${params.container_version ?: version}"

  input:
    path metadata_json
    path experiment_info_tsv
    path read_group_info_tsv
    path file_info_tsv

  output:
    path "*.sequencing_experiment.payload.json", emit: payload

  script:
    // build each flag only when a real file (not a NO_FILE placeholder) was supplied
    args_metadata_json = !metadata_json.name.startsWith("NO_FILE") ? "-m ${metadata_json}" : ""
    args_experiment_info_tsv = !experiment_info_tsv.name.startsWith("NO_FILE") ? "-x ${experiment_info_tsv}" : ""
    args_read_group_info_tsv = !read_group_info_tsv.name.startsWith("NO_FILE") ? "-r ${read_group_info_tsv}" : ""
    args_file_info_tsv = !file_info_tsv.name.startsWith("NO_FILE") ? "-f ${file_info_tsv}" : ""

    """
    payload-gen-seq-experiment.py \
      ${args_metadata_json} \
      ${args_experiment_info_tsv} \
      ${args_read_group_info_tsv} \
      ${args_file_info_tsv}
    """
}

tools/payload-gen-seq-experiment/payload-gen-seq-experiment.py

Lines changed: 156 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,31 @@
2222
"""
2323

2424

25+
import sys
2526
import uuid
2627
import json
28+
import csv
29+
import textwrap
2730
from argparse import ArgumentParser
2831

2932

33+
TSV_FIELDS = {
34+
'experiment': [
35+
'type', 'program_id', 'submitter_sequencing_experiment_id', 'submitter_donor_id', 'gender',
36+
'submitter_specimen_id', 'tumour_normal_designation', 'specimen_type', 'specimen_tissue_source', 'submitter_sample_id',
37+
'sample_type', 'submitter_matched_normal_sample_id', 'sequencing_center', 'platform', 'platform_model',
38+
'experimental_strategy', 'sequencing_date', 'read_group_count'
39+
],
40+
'read_group': [
41+
'type', 'submitter_read_group_id', 'read_group_id_in_bam', 'submitter_sequencing_experiment_id', 'platform_unit',
42+
'is_paired_end', 'file_r1', 'file_r2', 'read_length_r1', 'read_length_r2', 'insert_size', 'sample_barcode', 'library_name'
43+
],
44+
'file': [
45+
'type', 'name', 'size', 'md5sum', 'path', 'format'
46+
]
47+
}
48+
49+
3050
def empty_str_to_null(metadata):
3151
for k in metadata:
3252
if k in ['read_groups', 'files']:
@@ -36,10 +56,110 @@ def empty_str_to_null(metadata):
3656
metadata[k] = None
3757

3858

39-
def main(args):
40-
with open(args.user_submit_metadata, 'r') as f:
41-
metadata = json.load(f)
59+
def tsv_confomity_check(ftype, tsv):
    """Verify that a user-submitted TSV is well-formed for the given type.

    ftype: a key of TSV_FIELDS ('experiment', 'read_group' or 'file')
    tsv:   path to the TSV file to check

    Exits the program with a diagnostic on the first problem found:
    duplicated, missing or unexpected header fields; a data row whose value
    count does not match the expected field count; or a repeated data row.
    Fixes over the original: a blank line (e.g. a trailing newline at EOF)
    no longer aborts with a bogus field-count error, and a file with no
    header at all is now reported instead of silently passing.
    """
    expected_fields = TSV_FIELDS[ftype]

    header_processed = False
    with open(tsv, 'r') as t:
        uniq_row = set()  # data rows seen so far, for duplicate detection
        for l in t:
            l = l.rstrip('\n').rstrip('\r')  # strip newline and Windows '\r' (just in case)
            if not l:
                continue  # tolerate blank lines instead of failing the field-count check

            if not header_processed:  # first non-blank line is the header
                fields = l.split('\t')
                if len(fields) != len(set(fields)):
                    sys.exit("Error found: Field duplicated in input TSV: %s, offending header: %s\n" % (tsv, l))

                missed_fields = set(expected_fields) - set(fields)
                if missed_fields:  # missing fields
                    sys.exit("Error found: Field missing in input TSV: %s, offending header: %s. Missed field(s): %s\n" % \
                        (tsv, l, ', '.join(missed_fields)))

                unexpected_fields = set(fields) - set(expected_fields)
                if unexpected_fields:  # unexpected fields
                    sys.exit("Error found: Unexpected field in input TSV: %s, offending header: %s. Unexpected field(s): %s\n" % \
                        (tsv, l, ', '.join(unexpected_fields)))

                header_processed = True

            else:  # it's a data row
                # at this point we only check whether number of values matches number
                # of expected fields plus row uniqueness; later steps perform more
                # sophisticated content checks
                values = l.split('\t')
                if len(expected_fields) != len(values):
                    sys.exit("Error found: number of fields: %s does not match expected: %s, offending data row: %s\n" % \
                        (len(values), len(expected_fields), l))

                if l in uniq_row:
                    sys.exit("Error found: data row repeated in file: %s, offending data row: %s\n" % (tsv, l))
                uniq_row.add(l)

    if not header_processed:
        # empty file: the original silently accepted this
        sys.exit("Error found: input TSV has no header line: %s\n" % tsv)
97+
98+
def load_all_tsvs(exp_tsv, rg_tsv, file_tsv):
    """Load the three user-submitted TSVs into a single metadata dict.

    exp_tsv:  experiment TSV — must contain exactly one data row, whose
              columns are merged into the top level of the returned dict
              ('read_group_count' converted to int)
    rg_tsv:   read group TSV — each row becomes a dict under 'read_groups'
              ('is_paired_end' normalized to bool/None, length/size fields to int/None)
    file_tsv: file TSV — each row becomes a dict under 'files'
              ('size' converted to int, empty string to None)

    Exits with a diagnostic when the experiment TSV does not have exactly
    one row, or when the read group / file TSVs are empty.
    """
    metadata_dict = {}

    # experiment TSV: exactly one data row, merged into the top level
    with open(exp_tsv, 'r') as f:
        rows = list(csv.DictReader(f, delimiter='\t'))
        if len(rows) != 1:
            sys.exit("Error found: experiment TSV expects exactly one data row, offending file: %s has %s row(s)\n" % \
                (exp_tsv, len(rows)))
        rows[0]['read_group_count'] = int(rows[0]['read_group_count'])
        metadata_dict.update(rows[0])

    # read group TSV: one dict per data row
    metadata_dict['read_groups'] = []
    with open(rg_tsv, 'r') as f:
        for rg in csv.DictReader(f, delimiter='\t'):
            # normalize is_paired_end to a real boolean; anything other than
            # 'true'/'false' (case-insensitive), including empty, becomes None
            if rg['is_paired_end'].lower() == 'true':
                rg['is_paired_end'] = True
            elif rg['is_paired_end'].lower() == 'false':
                rg['is_paired_end'] = False
            else:
                rg['is_paired_end'] = None

            # numeric fields: int when present, None when empty
            for field in ('read_length_r1', 'read_length_r2', 'insert_size'):
                rg[field] = int(rg[field]) if rg[field] else None

            metadata_dict['read_groups'].append(rg)

    if len(metadata_dict['read_groups']) == 0:
        sys.exit("Error found: read group TSV does not contain any read group information\n")

    # file TSV: one dict per data row
    metadata_dict['files'] = []
    with open(file_tsv, 'r') as f:
        # fix: the original reused 'f' as the row variable, shadowing the
        # open file handle inside its own 'with' block
        for file_info in csv.DictReader(f, delimiter='\t'):
            file_info['size'] = int(file_info['size']) if file_info['size'] else None
            metadata_dict['files'].append(file_info)

    if len(metadata_dict['files']) == 0:
        sys.exit("Error found: file TSV does not contain any file information\n")

    return metadata_dict
143+
144+
145+
def validate_args(args):
    """Enforce the two mutually exclusive input modes.

    Either a single metadata JSON (-m) is given with no TSV arguments, or
    all three TSV arguments (-x, -r, -f) are given with no JSON. Returns
    True for a valid combination; exits with a usage message otherwise.
    """
    json_mode = bool(args.metadata_json)
    tsv_inputs = (args.experiment_info_tsv, args.read_group_info_tsv, args.file_info_tsv)

    if json_mode and not any(tsv_inputs):
        return True
    if not json_mode and all(tsv_inputs):
        return True

    sys.exit(textwrap.dedent(
        """
        Usage:
        When '-m' is provided, no other arguments can be used
        When '-m' is not provided, please provide all of these arguments: -x, -r and -f
        """
    ))
160+
161+
162+
def main(metadata):
43163
empty_str_to_null(metadata)
44164

45165
payload = {
@@ -89,7 +209,10 @@ def main(args):
89209
'fileMd5sum': input_file.get('md5sum'),
90210
'fileType': input_file.get('format'),
91211
'fileAccess': 'controlled',
92-
'dataType': 'submitted_reads'
212+
'dataType': 'Submitted Reads',
213+
'info': {
214+
'data_category': 'Sequencing Reads'
215+
}
93216
}
94217
)
95218

@@ -104,12 +227,35 @@ def main(args):
104227

105228
if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata-json",
                        help="json file containing experiment, read_group and file information submitted from user")
    parser.add_argument("-x", "--experiment-info-tsv",
                        help="tsv file containing experiment information submitted from user")
    parser.add_argument("-r", "--read-group-info-tsv",
                        help="tsv file containing read_group information submitted from user")
    parser.add_argument("-f", "--file-info-tsv",
                        help="tsv file containing file information submitted from user")
    args = parser.parse_args()

    # exits with a usage message unless exactly one input mode is selected
    validate_args(args)

    if args.metadata_json:
        with open(args.metadata_json, 'r') as f:
            metadata = json.load(f)
    else:
        # firstly TSV format conformity check, if not well-formed no point to continue
        tsv_confomity_check('experiment', args.experiment_info_tsv)
        tsv_confomity_check('read_group', args.read_group_info_tsv)
        tsv_confomity_check('file', args.file_info_tsv)

        # all TSVs are well-formed, load them (fix: the original called
        # load_all_tsvs twice in a row, re-reading all three files and
        # discarding the first result)
        metadata = load_all_tsvs(
            args.experiment_info_tsv,
            args.read_group_info_tsv,
            args.file_info_tsv
        )

    main(metadata)

tools/payload-gen-seq-experiment/tests/checker.nf

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,20 +23,19 @@
2323

2424
nextflow.enable.dsl=2
2525

26-
params.user_submit_metadata = ""
27-
params.wf_name = ""
28-
params.wf_short_name = ""
29-
params.wf_version = ""
26+
params.metadata_json = "NO_FILE1"
27+
params.experiment_info_tsv = "NO_FILE2"
28+
params.read_group_info_tsv = "NO_FILE3"
29+
params.file_info_tsv = "NO_FILE4"
3030

3131
include {payloadGenSeqExperiment} from "../payload-gen-seq-experiment" params(params)
3232

3333
// Test entry point: forward the four (possibly NO_FILE placeholder) inputs
// straight into the payload generation process.
workflow {
  main:
    payloadGenSeqExperiment(
      file(params.metadata_json),
      file(params.experiment_info_tsv),
      file(params.read_group_info_tsv),
      file(params.file_info_tsv)
    )
}
Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
11
{
2-
"user_submit_metadata": "data/seq-exp.bam.metadata.json",
3-
"wf_name": "sequencing-data-submission",
4-
"wf_short_name": "seq-submission",
5-
"wf_version": "0.1.0.0"
2+
"metadata_json": "data/seq-exp.bam.metadata.json"
63
}

0 commit comments

Comments
 (0)