Skip to content

Commit d0d3abb

Browse files
authored
Merge pull request #154 from icgc-argo-workflows/payload-gen-seq-experiment@0.6.0.1
[release]
2 parents 29a0fd6 + d38ce8e commit d0d3abb

17 files changed

Lines changed: 353 additions & 75 deletions

payload-gen-seq-experiment/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ RUN apt-get update && apt-get install -y procps
44

55
LABEL org.opencontainers.image.source https://github.com/icgc-argo-workflows/data-processing-utility-tools
66

7+
RUN pip install requests && \
8+
pip install jsonschema
9+
710
RUN groupadd -g 1000 ubuntu &&\
811
useradd -l -u 1000 -g ubuntu ubuntu &&\
912
install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu

payload-gen-seq-experiment/main.nf

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
/* this block is auto-generated based on info from pkg.json where */
2626
/* changes can be made if needed, do NOT modify this block manually */
2727
nextflow.enable.dsl = 2
28-
version = '0.5.0.1'
28+
version = '0.6.0.1'
2929

3030
container = [
3131
'ghcr.io': 'ghcr.io/icgc-argo-workflows/data-processing-utility-tools.payload-gen-seq-experiment'
@@ -49,7 +49,7 @@ params.experiment_info_tsv = "NO_FILE1"
4949
params.read_group_info_tsv = "NO_FILE2"
5050
params.file_info_tsv = "NO_FILE3"
5151
params.extra_info_tsv = "NO_FILE4"
52-
52+
params.schema_url="NO_FILE5"
5353

5454
process payloadGenSeqExperiment {
5555
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
@@ -63,6 +63,7 @@ process payloadGenSeqExperiment {
6363
path read_group_info_tsv
6464
path file_info_tsv
6565
path extra_info_tsv
66+
val schema_url
6667

6768
output:
6869
path "*.sequencing_experiment.payload.json", emit: payload
@@ -72,13 +73,14 @@ process payloadGenSeqExperiment {
7273
args_read_group_info_tsv = !read_group_info_tsv.name.startsWith("NO_FILE") ? "-r ${read_group_info_tsv}" : ""
7374
args_file_info_tsv = !file_info_tsv.name.startsWith("NO_FILE") ? "-f ${file_info_tsv}" : ""
7475
args_extra_info_tsv = !extra_info_tsv.name.startsWith("NO_FILE") ? "-e ${extra_info_tsv}" : ""
75-
76+
args_schema_url = !schema_url.startsWith("NO_FILE") ? "-s ${schema_url}" : ""
7677
"""
7778
main.py \
7879
${args_experiment_info_tsv} \
7980
${args_read_group_info_tsv} \
8081
${args_file_info_tsv} \
81-
${args_extra_info_tsv}
82+
${args_extra_info_tsv} \
83+
${args_schema_url}
8284
"""
8385
}
8486

@@ -90,6 +92,7 @@ workflow {
9092
file(params.experiment_info_tsv),
9193
file(params.read_group_info_tsv),
9294
file(params.file_info_tsv),
93-
file(params.extra_info_tsv)
95+
file(params.extra_info_tsv),
96+
params.schema_url
9497
)
9598
}

payload-gen-seq-experiment/main.py

Lines changed: 141 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env python3
22

3+
34
"""
45
Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
56
@@ -19,6 +20,7 @@
1920
Authors:
2021
Linda Xiang <linda.xiang@oicr.on.ca>
2122
Junjun Zhang <junjun.zhang@oicr.on.ca>
23+
Edmund Su <edmund.su@oicr.on.ca>
2224
"""
2325

2426

@@ -27,24 +29,38 @@
2729
import json
2830
import csv
2931
import textwrap
30-
from argparse import ArgumentParser
31-
32-
33-
TSV_FIELDS = {
34-
'experiment': [
35-
'type', 'program_id', 'submitter_sequencing_experiment_id', 'submitter_donor_id', 'gender',
36-
'submitter_specimen_id', 'tumour_normal_designation', 'specimen_type', 'specimen_tissue_source', 'submitter_sample_id',
37-
'sample_type', 'submitter_matched_normal_sample_id', 'sequencing_center', 'platform', 'platform_model',
38-
'experimental_strategy', 'sequencing_date', 'read_group_count'
39-
],
40-
'read_group': [
41-
'type', 'submitter_read_group_id', 'read_group_id_in_bam', 'submitter_sequencing_experiment_id', 'platform_unit',
42-
'is_paired_end', 'file_r1', 'file_r2', 'read_length_r1', 'read_length_r2', 'insert_size', 'sample_barcode', 'library_name'
43-
],
44-
'file': [
45-
'type', 'name', 'size', 'md5sum', 'path', 'format'
32+
import argparse
33+
import requests
34+
import re
35+
import jsonschema
36+
37+
38+
TSV_FIELDS = {}
39+
40+
TSV_FIELDS['experiment'] = {}
41+
TSV_FIELDS['experiment']['core']=[
42+
'type', 'program_id', 'submitter_sequencing_experiment_id', 'submitter_donor_id', 'gender',
43+
'submitter_specimen_id', 'tumour_normal_designation', 'specimen_type', 'specimen_tissue_source',
44+
'submitter_sample_id','sample_type', 'submitter_matched_normal_sample_id', 'sequencing_center',
45+
'platform', 'platform_model','experimental_strategy', 'sequencing_date', 'read_group_count']
46+
TSV_FIELDS['experiment']["conditional"]=[
47+
"library_isolation_protocol","library_preparation_kit",
48+
"library_strandedness","rin","dv200","spike_ins_included",
49+
"spike_ins_fasta","spike_ins_concentration",
50+
"target_capture_kit"]
51+
52+
TSV_FIELDS['read_group']= {}
53+
TSV_FIELDS['read_group']["core"]=[
54+
'type', 'submitter_read_group_id', 'read_group_id_in_bam', 'submitter_sequencing_experiment_id', 'platform_unit',
55+
'is_paired_end', 'file_r1', 'file_r2', 'read_length_r1', 'read_length_r2', 'insert_size', 'sample_barcode', 'library_name'
4656
]
47-
}
57+
TSV_FIELDS['read_group']["conditional"]=[]
58+
59+
TSV_FIELDS['file']={}
60+
TSV_FIELDS['file']["core"]=['type', 'name', 'size', 'md5sum', 'path', 'format']
61+
TSV_FIELDS['file']["conditional"]=["EGAS","EGAC","EGAP","EGAN","EGAR","EGAX","EGAZ","EGAD","EGAB","EGAF"]
62+
63+
4864

4965

5066
def empty_str_to_null(metadata):
@@ -57,7 +73,9 @@ def empty_str_to_null(metadata):
5773

5874

5975
def tsv_confomity_check(ftype, tsv):
60-
expected_fields = TSV_FIELDS[ftype]
76+
core_fields = TSV_FIELDS[ftype]['core']
77+
conditional_fields = TSV_FIELDS[ftype]['conditional']
78+
expected_fields=core_fields+conditional_fields
6179

6280
header_processed = False
6381
with open(tsv, 'r') as t:
@@ -69,7 +87,7 @@ def tsv_confomity_check(ftype, tsv):
6987
if len(fields) != len(set(fields)):
7088
sys.exit("Error found: Field duplicated in input TSV: %s, offending header: %s\n" % (tsv, l))
7189

72-
missed_fields = set(expected_fields) - set(fields)
90+
missed_fields = set(core_fields) - set(fields)
7391
if missed_fields: # missing fields
7492
sys.exit("Error found: Field missing in input TSV: %s, offending header: %s. Missed field(s): %s\n" % \
7593
(tsv, l, ', '.join(missed_fields)))
@@ -85,7 +103,7 @@ def tsv_confomity_check(ftype, tsv):
85103
# at this point we only check whether number of values matches number of expected fields and uniqueness check,
86104
# later steps will perform more sophisticated content check
87105
values = l.split('\t')
88-
if len(expected_fields) != len(values):
106+
if len(values) < len(core_fields):
89107
sys.exit("Error found: number of fields: %s does not match expected: %s, offending data row: %s\n" % \
90108
(len(values), len(expected_fields), l))
91109

@@ -155,9 +173,29 @@ def validate_args(args):
155173
Usage:
156174
When '-m' is provided, no other arguments can be used
157175
When '-m' is not provided, please provide all of these arguments: -x, -r and -f
176+
Optionally '-s' a schema URL can be provided, which the payload will be validated against
158177
"""
159178
))
160179

180+
def validatePayload(payload,args):
    """Validate the generated payload against a JSON schema fetched over HTTP.

    The schema document is downloaded from ``args.schema_url`` when that is
    set, otherwise from the default sequencing_experiment schema URL. The
    response body is expected to be JSON with the schema under its
    ``'schema'`` key.

    Args:
        payload: the payload dict to validate.
        args: parsed command-line arguments; only ``schema_url`` is read.

    Returns:
        True when the payload validates.

    Exits (via ``sys.exit``) when the schema cannot be retrieved or parsed,
    when the schema itself is invalid, or when the payload fails validation.
    """
    default_url = "https://submission-song.rdpc.cancercollaboratory.org/schemas/sequencing_experiment"
    url = args.schema_url if args.schema_url else default_url

    # A timeout keeps the pipeline from hanging forever on an unresponsive
    # server; connection-level failures are reported like a bad status code
    # instead of surfacing as an uncaught traceback.
    try:
        resp = requests.get(url, timeout=60)
    except requests.exceptions.RequestException as err:
        print(err)
        sys.exit("Unable to retrieve schema. Please check URL\n")

    if resp.status_code != 200:
        sys.exit("Unable to retrieve schema. Please check URL\n")

    # The body must be a JSON object carrying the schema under 'schema'.
    try:
        schema = resp.json()['schema']
    except (ValueError, KeyError, TypeError) as err:
        print(err)
        sys.exit("Retrieved document is not JSON or has no 'schema' entry. Please check URL\n")

    try:
        jsonschema.validate(instance=payload, schema=schema)
    except jsonschema.exceptions.SchemaError as err:
        # jsonschema checks the schema itself before validating the instance.
        print(err)
        sys.exit("Retrieved schema is not a valid JSON schema. Please check URL\n")
    except jsonschema.exceptions.ValidationError as err:
        print(err)
        sys.exit("Payload failed to validate against schema\n")

    return True
197+
198+
161199

162200
def main(metadata, extra_info=dict()):
163201
empty_str_to_null(metadata)
@@ -181,6 +219,25 @@ def main(metadata, extra_info=dict()):
181219
'files': []
182220
}
183221

222+
# optional experiment arguments
223+
# Strings
224+
optional_experimental_fields=[
225+
"library_isolation_protocol","library_preparation_kit",
226+
"library_strandedness","dv200","spike_ins_included",
227+
"spike_ins_fasta","spike_ins_concentration","sequencing_center"]
228+
for optional_experimental_field in optional_experimental_fields:
229+
if metadata.get(optional_experimental_field):
230+
payload['experiment'][optional_experimental_field]=metadata.get(optional_experimental_field)
231+
# Int
232+
optional_experimental_fields=["rin"]
233+
for optional_experimental_field in optional_experimental_fields:
234+
if metadata.get(optional_experimental_field):
235+
payload['experiment'][optional_experimental_field]=int(metadata.get(optional_experimental_field))
236+
237+
# RNA-Seq library_strandedness requirement check
238+
if metadata.get('experimental_strategy')=='RNA-Seq' and not metadata.get("library_strandedness"):
239+
sys.exit(f"'experimental_strategy' 'RNA-Seq' specified but 'library_strandedness' is missing. Resubmit with both values 'experimental_strategy' and 'library_strandedness'")
240+
184241
# get sample of the payload
185242
sample = {
186243
'submitterSampleId': metadata.get('submitter_sample_id'),
@@ -198,27 +255,11 @@ def main(metadata, extra_info=dict()):
198255
}
199256
}
200257

201-
if extra_info:
202-
if extra_info['sample'].get(sample['submitterSampleId']):
203-
sample['sampleId'] = extra_info['sample'][sample['submitterSampleId']]
204-
else:
205-
sys.exit(f"Provided extra_info_tsv misses mapping for submitter sample ID: {sample['submitterSampleId']}")
206-
207-
if extra_info['specimen'].get(sample['specimen']['submitterSpecimenId']):
208-
sample['specimenId'] = extra_info['specimen'][sample['specimen']['submitterSpecimenId']]
209-
sample['specimen']['specimenId'] = sample["specimenId"]
210-
else:
211-
sys.exit(f"Provided extra_info_tsv misses mapping for submitter specimen ID: {sample['specimen']['submitterSpecimenId']}")
212-
213-
if extra_info['donor'].get(sample['donor']['submitterDonorId']):
214-
sample['donor']['donorId'] = extra_info['donor'][sample['donor']['submitterDonorId']]
215-
sample['specimen']['donorId'] = sample['donor']['donorId']
216-
else:
217-
sys.exit(f"Provided extra_info_tsv misses mapping for submitter donor ID: {sample['donor']['submitterDonorId']}")
218-
219258
payload['samples'].append(sample)
220259

221260
# get file of the payload
261+
262+
optional_file_fields=["EGAS","EGAC","EGAP","EGAN","EGAR","EGAX","EGAZ","EGAD","EGAB","EGAF"]
222263
for input_file in metadata.get("files"):
223264
payload['files'].append(
224265
{
@@ -233,18 +274,56 @@ def main(metadata, extra_info=dict()):
233274
}
234275
}
235276
)
277+
for optional_file_field in optional_file_fields:
278+
if input_file.get(optional_file_field):
279+
payload['files'][-1][optional_file_field]=input_file.get(optional_file_field)
236280

237281
for rg in metadata.get("read_groups"):
238282
rg.pop('type') # remove 'type' field
239283
rg.pop('submitter_sequencing_experiment_id') # remove 'submitter_sequencing_experiment_id' field
240284
payload['read_groups'].append(rg)
241285

286+
287+
if extra_info:
288+
for item,dict_to_update,submitter_id in zip(
289+
["sample","donor","specimen","experiment"],
290+
[payload['samples'][0],payload['samples'][0]['donor'],payload['samples'][0]['specimen'],payload['experiment']],
291+
["submitterSampleId","submitterDonorId","submitterSpecimenId","submitter_sequencing_experiment_id"]
292+
):
293+
if not item in extra_info:
294+
continue
295+
for key in extra_info[item][dict_to_update.get(submitter_id)].keys() :
296+
if key in dict_to_update:
297+
sys.exit(f"Conflicting entries detected. Attempted altering of existing field {key} in {item}")
298+
if extra_info[item][dict_to_update.get(submitter_id)]:
299+
dict_to_update.update(extra_info[item][dict_to_update.get(submitter_id)])
300+
301+
for item,list_to_parse,unique_ele_name in zip(
302+
["files","read_groups"],
303+
[payload["files"],payload['read_groups']],
304+
["fileName","submitter_read_group_id"]
305+
):
306+
if not item in extra_info:
307+
continue
308+
for ele_to_update in extra_info[item].keys():
309+
for existing_ele in list_to_parse:
310+
if existing_ele[unique_ele_name]!=ele_to_update:
311+
continue
312+
for key in extra_info[item][ele_to_update].keys():
313+
if key in existing_ele:
314+
sys.exit(f"Conflicting entries detected. Attempted altering of existing field {key} in {existing_ele}")
315+
if item=='files':
316+
existing_ele['info'].update(extra_info[item][ele_to_update])
317+
else:
318+
existing_ele.update(extra_info[item][ele_to_update])
319+
320+
validatePayload(payload,args)
242321
with open("%s.sequencing_experiment.payload.json" % str(uuid.uuid4()), 'w') as f:
243322
f.write(json.dumps(payload, indent=2))
244323

245324

246325
if __name__ == "__main__":
247-
parser = ArgumentParser()
326+
parser = argparse.ArgumentParser()
248327
parser.add_argument("-m", "--metadata-json",
249328
help="json file containing experiment, read_group and file information submitted from user")
250329
parser.add_argument("-x", "--experiment-info-tsv",
@@ -254,7 +333,9 @@ def main(metadata, extra_info=dict()):
254333
parser.add_argument("-f", "--file-info-tsv",
255334
help="tsv file containing file information submitted from user")
256335
parser.add_argument("-e", "--extra-info-tsv",
257-
help="tsv file containing file information submitted from user")
336+
help="tsv file containing additional information pertaining to existing experiment, read_group, and file information submitted from user that does not fit within existing schemas")
337+
parser.add_argument("-s", "--schema-url",
338+
help="URL to validate schema against")
258339
args = parser.parse_args()
259340

260341
validate_args(args)
@@ -263,7 +344,7 @@ def main(metadata, extra_info=dict()):
263344
with open(args.metadata_json, 'r') as f:
264345
metadata = json.load(f)
265346
else:
266-
# fistly TSV format conformity check, if not well-formed no point to continue
347+
# firstly TSV format conformity check, if not well-formed no point to continue
267348
tsv_confomity_check('experiment', args.experiment_info_tsv)
268349
tsv_confomity_check('read_group', args.read_group_info_tsv)
269350
tsv_confomity_check('file', args.file_info_tsv)
@@ -275,27 +356,28 @@ def main(metadata, extra_info=dict()):
275356
args.file_info_tsv
276357
)
277358

278-
# all TSV are well-formed, let's load them
279-
metadata = load_all_tsvs(args.experiment_info_tsv, args.read_group_info_tsv, args.file_info_tsv)
280-
281359
extra_info = dict()
282360
if args.extra_info_tsv:
283361
with open(args.extra_info_tsv, 'r') as f:
284362
for row in csv.DictReader(f, delimiter='\t'):
285-
type = row['type']
286-
submitter_id = row['submitter_id']
287-
uniform_id = row['uniform_id']
288-
if type in extra_info:
289-
sys.exit(f"Values in 'type' field duplicated. Offending value: {type}, in file: {args.extra_info_tsv}")
290-
else:
291-
extra_info[type] = dict()
292-
293-
if submitter_id in extra_info[type]:
294-
sys.exit(f"Values in 'submitter_id' field duplicated. Offending value: {submitter_id}, for type: {type}, in file: {args.extra_info_tsv}" )
295-
else:
296-
extra_info[type][submitter_id] = uniform_id
297-
298-
if 'donor' not in extra_info or 'specimen' not in extra_info or 'sample' not in extra_info:
299-
sys.exit(f"Provided extra_info_tsv file '{args.extra_info_tsv}' is required to have ID mappings for 'donor', 'specimen' and 'sample'")
363+
364+
for row_type in ['type','submitter_id','submitter_field','field_value']:
365+
if row_type not in row.keys():
366+
sys.exit(f"Incorrect formatting of : {args.extra_info_tsv}. {row_type} is missing")
367+
368+
row_type = row['type']
369+
row_id= row['submitter_id']
370+
row_field= row['submitter_field']
371+
row_val= row['field_value']
372+
373+
if (row_type!="sample") and (row_type!="donor") and (row_type!="specimen") and (row_type!="files") and (row_type!="experiment"):
374+
sys.exit(f"Incorrect identifier supplied. Must be on the following : 'sample','donor','specimen','files','experiments'. Offending value: {type}, in file: {args.extra_info_tsv}")
375+
376+
if row_type not in extra_info:
377+
extra_info[row_type]=dict()
378+
if row_id not in extra_info[row_type]:
379+
extra_info[row_type][row_id]=dict()
380+
extra_info[row_type][row_id][row_field]=row_val
381+
300382

301383
main(metadata, extra_info)

payload-gen-seq-experiment/pkg.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "payload-gen-seq-experiment",
3-
"version": "0.5.0.1",
3+
"version": "0.6.0.1",
44
"description": "SONG payload generation for sequencing experiment",
55
"main": "main.nf",
66
"deprecated": false,

0 commit comments

Comments
 (0)