11#!/usr/bin/env python3
22
3+
34"""
45 Copyright (c) 2019-2021, Ontario Institute for Cancer Research (OICR).
56
1920 Authors:
2021 Linda Xiang <linda.xiang@oicr.on.ca>
2122 Junjun Zhang <junjun.zhang@oicr.on.ca>
23+ Edmund Su <edmund.su@oicr.on.ca>
2224 """
2325
2426
2729import json
2830import csv
2931import textwrap
30- from argparse import ArgumentParser
31-
32-
# Expected column layouts for each user-supplied TSV.
# 'core' columns are mandatory (tsv_confomity_check exits if any is absent);
# 'conditional' columns are optional extras — RNA-Seq library details for
# experiments, EGA-style accession columns for files.
TSV_FIELDS = {
    'experiment': {
        'core': [
            'type', 'program_id', 'submitter_sequencing_experiment_id', 'submitter_donor_id', 'gender',
            'submitter_specimen_id', 'tumour_normal_designation', 'specimen_type', 'specimen_tissue_source',
            'submitter_sample_id', 'sample_type', 'submitter_matched_normal_sample_id', 'sequencing_center',
            'platform', 'platform_model', 'experimental_strategy', 'sequencing_date', 'read_group_count',
        ],
        'conditional': [
            'library_isolation_protocol', 'library_preparation_kit',
            'library_strandedness', 'rin', 'dv200', 'spike_ins_included',
            'spike_ins_fasta', 'spike_ins_concentration',
            'target_capture_kit',
        ],
    },
    'read_group': {
        'core': [
            'type', 'submitter_read_group_id', 'read_group_id_in_bam', 'submitter_sequencing_experiment_id',
            'platform_unit', 'is_paired_end', 'file_r1', 'file_r2', 'read_length_r1', 'read_length_r2',
            'insert_size', 'sample_barcode', 'library_name',
        ],
        'conditional': [],
    },
    'file': {
        'core': ['type', 'name', 'size', 'md5sum', 'path', 'format'],
        'conditional': [
            'EGAS', 'EGAC', 'EGAP', 'EGAN', 'EGAR',
            'EGAX', 'EGAZ', 'EGAD', 'EGAB', 'EGAF',
        ],
    },
}
4965
5066def empty_str_to_null (metadata ):
@@ -57,7 +73,9 @@ def empty_str_to_null(metadata):
5773
5874
5975def tsv_confomity_check (ftype , tsv ):
60- expected_fields = TSV_FIELDS [ftype ]
76+ core_fields = TSV_FIELDS [ftype ]['core' ]
77+ conditional_fields = TSV_FIELDS [ftype ]['conditional' ]
78+ expected_fields = core_fields + conditional_fields
6179
6280 header_processed = False
6381 with open (tsv , 'r' ) as t :
@@ -69,7 +87,7 @@ def tsv_confomity_check(ftype, tsv):
6987 if len (fields ) != len (set (fields )):
7088 sys .exit ("Error found: Field duplicated in input TSV: %s, offending header: %s\n " % (tsv , l ))
7189
72- missed_fields = set (expected_fields ) - set (fields )
90+ missed_fields = set (core_fields ) - set (fields )
7391 if missed_fields : # missing fields
7492 sys .exit ("Error found: Field missing in input TSV: %s, offending header: %s. Missed field(s): %s\n " % \
7593 (tsv , l , ', ' .join (missed_fields )))
@@ -85,7 +103,7 @@ def tsv_confomity_check(ftype, tsv):
85103 # at this point we only check whether number of values matches number of expected fields and uniqueness check,
86104 # later steps will perform more sophisticated content check
87105 values = l .split ('\t ' )
88- if len (expected_fields ) != len (values ):
106+ if len (values ) < len (core_fields ):
89107 sys .exit ("Error found: number of fields: %s does not match expected: %s, offending data row: %s\n " % \
90108 (len (values ), len (expected_fields ), l ))
91109
@@ -155,9 +173,29 @@ def validate_args(args):
155173 Usage:
156174 When '-m' is provided, no other arguments can be used
157175 When '-m' is not provided, please provide all of these arguments: -x, -r and -f
176+ Optionally '-s' a schema URL can be provided, which the payload will be validated against
158177 """
159178 ))
160179
def validatePayload(payload, args):
    """Validate the assembled payload against a SONG JSON schema.

    Fetches the schema from ``args.schema_url`` (falling back to the default
    sequencing_experiment schema URL) and runs jsonschema validation of
    ``payload`` against it.

    Exits the program with an error message if the schema cannot be retrieved
    or if the payload does not conform; returns True on success.
    """
    # Default to the production sequencing_experiment schema unless the
    # caller supplied one via -s/--schema-url.
    if args.schema_url:
        url = args.schema_url
    else:
        url = "https://submission-song.rdpc.cancercollaboratory.org/schemas/sequencing_experiment"

    try:
        resp = requests.get(url)
    except requests.exceptions.RequestException as err:
        # Network-level failure (DNS, connection refused, timeout, ...):
        # exit cleanly instead of dumping a traceback at the user.
        print(err)
        sys.exit("Unable to retrieve schema. Please check URL\n")

    if resp.status_code != 200:  # clearer than 'not ... == 200'
        sys.exit("Unable to retrieve schema. Please check URL\n")

    try:
        # SONG responses wrap the actual JSON Schema under the 'schema' key.
        jsonschema.validate(instance=payload, schema=resp.json()['schema'])
    except jsonschema.exceptions.ValidationError as err:
        print(err)
        sys.exit("Payload failed to validate against schema\n")

    return True
198+
161199
162200def main (metadata , extra_info = dict ()):
163201 empty_str_to_null (metadata )
@@ -181,6 +219,25 @@ def main(metadata, extra_info=dict()):
181219 'files' : []
182220 }
183221
222+ # optional experiment arguements
223+ # Strings
224+ optional_experimental_fields = [
225+ "library_isolation_protocol" ,"library_preparation_kit" ,
226+ "library_strandedness" ,"dv200" ,"spike_ins_included" ,
227+ "spike_ins_fasta" ,"spike_ins_concentration" ,"sequencing_center" ]
228+ for optional_experimental_field in optional_experimental_fields :
229+ if metadata .get (optional_experimental_field ):
230+ payload ['experiment' ][optional_experimental_field ]= metadata .get (optional_experimental_field )
231+ # Int
232+ optional_experimental_fields = ["rin" ]
233+ for optional_experimental_field in optional_experimental_fields :
234+ if metadata .get (optional_experimental_field ):
235+ payload ['experiment' ][optional_experimental_field ]= int (metadata .get (optional_experimental_field ))
236+
237+ # RNA-seq library_Strandedness requirement check
238+ if metadata .get ('experimental_strategy' )== 'RNA-Seq' and not metadata .get ("library_strandedness" ):
239+ sys .exit (f"'experimental_strategy' 'RNA-Seq' specified but 'library_strandedness' is missing. Resubmit with both values 'experimental_strategy' and 'library_strandedness'" )
240+
184241 # get sample of the payload
185242 sample = {
186243 'submitterSampleId' : metadata .get ('submitter_sample_id' ),
@@ -198,27 +255,11 @@ def main(metadata, extra_info=dict()):
198255 }
199256 }
200257
201- if extra_info :
202- if extra_info ['sample' ].get (sample ['submitterSampleId' ]):
203- sample ['sampleId' ] = extra_info ['sample' ][sample ['submitterSampleId' ]]
204- else :
205- sys .exit (f"Provided extra_info_tsv misses mapping for submitter sample ID: { sample ['submitterSampleId' ]} " )
206-
207- if extra_info ['specimen' ].get (sample ['specimen' ]['submitterSpecimenId' ]):
208- sample ['specimenId' ] = extra_info ['specimen' ][sample ['specimen' ]['submitterSpecimenId' ]]
209- sample ['specimen' ]['specimenId' ] = sample ["specimenId" ]
210- else :
211- sys .exit (f"Provided extra_info_tsv misses mapping for submitter specimen ID: { sample ['specimen' ]['submitterSpecimenId' ]} " )
212-
213- if extra_info ['donor' ].get (sample ['donor' ]['submitterDonorId' ]):
214- sample ['donor' ]['donorId' ] = extra_info ['donor' ][sample ['donor' ]['submitterDonorId' ]]
215- sample ['specimen' ]['donorId' ] = sample ['donor' ]['donorId' ]
216- else :
217- sys .exit (f"Provided extra_info_tsv misses mapping for submitter donor ID: { sample ['donor' ]['submitterDonorId' ]} " )
218-
219258 payload ['samples' ].append (sample )
220259
221260 # get file of the payload
261+
262+ optional_file_fields = ["EGAS" ,"EGAC" ,"EGAP" ,"EGAN" ,"EGAR" ,"EGAX" ,"EGAZ" ,"EGAD" ,"EGAB" ,"EGAF" ]
222263 for input_file in metadata .get ("files" ):
223264 payload ['files' ].append (
224265 {
@@ -233,18 +274,56 @@ def main(metadata, extra_info=dict()):
233274 }
234275 }
235276 )
277+ for optional_file_field in optional_file_fields :
278+ if input_file .get (optional_file_field ):
279+ payload ['files' ][- 1 ][optional_file_field ]= input_file .get (optional_file_field )
236280
237281 for rg in metadata .get ("read_groups" ):
238282 rg .pop ('type' ) # remove 'type' field
239283 rg .pop ('submitter_sequencing_experiment_id' ) # remove 'submitter_sequencing_experiment_id' field
240284 payload ['read_groups' ].append (rg )
241285
286+
287+ if extra_info :
288+ for item ,dict_to_update ,submitter_id in zip (
289+ ["sample" ,"donor" ,"specimen" ,"experiment" ],
290+ [payload ['samples' ][0 ],payload ['samples' ][0 ]['donor' ],payload ['samples' ][0 ]['specimen' ],payload ['experiment' ]],
291+ ["submitterSampleId" ,"submitterDonorId" ,"submitterSpecimenId" ,"submitter_sequencing_experiment_id" ]
292+ ):
293+ if not item in extra_info :
294+ continue
295+ for key in extra_info [item ][dict_to_update .get (submitter_id )].keys () :
296+ if key in dict_to_update :
297+ sys .exit (f"Conflicting entries detected. Attempted altering of existing field { key } in { item } " )
298+ if extra_info [item ][dict_to_update .get (submitter_id )]:
299+ dict_to_update .update (extra_info [item ][dict_to_update .get (submitter_id )])
300+
301+ for item ,list_to_parse ,unique_ele_name in zip (
302+ ["files" ,"read_groups" ],
303+ [payload ["files" ],payload ['read_groups' ]],
304+ ["fileName" ,"submitter_read_group_id" ]
305+ ):
306+ if not item in extra_info :
307+ continue
308+ for ele_to_update in extra_info [item ].keys ():
309+ for existing_ele in list_to_parse :
310+ if existing_ele [unique_ele_name ]!= ele_to_update :
311+ continue
312+ for key in extra_info [item ][ele_to_update ].keys ():
313+ if key in existing_ele :
314+ sys .exit (f"Conflicting entries detected. Attempted altering of existing field { key } in { existing_ele } " )
315+ if item == 'files' :
316+ existing_ele ['info' ].update (extra_info [item ][ele_to_update ])
317+ else :
318+ existing_ele .update (extra_info [item ][ele_to_update ])
319+
320+ validatePayload (payload ,args )
242321 with open ("%s.sequencing_experiment.payload.json" % str (uuid .uuid4 ()), 'w' ) as f :
243322 f .write (json .dumps (payload , indent = 2 ))
244323
245324
246325if __name__ == "__main__" :
247- parser = ArgumentParser ()
326+ parser = argparse . ArgumentParser ()
248327 parser .add_argument ("-m" , "--metadata-json" ,
249328 help = "json file containing experiment, read_group and file information submitted from user" )
250329 parser .add_argument ("-x" , "--experiment-info-tsv" ,
@@ -254,7 +333,9 @@ def main(metadata, extra_info=dict()):
254333 parser .add_argument ("-f" , "--file-info-tsv" ,
255334 help = "tsv file containing file information submitted from user" )
256335 parser .add_argument ("-e" , "--extra-info-tsv" ,
257- help = "tsv file containing file information submitted from user" )
336+ help = "tsv file containing additional information pertaining to existing experiment, read_group, and file information submitted from user that does not fit within existing schemas" )
337+ parser .add_argument ("-s" , "--schema-url" ,
338+ help = "URL to validate schema against" )
258339 args = parser .parse_args ()
259340
260341 validate_args (args )
@@ -263,7 +344,7 @@ def main(metadata, extra_info=dict()):
263344 with open (args .metadata_json , 'r' ) as f :
264345 metadata = json .load (f )
265346 else :
266- # fistly TSV format conformity check, if not well-formed no point to continue
347+ # firstly TSV format conformity check, if not well-formed no point to continue
267348 tsv_confomity_check ('experiment' , args .experiment_info_tsv )
268349 tsv_confomity_check ('read_group' , args .read_group_info_tsv )
269350 tsv_confomity_check ('file' , args .file_info_tsv )
@@ -275,27 +356,28 @@ def main(metadata, extra_info=dict()):
275356 args .file_info_tsv
276357 )
277358
278- # all TSV are well-formed, let's load them
279- metadata = load_all_tsvs (args .experiment_info_tsv , args .read_group_info_tsv , args .file_info_tsv )
280-
281359 extra_info = dict ()
282360 if args .extra_info_tsv :
283361 with open (args .extra_info_tsv , 'r' ) as f :
284362 for row in csv .DictReader (f , delimiter = '\t ' ):
285- type = row ['type' ]
286- submitter_id = row ['submitter_id' ]
287- uniform_id = row ['uniform_id' ]
288- if type in extra_info :
289- sys .exit (f"Values in 'type' field duplicated. Offending value: { type } , in file: { args .extra_info_tsv } " )
290- else :
291- extra_info [type ] = dict ()
292-
293- if submitter_id in extra_info [type ]:
294- sys .exit (f"Values in 'submitter_id' field duplicated. Offending value: { submitter_id } , for type: { type } , in file: { args .extra_info_tsv } " )
295- else :
296- extra_info [type ][submitter_id ] = uniform_id
297-
298- if 'donor' not in extra_info or 'specimen' not in extra_info or 'sample' not in extra_info :
299- sys .exit (f"Provided extra_info_tsv file '{ args .extra_info_tsv } ' is required to have ID mappings for 'donor', 'specimen' and 'sample'" )
363+
364+ for row_type in ['type' ,'submitter_id' ,'submitter_field' ,'field_value' ]:
365+ if row_type not in row .keys ():
366+ sys .exit (f"Incorrect formatting of : { args .extra_info_tsv } . { row_type } is missing" )
367+
368+ row_type = row ['type' ]
369+ row_id = row ['submitter_id' ]
370+ row_field = row ['submitter_field' ]
371+ row_val = row ['field_value' ]
372+
373+ if (row_type != "sample" ) and (row_type != "donor" ) and (row_type != "specimen" ) and (row_type != "files" ) and (row_type != "experiment" ):
374+ sys .exit (f"Incorrect identifier supplied. Must be on the following : 'sample','donor','specimen','files','experiments'. Offending value: { type } , in file: { args .extra_info_tsv } " )
375+
376+ if row_type not in extra_info :
377+ extra_info [row_type ]= dict ()
378+ if row_id not in extra_info [row_type ]:
379+ extra_info [row_type ][row_id ]= dict ()
380+ extra_info [row_type ][row_id ][row_field ]= row_val
381+
300382
301383 main (metadata , extra_info )
0 commit comments