2222 """
2323
2424
25+ import sys
2526import uuid
2627import json
28+ import csv
29+ import textwrap
2730from argparse import ArgumentParser
2831
2932
# Expected header fields for each kind of input TSV, keyed by the TSV type
# name that is passed to the conformity check ('experiment', 'read_group',
# 'file').
TSV_FIELDS = {
    'experiment': [
        'type', 'program_id', 'submitter_sequencing_experiment_id', 'submitter_donor_id', 'gender',
        'submitter_specimen_id', 'tumour_normal_designation', 'specimen_type', 'specimen_tissue_source',
        'submitter_sample_id', 'sample_type', 'submitter_matched_normal_sample_id', 'sequencing_center',
        'platform', 'platform_model', 'experimental_strategy', 'sequencing_date', 'read_group_count',
    ],
    'read_group': [
        'type', 'submitter_read_group_id', 'read_group_id_in_bam', 'submitter_sequencing_experiment_id',
        'platform_unit', 'is_paired_end', 'file_r1', 'file_r2', 'read_length_r1', 'read_length_r2',
        'insert_size', 'sample_barcode', 'library_name',
    ],
    'file': [
        'type', 'name', 'size', 'md5sum', 'path', 'format',
    ],
}
48+
49+
3050def empty_str_to_null (metadata ):
3151 for k in metadata :
3252 if k in ['read_groups' , 'files' ]:
@@ -36,10 +56,110 @@ def empty_str_to_null(metadata):
3656 metadata [k ] = None
3757
3858
39- def main (args ):
40- with open (args .user_submit_metadata , 'r' ) as f :
41- metadata = json .load (f )
def tsv_confomity_check(ftype, tsv):
    """Check that a TSV file is structurally well-formed for its type.

    The first line is treated as the header and must contain exactly the
    fields listed in ``TSV_FIELDS[ftype]`` (in any order, no duplicates).
    Every following line must have as many tab-separated values as there
    are expected fields and must not repeat an earlier line verbatim.
    Only structure is checked here; the values themselves are not.

    Args:
        ftype: key into ``TSV_FIELDS`` selecting the expected field list.
        tsv: path of the TSV file to check.

    Raises:
        SystemExit: with an "Error found: ..." message on the first
            violation, or when the file has no header line at all.
    """
    expected_fields = TSV_FIELDS[ftype]
    expected_set = set(expected_fields)

    header_processed = False
    seen_rows = set()
    with open(tsv, 'r') as t:
        for line in t:
            # remove trailing newline, remove windows `\r` (just in case)
            line = line.rstrip('\n').rstrip('\r')

            if not header_processed:  # it's the header
                fields = line.split('\t')
                if len(fields) != len(set(fields)):
                    sys.exit("Error found: Field duplicated in input TSV: %s, offending header: %s\n" % (tsv, line))

                # sorted() keeps the reported field order deterministic
                missed_fields = expected_set - set(fields)
                if missed_fields:
                    sys.exit(
                        "Error found: Field missing in input TSV: %s, offending header: %s. Missed field(s): %s\n" %
                        (tsv, line, ', '.join(sorted(missed_fields))))

                unexpected_fields = set(fields) - expected_set
                if unexpected_fields:
                    sys.exit(
                        "Error found: Unexpected field in input TSV: %s, offending header: %s. "
                        "Unexpected field(s): %s\n" %
                        (tsv, line, ', '.join(sorted(unexpected_fields))))

                header_processed = True
                continue

            # It's a data row. At this point we only check that the number of
            # values matches the number of expected fields and that the row is
            # unique; later steps perform more sophisticated content checks.
            values = line.split('\t')
            if len(expected_fields) != len(values):
                sys.exit("Error found: number of fields: %s does not match expected: %s, offending data row: %s\n" %
                         (len(values), len(expected_fields), line))

            if line in seen_rows:
                sys.exit("Error found: data row repeated in file: %s, offending data row: %s\n" % (tsv, line))
            seen_rows.add(line)

    # A file without any line never reaches the header checks above, so it
    # must be rejected explicitly instead of being reported as well-formed.
    if not header_processed:
        sys.exit("Error found: input TSV is empty, no header line found: %s\n" % tsv)
96+
97+
def _parse_tsv_int(value, field, tsv, allow_empty=True):
    """Convert a TSV cell to int, exiting with a readable error when it is not one.

    An empty (or missing) cell yields None when ``allow_empty`` is true and
    is reported as an error otherwise.
    """
    if not value:
        if allow_empty:
            return None
        sys.exit("Error found: field '%s' must not be empty, offending file: %s\n" % (field, tsv))
    try:
        return int(value)
    except ValueError:
        sys.exit("Error found: field '%s' expects an integer, offending file: %s, offending value: %s\n" %
                 (field, tsv, value))


def load_all_tsvs(exp_tsv, rg_tsv, file_tsv):
    """Load the experiment, read group and file TSVs into one metadata dict.

    The single experiment row becomes the top-level keys of the result, with
    'read_group_count' converted to int. Read group rows are collected under
    'read_groups' ('is_paired_end' becomes True/False/None; 'read_length_r1',
    'read_length_r2' and 'insert_size' become int or None when empty). File
    rows are collected under 'files' ('size' becomes int or None when empty).

    Args:
        exp_tsv: path of the experiment TSV (exactly one data row expected).
        rg_tsv: path of the read group TSV (at least one data row expected).
        file_tsv: path of the file TSV (at least one data row expected).

    Returns:
        dict with the experiment fields plus 'read_groups' and 'files' lists.

    Raises:
        SystemExit: with an "Error found: ..." message when a TSV has the
            wrong number of rows or a numeric field is not an integer.
    """
    metadata_dict = {}

    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation asks for.
    with open(exp_tsv, 'r', newline='') as exp_fh:
        rows = list(csv.DictReader(exp_fh, delimiter='\t'))
    if len(rows) != 1:
        sys.exit("Error found: experiment TSV expects exactly one data row, offending file: %s has %s row(s)\n" %
                 (exp_tsv, len(rows)))
    experiment = rows[0]
    experiment['read_group_count'] = _parse_tsv_int(
        experiment['read_group_count'], 'read_group_count', exp_tsv, allow_empty=False)
    metadata_dict.update(experiment)

    metadata_dict['read_groups'] = []
    with open(rg_tsv, 'r', newline='') as rg_fh:
        for rg in csv.DictReader(rg_fh, delimiter='\t'):
            # anything other than 'true'/'false' (case-insensitive) maps to None
            paired = (rg['is_paired_end'] or '').lower()
            rg['is_paired_end'] = {'true': True, 'false': False}.get(paired)

            for field in ('read_length_r1', 'read_length_r2', 'insert_size'):
                rg[field] = _parse_tsv_int(rg[field], field, rg_tsv)

            metadata_dict['read_groups'].append(rg)

    if not metadata_dict['read_groups']:
        sys.exit("Error found: read group TSV does not contain any read group information\n")

    metadata_dict['files'] = []
    with open(file_tsv, 'r', newline='') as file_fh:
        # row variable is kept distinct from the file handle
        for file_row in csv.DictReader(file_fh, delimiter='\t'):
            file_row['size'] = _parse_tsv_int(file_row['size'], 'size', file_tsv)
            metadata_dict['files'].append(file_row)

    if not metadata_dict['files']:
        sys.exit("Error found: file TSV does not contain any file information\n")

    return metadata_dict
143+
144+
def validate_args(args):
    """Check that the command-line arguments form one of the two valid modes.

    Valid modes are: only the metadata JSON ('-m') is given, or the metadata
    JSON is absent and all three TSVs ('-x', '-r', '-f') are given.

    Args:
        args: parsed arguments exposing 'metadata_json',
            'experiment_info_tsv', 'read_group_info_tsv' and 'file_info_tsv'.

    Returns:
        True when the combination is valid.

    Raises:
        SystemExit: with a usage message for any other combination.
    """
    tsv_args = (args.experiment_info_tsv, args.read_group_info_tsv, args.file_info_tsv)

    # JSON mode: '-m' alone, no TSV argument at all
    if args.metadata_json and not any(tsv_args):
        return True

    # TSV mode: no '-m', and every TSV argument present
    if not args.metadata_json and all(tsv_args):
        return True

    sys.exit(textwrap.dedent(
        """
        Usage:
          When '-m' is provided, no other arguments can be used
          When '-m' is not provided, please provide all of these arguments: -x, -r and -f
        """
    ))
160+
161+
162+ def main (metadata ):
43163 empty_str_to_null (metadata )
44164
45165 payload = {
@@ -89,7 +209,10 @@ def main(args):
89209 'fileMd5sum' : input_file .get ('md5sum' ),
90210 'fileType' : input_file .get ('format' ),
91211 'fileAccess' : 'controlled' ,
92- 'dataType' : 'submitted_reads'
212+ 'dataType' : 'Submitted Reads' ,
213+ 'info' : {
214+ 'data_category' : 'Sequencing Reads'
215+ }
93216 }
94217 )
95218
@@ -104,12 +227,35 @@ def main(args):
104227
if __name__ == "__main__":
    # Input is either a single metadata JSON (-m) or the three TSVs
    # (-x, -r, -f); validate_args() enforces that exactly one mode is used.
    parser = ArgumentParser()
    parser.add_argument("-m", "--metadata-json",
                        help="json file containing experiment, read_group and file information submitted from user")
    parser.add_argument("-x", "--experiment-info-tsv",
                        help="tsv file containing experiment information submitted from user")
    parser.add_argument("-r", "--read-group-info-tsv",
                        help="tsv file containing read_group information submitted from user")
    parser.add_argument("-f", "--file-info-tsv",
                        help="tsv file containing file information submitted from user")
    args = parser.parse_args()

    validate_args(args)

    if args.metadata_json:
        with open(args.metadata_json, 'r') as f:
            metadata = json.load(f)
    else:
        # firstly TSV format conformity check, if not well-formed no point to continue
        tsv_confomity_check('experiment', args.experiment_info_tsv)
        tsv_confomity_check('read_group', args.read_group_info_tsv)
        tsv_confomity_check('file', args.file_info_tsv)

        # all TSVs are well-formed, let's load them (once, and only in TSV
        # mode: the TSV paths are None when '-m' is used)
        metadata = load_all_tsvs(
            args.experiment_info_tsv,
            args.read_group_info_tsv,
            args.file_info_tsv
        )

    main(metadata)
0 commit comments