|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +""" |
| 5 | + Copyright (C) 2021, Ontario Institute for Cancer Research |
| 6 | + |
| 7 | + This program is free software: you can redistribute it and/or modify |
| 8 | + it under the terms of the GNU Affero General Public License as published by |
| 9 | + the Free Software Foundation, either version 3 of the License, or |
| 10 | + (at your option) any later version. |
| 11 | + |
| 12 | + This program is distributed in the hope that it will be useful, |
| 13 | + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | + GNU Affero General Public License for more details. |
| 16 | + |
| 17 | + You should have received a copy of the GNU Affero General Public License |
| 18 | + along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 19 | +
|
| 20 | + Authors: |
| 21 | + Linda Xiang |
| 22 | +""" |
| 23 | + |
| 24 | +import os |
| 25 | +import argparse |
| 26 | +import json |
| 27 | +import uuid |
| 28 | +import hashlib |
| 29 | +import copy |
| 30 | + |
# Lookup from detected variant type to the SONG metadata attached to each output
# file. Positional meaning of each list:
#   [0] data_category, [1] data_type,
#   [2] analysis_tools when the input came from Sanger (sanger-wgs / sanger-wxs),
#   [3] analysis_tools when the input came from GATK-Mutect2.
variant_type_to_data_type_etc = {
    'snv': ['Simple Nucleotide Variation', 'Raw SNV Calls', ['CaVEMan', 'bcftools'], ['GATK-Mutect2', 'bcftools']],  # dataCategory, dataType, analysis_tools
    'indel': ['Simple Nucleotide Variation', 'Raw InDel Calls', ['Pindel', 'bcftools'], ['GATK-Mutect2', 'bcftools']]
}

# Expands the workflow name given on the command line (-w) to the human-readable
# name recorded in the generated payload's 'workflow.workflow_name' field.
workflow_full_name = {
    'open-access-variant-filtering': 'Open Access Variant Filtering'
}
| 39 | + |
def calculate_size(file_path):
    """Return the size of the file at *file_path*, in bytes."""
    return os.path.getsize(file_path)
| 42 | + |
| 43 | + |
def calculate_md5(file_path):
    """Return the hex MD5 digest of the file's contents.

    The file is streamed in 1 MiB chunks so arbitrarily large files can be
    hashed without loading them fully into memory.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.md5()
    with open(file_path, 'rb') as stream:
        chunk = stream.read(chunk_size)
        while chunk:
            hasher.update(chunk)
            chunk = stream.read(chunk_size)
    return hasher.hexdigest()
| 50 | + |
| 51 | + |
def get_files_info(file_to_upload, args, input_wf, variant_type):
    """Build the SONG 'files' metadata entry for one output file.

    :param file_to_upload: path of the file to be uploaded
    :param args: parsed CLI args; uses args.wf_short_name and args.controlled
    :param input_wf: workflow_short_name of the input analysis (e.g.
        'sanger-wgs', 'sanger-wxs', 'gatk-mutect2'); determines analysis_tools
    :param variant_type: 'snv' or 'indel', key into variant_type_to_data_type_etc
    :return: dict describing the file (name, type, size, md5, access, info)
    """
    # Output file is renamed so the literal token "filtered" becomes the
    # short name of this workflow.
    basename = os.path.basename(file_to_upload).replace("filtered", args.wf_short_name)

    # Unpack the lookup once into named values instead of repeated magic
    # positional indexing ([0], [2], [3]) below.
    data_category, data_type, sanger_tools, mutect2_tools = \
        variant_type_to_data_type_etc[variant_type]

    file_info = {
        'fileName': basename,
        'fileType': 'VCF' if basename.endswith('.vcf.gz') else basename.split(".")[-1].upper(),
        'fileSize': calculate_size(file_to_upload),
        'fileMd5sum': calculate_md5(file_to_upload),
        'fileAccess': 'open' if not args.controlled else 'controlled',
        'info': {
            'data_category': data_category
        }
    }

    if file_to_upload.endswith('.vcf.gz'):
        file_info['dataType'] = data_type
    elif file_to_upload.endswith('.vcf.gz.tbi'):
        file_info['dataType'] = 'VCF Index'
    # NOTE(review): any other extension gets no 'dataType' key, matching the
    # original behavior — confirm the SONG schema tolerates its absence.

    if input_wf in ('sanger-wgs', 'sanger-wxs'):
        file_info['info']['analysis_tools'] = sanger_tools
    elif input_wf == 'gatk-mutect2':
        file_info['info']['analysis_tools'] = mutect2_tools

    return file_info
| 79 | + |
def get_sample_info(sample_list):
    """Return a deep copy of *sample_list* with SONG-managed identifier
    fields removed from each sample and from its nested specimen and donor
    objects. The input list is left untouched.
    """
    stripped = copy.deepcopy(sample_list)
    keys_to_drop = ('info', 'sampleId', 'specimenId', 'donorId', 'studyId')
    for entry in stripped:
        for key in keys_to_drop:
            entry.pop(key, None)
            entry['specimen'].pop(key, None)
            entry['donor'].pop(key, None)
    return stripped
| 89 | + |
def get_variant_type(analysis):
    """Infer the variant type ('snv' or 'indel') from an analysis document.

    Scans analysis['files'] for a dataType of 'Raw SNV Calls' or
    'Raw InDel Calls' (index files are irrelevant and naturally skipped);
    as before, the last matching file wins.

    :param analysis: input SONG analysis dict
    :return: 'snv' or 'indel'
    :raises ValueError: if no file identifies the variant type (the original
        code raised an opaque UnboundLocalError in this case)
    """
    variant_type = None
    for f in analysis.get('files', []):
        data_type = f.get('dataType')
        if data_type == "Raw SNV Calls":
            variant_type = 'snv'
        elif data_type == "Raw InDel Calls":
            variant_type = 'indel'

    if variant_type is None:
        raise ValueError(
            "Unable to determine variant type: no file with dataType "
            "'Raw SNV Calls' or 'Raw InDel Calls' in the input analysis")
    return variant_type
| 99 | + |
def main():
    """Generate a 'variant_filtering' SONG payload from an input analysis.

    Reads the input analysis JSON (-a), derives the variant type and per-file
    metadata for every file given with -f, and writes the payload to
    '<uuid>.variant_filtering.payload.json' in the working directory.
    """

    parser = argparse.ArgumentParser(description='Tool: payload-gen-variant-filtering')
    parser.add_argument("-a", dest="analysis", required=True,
                        help="json file containing sequencing_alignment SONG analysis for tumour sample")
    parser.add_argument("-f", dest="files_to_upload", type=str, nargs="+", help="Files to be uploaded", required=True)
    parser.add_argument("-w", dest="wf_name", type=str, help="workflow full name", required=True)
    parser.add_argument("-s", dest="wf_short_name", type=str, help="workflow short name", required=True)
    parser.add_argument("-v", dest="wf_version", type=str, required=True, help="workflow version")
    parser.add_argument("-r", dest="wf_run", type=str, required=True, help="workflow run ID")
    parser.add_argument("-j", dest="wf_session", type=str, required=True, help="workflow session ID")
    parser.add_argument("-c", dest="controlled", action='store_true', help="set file to be controlled access")
    args = parser.parse_args()

    with open(args.analysis, 'r') as analysis_file:
        analysis = json.load(analysis_file)

    # Provenance of the input analysis, recorded in the payload's workflow section.
    input_analysis_type = analysis.get('analysisType').get('name')
    input_wf = analysis.get('workflow', {}).get('workflow_short_name')
    variant_type = get_variant_type(analysis)

    output_analysis_type = "variant_filtering"
    file_entries = [get_files_info(path, args, input_wf, variant_type)
                    for path in args.files_to_upload]

    payload = {
        'analysisType': {
            'name': output_analysis_type
        },
        'studyId': analysis.get('studyId'),
        'experiment': analysis.get('experiment'),
        'samples': get_sample_info(analysis.get('samples')),
        'files': file_entries,
        'workflow': {
            'workflow_name': workflow_full_name.get(args.wf_name),
            'workflow_short_name': args.wf_short_name,
            'workflow_version': args.wf_version,
            'run_id': args.wf_run,
            'session_id': args.wf_session,
            'inputs': [
                {
                    'input_analysis_id': analysis.get('analysisId'),
                    'analysis_type': input_analysis_type
                }
            ],
            'genome_build': 'GRCh38_hla_decoy_ebv'
        },
        'variant_class': analysis.get('variant_class')
    }

    out_name = "%s.%s.payload.json" % (str(uuid.uuid4()), output_analysis_type)
    with open(out_name, 'w') as out_file:
        json.dump(payload, out_file, indent=2)


if __name__ == "__main__":
    main()
| 161 | + |
0 commit comments