#!/usr/bin/env cwl-runner
cwlVersion: v1.2
class: Workflow
# Optional CWL features this workflow relies on (alphabetical order).
requirements:
  # JavaScript `${...}` / `$(...)` expressions in `when`, `valueFrom` and ExpressionTools.
  InlineJavascriptRequirement: {}
  # Step inputs that list several sources and combine them with linkMerge/pickValue.
  MultipleInputFeatureRequirement: {}
  # Steps that declare `scatter`.
  ScatterFeatureRequirement: {}
  # `valueFrom` expressions on step inputs.
  StepInputExpressionRequirement: {}
  # Steps whose `run` target is another workflow file.
  SubworkflowFeatureRequirement: {}

label: (Hybrid) Metagenomics workflow
doc: |
  **Workflow (hybrid) metagenomic assembly and binning**<br>
    - Workflow Illumina Quality: 
      - FastQC (control)
      - fastp (quality trimming)
    - Workflow Longread Quality:
      - NanoPlot (control)
      - filtlong (quality trimming)
      - minimap2 contamination filter
    - Kraken2 taxonomic classification of FASTQ reads
    - SPAdes/Flye (Assembly)
    - Medaka/PyPolCA (Assembly polishing)
    - QUAST (Assembly quality report)

    (optional)
    - Workflow binning
      - Metabat2/MaxBin2/SemiBin
      - Binette
      - BUSCO
      - GTDB-Tk

    (optional)
    - Workflow Genome-scale metabolic models https://workflowhub.eu/workflows/372
      - CarveMe (GEM generation)
      - MEMOTE (GEM test suite)
      - SMETANA (Species METabolic interaction ANAlysis)

  Other UNLOCK workflows on WorkflowHub: https://workflowhub.eu/projects/16/workflows?view=default<br><br>

  **All tool CWL files and other workflows can be found here:**<br>
    https://gitlab.com/m-unlock/cwl/ <br>

  **How to setup and use an UNLOCK workflow:**<br>
  https://docs.m-unlock.nl/docs/workflows/setup.html<br>

# Result directories exposed by the workflow. Each one is taken from the
# `results` output of a *_files_to_folder step.
outputs:
  # Optional: read filtering stats together with the filtered reads.
  read_filtering_output_keep:
    outputSource: keep_readfilter_files_to_folder/results
    type: Directory?
    label: Read filtering output
    doc: Read filtering stats + filtered reads
  # Optional: read filtering stats only.
  read_filtering_output:
    outputSource: readfilter_files_to_folder/results
    type: Directory?
    label: Read filtering output
    doc: Read filtering stats
  # The only non-optional output of the workflow.
  assembly_output:
    outputSource: assembly_files_to_folder/results
    type: Directory
    label: Assembly output
    doc: Output from different assembly steps
  # Optional: binning result folders.
  binning_output:
    outputSource: binning_files_to_folder/results
    type: Directory?
    label: Binning output
    doc: Binning outputfolders
  # gem_output:
  #   label: Community GEM output
  #   doc: Community GEM output folder
  #   type: Directory?
  #   outputSource: GEM_files_to_folder/results

inputs:
  # Required dataset name; several steps use it as identifier / output prefix.
  identifier:
    label: Identifier
    doc: Identifier for this dataset used in this workflow (required)
    type: string
  # Thread count handed to the individual steps through their `threads` input.
  threads:
    type: int
    doc: Number of threads to use for each computational process (default 2)
    label: Number of threads
    default: 2
  # Memory limit in megabytes; in this section it is only wired into the SPAdes step.
  memory:
    type: int
    doc: Maximum memory usage in megabytes. This is mostly important for SPAdes assembly. (default 8000, i.e. 8GB)
    label: Memory usage (MB)
    default: 8000

  # Sequencing read inputs. All four are optional file lists; the quality
  # workflows are skipped when the corresponding list is not supplied.
  illumina_forward_reads:
    label: Forward reads
    doc: Illumina Forward sequence file(s)
    type: File[]?
    loadListing: no_listing
  illumina_reverse_reads:
    label: Reverse reads
    doc: Illumina Reverse sequence file(s)
    type: File[]?
    loadListing: no_listing
  pacbio_reads:
    label: PacBio reads
    doc: File(s) with PacBio reads in FASTQ format
    type: File[]?
    loadListing: no_listing
  nanopore_reads:
    label: Oxford Nanopore reads
    doc: File(s) with Oxford Nanopore reads in FASTQ format
    type: File[]?
    loadListing: no_listing
  # Flag for ONT FASTQ files that carry extra channel/time information.
  # NOTE(review): not wired into any step visible in this section - confirm where it is consumed.
  fastq_rich:
    type: boolean
    doc: |
      Input fastq is generated by albacore, MinKNOW or guppy with additional information concerning channel and time.
      Used to create more informative quality plots (default false)
    label: Fastq rich (ONT)
    default: false

# Longread filter options
  # Long-read trimming/filtering options. They are passed unchanged to both the
  # Nanopore and the PacBio quality workflow steps. Only the two quality
  # thresholds declare a default here; defaults quoted in the other doc
  # strings are presumably applied by the underlying tool - TODO confirm.

  # --- length filters ---
  longread_minimum_length:
    label: Minimum length required
    doc: Reads shorter will be discarded. (default 100)
    type: int?
  longread_length_limit:
    label: Maximum length limit
    doc: Reads longer than length_limit will be discarded. (default no limit)
    type: int?

  # --- quality filters ---
  longread_qualified_quality_phred:
    label: Qualified_quality_phred
    doc: The quality value that a base is qualified. (default 9 means phred quality >=Q9 is qualified)
    type: int?
    default: 9
  longread_mean_qual:
    label: Mean quality
    doc: If one read's mean_qual quality score < mean_qual, then this read is discarded. (default 10)
    type: int?
    default: 10

  # --- fixed and polyX trimming ---
  longread_trim_front:
    label: Trim_front
    doc: Trimming how many bases in front for read. (default 0)
    type: int?
  longread_trim_tail:
    label: trim_tail
    doc: Trimming how many bases in tail for read. (default 0)
    type: int?
  longread_trim_poly_x:
    label: Trim_poly_x
    doc: Enable polyX trimming in 3' ends. (default false)
    type: boolean?
  longread_poly_x_min_len:
    label: Poly_x_min_len
    doc: The minimum length to detect polyX in the read tail. (default 10 when trim_poly_x is true)
    type: int?

  # --- adapter handling ---
  longread_start_adapter:
    label: start_adapter
    doc: The adapter sequence at read start (5'). (default auto-detect)
    type: string?
  longread_end_adapter:
    label: End adapter
    doc: The adapter sequence at read end (3'). (default auto-detect)
    type: string?
  longread_adapter_fasta:
    label: Adapter fasta
    doc: Specify a FASTA file to trim both read ends by all the sequences in this FASTA file. (default None)
    type: File?
  longread_disable_adapter_trimming:
    label: Disable adapter trimming
    doc: Adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled. (default false)
    type: boolean?

  # Read filtering parameters
  # Optional Bowtie2 index directory; passed to the Illumina quality workflow as `humandb`.
  illumina_humandb:
    type: Directory?
    doc: Bowtie2 index folder. Provide the folder in which the index files are located. (optional)
    label: Filter human reads
    loadListing: no_listing
  # Optional human reference for long reads; passed as `humandb` to both the
  # Nanopore and the PacBio quality workflow steps (not to the Illumina one).
  longread_humandb:
    type: File?
    doc: A FASTA file or a minimap2 index file (.mmi) needs to be provided. Preindexed is much faster. (optional)
    label: Filter human long reads
    loadListing: no_listing
  # Optional custom filter reference for Illumina reads; passed to the Illumina
  # quality workflow as `reference_filter_db`.
  illumina_reference_filter_db:
    label: Illumina reference filter db
    type: Directory?
    loadListing: no_listing
    doc: |
      Custom reference database for filtering with Hostile. 
      Provide the folder in which the bowtie2 index files are located. (optional)
  # Optional custom filter reference for long reads; passed as
  # `reference_filter_db` to both long-read quality workflow steps.
  longread_reference_filter_db:
    type: File?
    doc: A FASTA file or a minimap2 index file (.mmi) needs to be provided. Preindexed is much faster. (optional)
    label: Longread reference filter db
    loadListing: no_listing

  # Inverts reference filtering: passed as `keep_reference_mapped_reads` to all
  # three quality workflow steps.
  use_reference_mapped_reads:
    label: Keep mapped reads
    doc: Discard unmapped and keep reads mapped to the given reference. (default false (discard mapped))
    type: boolean
    default: false

  # Whether filtered reads end up in the final output.
  keep_filtered_reads:
    label: Keep filtered reads
    doc: Keep filtered reads in the final output (default false)
    type: boolean
    default: false
  # Passed to the Illumina quality workflow as `deduplicate`.
  deduplicate_illumina_reads:
    type: boolean
    doc: Remove exact duplicate Illumina reads with fastp (default false)
    label: Deduplicate illumina reads
    default: false

  # KRAKEN2
  # Switch read by the `when` condition of the Kraken2 step.
  run_kraken2_illumina:
    label: Run kraken2 on Illumina reads
    doc: Run kraken2 on Illumina reads. A kraken2 database needs to be provided using the input kraken2_database. (default false)
    type: boolean
    default: false
  # true = do NOT run Bracken; forwarded to the Kraken2/Bracken sub-workflow.
  # The label must describe skipping, matching the input name and doc.
  skip_bracken:
    type: boolean
    label: Skip Bracken
    doc: Skip Bracken analysis. Illumina only. A bracken compatible kraken2 database needs to be provided using the input kraken2_database. (default false)
    default: false
  # Remaining Kraken2/Bracken settings, all forwarded to the Kraken2 step.
  bracken_levels:
    label: Bracken levels
    doc: Taxonomy levels in bracken estimate abundances on. Default runs through; [P,C,O,F,G,S]
    type: string[]
    default:
      - P
      - C
      - O
      - F
      - G
      - S
  illumina_read_length:
    label: Read length
    doc: Read length to use in bracken only atm. Usually 50,75,100,150,200,250 or 300. (default 150)
    type: int?
    default: 150
  kraken2_confidence:
    label: Kraken2 confidence threshold
    doc: Confidence score threshold must be in [0, 1] (default 0.0)
    type: float?
  # The Kraken2 step scatters over this list (one run per database); the
  # default is an empty list.
  kraken2_database:
    label: Kraken2 database
    doc: Database location of kraken2. (optional)
    type: Directory[]?
    default: []
    loadListing: no_listing
  kraken2_standard_report:
    label: Kraken2 standard report
    doc: Also output Kraken2 standard report with per read classification. These can be large. (default false)
    type: boolean
    default: false
  
  # SYLPH
  #
  #
  #  

  # ASSEMBLY
  # General assembler switches.
  # genome_size is only wired into the Flye step in this section.
  genome_size:
    label: Genome Size
    doc: Estimated genome size (for example, 5m or 2.6g). Used in Flye. (optional)
    type: string?
  # Forwarded to both the SPAdes and the Flye step.
  metagenome:
    label: When working with metagenomes
    doc: Metagenome option for assemblers (default true)
    type: boolean
    default: true
  # Gate for the SPAdes steps (assembly, contig/scaffold choice, compression).
  run_spades:
    label: Use SPAdes
    doc: Run with SPAdes assembler (default true)
    type: boolean
    default: true
  # Passed to the SPAdes step as `only_assembler`.
  # The declared default now matches the documented one (false); previously
  # the value was true while the doc string promised false.
  only_assembler_mode_spades:
    type: boolean
    label: Only spades assembler
    doc: Run spades in only assembler mode (without read error correction). (default false)
    default: false
  # Chooses between SPAdes scaffolds and contigs in the spades_assembly and
  # get_assembly_to_use steps.
  use_spades_scaffolds:
    label: Use SPAdes scaffolds
    doc: Use SPAdes scaffolds instead of contigs for post-processing (polishing/mapping/binning). (default false)
    type: boolean
    default: false
  # Gate for the Flye step (and, together with run_medaka, for Medaka).
  run_flye:
    label: Use Flye
    doc: Run with Flye assembler. Requires long reads (default false)
    type: boolean
    default: false
  # Passed to the Flye step as `deterministic`.
  flye_deterministic:
    label: Deterministic Flye
    doc: Perform disjointig assembly single-threaded in Flye assembler (slower). (default false)
    type: boolean
    default: false
  # Gate for the Medaka polishing step; that step additionally requires
  # run_flye and a non-empty nanopore_reads list.
  run_medaka:
    type: boolean
    label: Use Medaka
    doc: Run with Medaka assembly polishing using nanopore (not pacbio) reads only. (default false)
    default: false
  # Gate for the PyPolCA polishing step.
  run_pypolca:
    label: Use PyPolCA
    doc: Run with PyPolCA assembly polishing using Illumina reads only. (default false)
    type: boolean
    default: false
  # allow user to choose assembler(s) for post-assembly processes (polishing/mapping/binning)
  assembly_choice:
    label: Assembly choice
    doc: |
      User's choice of assembly for post-assembly (binning) processes ('spades', 'flye', 'pypolca', 'medaka'). Optional. Only one choice allowed.
      When none is given, the first available assembly in this order is chosen: pypolca, medaka, flye, spades.
    # Optional enum: either null or exactly one of the listed symbols.
    type:
      - 'null'
      - type: enum
        symbols:
          - spades
          - flye
          - pypolca
          - medaka
  # One of the two switches (with `binning`) that enable Illumina read mapping.
  output_bam_file:
    label: Output BAM file
    doc: Output BAM file of mapped reads to assembly of choice. (default false)
    type: boolean
    default: false
  # Basecalling model name, forwarded to the Medaka step as `basecall_model`.
  # No `default` is declared here, so the "default r941_min_high" mentioned in
  # the doc text is presumably applied by the Medaka tool itself - TODO confirm.
  # The value is a free-form string; the list in the doc text is not enforced
  # by this input's type.
  ont_basecall_model:
    type: string?
    label: ONT Basecalling model used for MEDAKA
    doc: |
      Used in MEDAKA
      Basecalling model used with guppy default r941_min_high.
      Available: r103_fast_g507, r103_fast_snp_g507, r103_fast_variant_g507, r103_hac_g507, r103_hac_snp_g507, r103_hac_variant_g507, r103_min_high_g345, r103_min_high_g360, r103_prom_high_g360, r103_prom_snp_g3210, r103_prom_variant_g3210, r103_sup_g507, r103_sup_snp_g507, r103_sup_variant_g507, r1041_e82_400bps_fast_g615, r1041_e82_400bps_fast_variant_g615, r1041_e82_400bps_hac_g615, r1041_e82_400bps_hac_variant_g615, r1041_e82_400bps_sup_g615, r1041_e82_400bps_sup_variant_g615, r104_e81_fast_g5015, r104_e81_fast_variant_g5015, r104_e81_hac_g5015, r104_e81_hac_variant_g5015, r104_e81_sup_g5015, r104_e81_sup_g610, r104_e81_sup_variant_g610, r10_min_high_g303, r10_min_high_g340, r941_e81_fast_g514, r941_e81_fast_variant_g514, r941_e81_hac_g514, r941_e81_hac_variant_g514, r941_e81_sup_g514, r941_e81_sup_variant_g514, r941_min_fast_g303, r941_min_fast_g507, r941_min_fast_snp_g507, r941_min_fast_variant_g507, r941_min_hac_g507, r941_min_hac_snp_g507, r941_min_hac_variant_g507, r941_min_high_g303, r941_min_high_g330, r941_min_high_g340_rle, r941_min_high_g344, r941_min_high_g351, r941_min_high_g360, r941_min_sup_g507, r941_min_sup_snp_g507, r941_min_sup_variant_g507, r941_prom_fast_g303, r941_prom_fast_g507, r941_prom_fast_snp_g507, r941_prom_fast_variant_g507, r941_prom_hac_g507, r941_prom_hac_snp_g507, r941_prom_hac_variant_g507, r941_prom_high_g303, r941_prom_high_g330, r941_prom_high_g344, r941_prom_high_g360, r941_prom_high_g4011, r941_prom_snp_g303, r941_prom_snp_g322, r941_prom_snp_g360, r941_prom_sup_g507, r941_prom_sup_snp_g507, r941_prom_sup_variant_g507, r941_prom_variant_g303, r941_prom_variant_g322, r941_prom_variant_g360, r941_sup_plant_g610, r941_sup_plant_variant_g610
      (required for Medaka)

  # BINNING
  # Master switch for the binning workflow; it also enables Illumina read
  # mapping even when no BAM output was requested.
  binning:
    label: Run binning workflow
    doc: Run with contig binning workflow (default false)
    type: boolean
    default: false
  # Individual binner switches, forwarded to the binning workflow.
  run_maxbin2:
    label: Run Maxbin2
    doc: Run with MaxBin2 binner. (default true)
    type: boolean
    default: true
  run_semibin2:
    label: Run SemiBin
    doc: Run with SemiBin2 binner. (default true)
    type: boolean
    default: true
  # Required enum (no null alternative) with default `global`; forwarded to the
  # binning workflow.
  semibin2_environment:
    label: SemiBin Environment
    type:
      - type: enum
        symbols:
          - none
          - global
          - human_gut
          - dog_gut
          - ocean
          - soil
          - cat_gut
          - human_oral
          - mouse_gut
          - pig_gut
          - built_environment
          - wastewater
          - chicken_caecum
    default: global
    doc: |
      Semibin2 Built-in models (none/global/human_gut/dog_gut/ocean/soil/cat_gut/human_oral/mouse_gut/pig_gut/built_environment/wastewater/chicken_caecum). 
      Choosing a built-in model is generally faster. Otherwise it will do (single-sample) training on the data.
      Default global. Choose none if you want to do training on your own data.
  # Optional reference data directories, forwarded to the binning workflow.
  gtdbtk_data:
    label: gtdbtk data directory
    doc: Directory containing the GTDBTK repository
    type: Directory?
    loadListing: no_listing
  busco_data:
    label: BUSCO dataset
    doc: Path to the BUSCO dataset downloaded location. (optional)
    type: Directory?
    loadListing: no_listing
  
  # BIN ANNOTATION
  # Annotation switches and the Bakta database, all forwarded to the binning
  # workflow.
  annotate_bins:
    label: Annotate bins
    doc: Annotate bins. (default false)
    type: boolean
    default: false
  annotate_unbinned:
    label: Annotate unbinned
    doc: Annotate unbinned contigs. Will be treated as metagenome. (default false)
    type: boolean
    default: false
  bakta_db:
    label: Bakta DB
    doc: Bakta Database directory. Default is built-in bakta-light db. (optional)
    type: Directory?
  skip_bakta_crispr:
    label: Skip bakta CRISPR
    doc: Skip bakta CRISPR array prediction using PILER-CR. (default false)
    type: boolean
    default: false

  # Program/database locations for the functional annotation tools, forwarded
  # to the binning workflow.
  interproscan_directory:
    label: InterProScan 5 directory
    doc: Directory of the (full) InterProScan 5 program. Used for annotating bins. (optional)
    type: Directory?
  # Optional record bundling the eggNOG database paths; every field is optional.
  eggnog_dbs:
    type:
      - 'null'
      - name: eggnog_dbs
        type: record
        fields:
          data_dir:
            doc: Directory containing all data files for the eggNOG database.
            type: Directory?
          db:
            doc: eggNOG database file
            type: File?
          diamond_db:
            doc: eggNOG database file for diamond blast search
            type: File?
  # KofamScan and eggNOG-mapper switches, forwarded to the binning workflow.
  run_kofamscan:
    label: Run kofamscan
    doc: Run with KEGG KO KoFamKOALA annotation. (default false)
    type: boolean
    default: false
  kofamscan_limit_sapp:
    label: SAPP kofamscan limit
    doc: Limit max number of entries of kofamscan hits per locus in SAPP. (default 5)
    type: int?
    default: 5
  run_eggnog:
    label: Run eggNOG-mapper
    doc: Run with eggNOG-mapper annotation. Requires eggnog database files. (default false)
    type: boolean
    default: false
  # InterProScan switch, forwarded to the binning workflow. The doc text names
  # InterProScan; it previously repeated the eggNOG-mapper wording.
  run_interproscan:
    type: boolean
    label: Run InterProScan
    doc: Run with InterProScan annotation. Requires InterProScan v5 program files. (default false)
    default: false
  # Comma separated analysis list, forwarded unchanged to the binning workflow.
  interproscan_applications:
    label: InterProScan applications
    type: string
    default: 'Pfam,SFLD,SMART,AntiFam,NCBIfam'
    doc: |
      Comma separated list of analyses:
      FunFam,SFLD,PANTHER,Gene3D,Hamap,PRINTS,ProSiteProfiles,Coils,SUPERFAMILY,SMART,CDD,PIRSR,ProSitePatterns,AntiFam,Pfam,MobiDBLite,PIRSF,NCBIfam

      default Pfam,SFLD,SMART,AntiFam,NCBIfam

  # GEM WORKFLOW  
  # run_GEM:
  #   type: boolean
  #   label: Run GEM workflow
  #   doc: | 
  #         Run the community GEnomescale Metabolic models workflow on bins. (default false)
  #         NOTE: Uses by default private docker containers 
  #   default: false
  # run_smetana:
  #   type: boolean
  #   label: Run SMETANA
  #   doc: Run SMETANA (Species METabolic interaction ANAlysis) (default false)
  #   default: false
  # smetana_solver:
  #   type: string?
  #   doc: Solver to be used in SMETANA (now only run with cplex)
  # memote_solver:
  #   type: string?
  #   label: MEMOTE solver
  #   doc: MEMOTE solver Choice (cplex, glpk, gurobi, glpk_exact); by default glpk
  # gapfill:
  #   type: string?
  #   label: Gap fill
  #   doc: Gap fill model for given media
  # mediadb:
  #   type: File?
  #   label: Media database
  #   doc: Media database file
  # carveme_solver:
  #   type: string?
  #   label: CarveMe solver
  #   doc: CarveMe solver (default scip), possible to use cplex in private container (not provided in public container)  


  # Input provenance (used for cwl-prov)
  # Provenance-only inputs; neither is wired into a step in this section.
  destination:
    label: Output Destination
    doc: Optional output destination only used for cwl-prov reporting.
    type: string?
  source:
    type: string[]?
    label: Input URLs used for this run
    doc: A provenance element to capture the original source of the input data

steps:
#############################################
#### Quality workflow Illumina
  workflow_quality_illumina:
    label: Illumina quality workflow
    doc: Quality control and filtering workflow for Illumina reads
    # Skip when no Illumina forward reads were supplied (null) or the list is
    # empty; SPAdes, compression and binning use the same empty-list check.
    when: $(inputs.forward_reads !== null && inputs.forward_reads.length !== 0)
    run: workflow_illumina_quality.cwl
    in:
      identifier: identifier

      # Raw paired-end reads as given to the workflow.
      forward_reads: illumina_forward_reads
      reverse_reads: illumina_reverse_reads

      deduplicate: deduplicate_illumina_reads

      # Optional filter references and the keep/discard switch.
      humandb: illumina_humandb
      reference_filter_db: illumina_reference_filter_db
      keep_reference_mapped_reads: use_reference_mapped_reads

      threads: threads
    out: [QC_forward_reads, QC_reverse_reads, reports_folder]

#############################################
#### Quality workflow Oxford Nanopore
  workflow_quality_nanopore:
    label: Oxford Nanopore quality workflow
    doc: Quality, filtering and taxonomic classification workflow for Oxford Nanopore reads
    # Skip when no Nanopore reads were supplied (null) or the list is empty;
    # Flye and Medaka apply the same empty-list check to nanopore_reads.
    when: $(inputs.longreads !== null && inputs.longreads.length !== 0)
    run: workflow_longread_quality.cwl
    in:
      identifier: identifier
      longreads: nanopore_reads
      # Constant telling the shared long-read workflow which platform it handles.
      readtype:
        default: "nanopore"

      # Optional filter references and the keep/discard switch.
      humandb: longread_humandb
      reference_filter_db: longread_reference_filter_db
      keep_reference_mapped_reads: use_reference_mapped_reads

      # Long-read trimming/filtering options, passed through unchanged.
      minimum_length: longread_minimum_length
      length_limit: longread_length_limit

      qualified_quality_phred: longread_qualified_quality_phred
      mean_qual: longread_mean_qual

      trim_front: longread_trim_front
      trim_tail: longread_trim_tail

      trim_poly_x: longread_trim_poly_x
      poly_x_min_len: longread_poly_x_min_len

      start_adapter: longread_start_adapter
      end_adapter: longread_end_adapter

      adapter_fasta: longread_adapter_fasta
      disable_adapter_trimming: longread_disable_adapter_trimming

      threads: threads
    out: [filtered_reads, reports_folder]
#############################################
#### Quality workflow PacBio
  workflow_quality_pacbio:
    label: PacBio quality and filtering workflow
    doc: Quality, filtering and taxonomic classification for PacBio reads
    # Skip when no PacBio reads were supplied (null) or the list is empty;
    # Flye applies the same empty-list check to pacbio_reads.
    when: $(inputs.longreads !== null && inputs.longreads.length !== 0)
    run: workflow_longread_quality.cwl
    in:

      identifier: identifier
      longreads: pacbio_reads
      # Constant telling the shared long-read workflow which platform it handles.
      readtype:
        default: "pacbio"

      # Optional filter references and the keep/discard switch.
      humandb: longread_humandb
      reference_filter_db: longread_reference_filter_db
      keep_reference_mapped_reads: use_reference_mapped_reads

      # Long-read trimming/filtering options, passed through unchanged.
      minimum_length: longread_minimum_length
      length_limit: longread_length_limit

      qualified_quality_phred: longread_qualified_quality_phred
      mean_qual: longread_mean_qual

      trim_front: longread_trim_front
      trim_tail: longread_trim_tail

      trim_poly_x: longread_trim_poly_x
      poly_x_min_len: longread_poly_x_min_len

      start_adapter: longread_start_adapter
      end_adapter: longread_end_adapter

      adapter_fasta: longread_adapter_fasta
      disable_adapter_trimming: longread_disable_adapter_trimming


      threads: threads
    out: [filtered_reads, reports_folder]

#############################################
#### Kraken2
  workflow_kraken2_illumina:
    label: Kraken2 illumina
    doc: Taxonomic classification using kraken2 Illumina reads
    # Run only when the user enabled Kraken2 AND a database was supplied.
    # The flag used to be negated here, which ran Kraken2 exactly when the
    # user had NOT asked for it (run_kraken2_illumina defaults to false).
    when: $(inputs.run_kraken2_illumina && inputs.kraken2_database !== null && inputs.kraken2_database.length !== 0)
    run: workflow_kraken2-bracken.cwl
    # One sub-workflow run per supplied Kraken2 database.
    scatter: kraken2_database
    in:
      # Only consumed by the `when` expression above.
      run_kraken2_illumina: run_kraken2_illumina
      identifier: identifier
      threads: threads
      skip_bracken: skip_bracken
      kraken2_database: kraken2_database
      kraken2_confidence: kraken2_confidence
      # Quality-controlled reads from the Illumina quality workflow.
      illumina_forward_reads: workflow_quality_illumina/QC_forward_reads
      illumina_reverse_reads: workflow_quality_illumina/QC_reverse_reads
      output_standard_report: kraken2_standard_report
      read_length: illumina_read_length
      bracken_levels: bracken_levels
    out: [kraken2_folder, bracken_folder]

#### Assembly
#############################################
#### Assembly using SPAdes
  spades:
    doc: Genome assembly using SPAdes with illumina and or long reads
    label: SPAdes assembly
    # Run only when SPAdes is enabled and quality-controlled Illumina forward
    # reads are actually available.
    when: $(inputs.run_spades && inputs.forward_reads !== null && inputs.forward_reads.length !== 0)
    run: ../tools/spades/spades.cwl
    in:
      # Only consumed by the `when` expression above.
      run_spades: run_spades

      output_filename_prefix: identifier

      # merge_nested wraps the QC output into a list. When the Illumina quality
      # step was skipped its output is null, which would give [null] (length 1)
      # and let the `when` guard pass. all_non_null removes that null so the
      # list is empty and SPAdes is skipped, as the sibling SPAdes steps are.
      # NOTE(review): assumes QC_forward_reads/QC_reverse_reads are single
      # files rather than arrays - confirm against workflow_illumina_quality.cwl.
      forward_reads:
        source: [ workflow_quality_illumina/QC_forward_reads ]
        linkMerge: merge_nested
        pickValue: all_non_null
      reverse_reads:
        source: [ workflow_quality_illumina/QC_reverse_reads ]
        linkMerge: merge_nested
        pickValue: all_non_null
      # Long reads are optional: wrap the filtered reads in a list, or pass
      # null when the corresponding quality step did not run.
      pacbio_reads:
        source: workflow_quality_pacbio/filtered_reads
        valueFrom:
          ${
            var reads = null;
            if (self !== null) { reads = [self]; }
            return reads;
          }
      nanopore_reads:
        source: workflow_quality_nanopore/filtered_reads
        valueFrom:
          ${
            var reads = null;
            if (self !== null) { reads = [self]; }
            return reads;
          }
      only_assembler: only_assembler_mode_spades
      metagenome: metagenome
      memory: memory
      threads: threads
    out: [contigs, scaffolds, assembly_graph, assembly_graph_with_scaffolds, contigs_assembly_paths, scaffolds_assembly_paths, contigs_before_rr, params, log, internal_config, internal_dataset]

  spades_assembly:
    label: SPAdes contigs or scaffolds
    doc: Get chosen spades assembly. Contigs or scaffolds
    # Same gate as the compression step: SPAdes enabled and Illumina forward
    # reads supplied to the workflow.
    when: $(inputs.run_spades && inputs.forward_reads !== null && inputs.forward_reads.length !== 0)
    in:
      # Only consumed by the `when` expression.
      run_spades: run_spades
      forward_reads: illumina_forward_reads
      # Candidates and the switch that picks between them.
      spades_contigs: spades/contigs
      spades_scaffolds: spades/scaffolds
      use_spades_scaffolds: use_spades_scaffolds
    out: [assembly]
    run:
      class: ExpressionTool
      requirements:
        InlineJavascriptRequirement: {}
      inputs:
        spades_contigs:
          type: File
        spades_scaffolds:
          type: File
        use_spades_scaffolds:
          type: boolean
      outputs:
        assembly:
          type: File
      # Returns the scaffolds when use_spades_scaffolds is truthy, otherwise
      # the contigs.
      expression: |
        ${
          var chosen = inputs.spades_contigs;
          if (inputs.use_spades_scaffolds) {
            chosen = inputs.spades_scaffolds;
          }
          return {'assembly': chosen};
        }

  compress_spades:
    label: SPAdes compressed
    doc: Compress the large Spades assembly output files
    run: ../tools/bash/pigz.cwl
    # Gate: SPAdes enabled and Illumina forward reads supplied to the workflow.
    when: $(inputs.run_spades && inputs.forward_reads !== null && inputs.forward_reads.length !== 0)
    # One pigz job per collected SPAdes output file.
    scatter:
      - inputfile
    scatterMethod: dotproduct
    in:
      # Only consumed by the `when` expression.
      run_spades: run_spades
      forward_reads: illumina_forward_reads

      threads: threads
      # Flatten all SPAdes outputs into one list and drop the missing ones.
      inputfile:
        source:
          - spades/contigs
          - spades/scaffolds
          - spades/assembly_graph
          - spades/assembly_graph_with_scaffolds
          - spades/contigs_before_rr
          - spades/contigs_assembly_paths
          - spades/scaffolds_assembly_paths
        linkMerge: merge_flattened
        pickValue: all_non_null
    out:
      - outfile

#############################################
#### De novo assembly with Flye
  flye:
    label: Flye assembly
    doc: De novo assembly of single-molecule reads with Flye
    run: ../tools/flye/flye.cwl
    # Gate: Flye enabled and at least one non-empty long-read list
    # (Nanopore or PacBio) supplied to the workflow.
    when: $(inputs.run_flye && ((inputs.nanopore_reads !== null && inputs.nanopore_reads.length !== 0) || (inputs.pacbio_reads !== null && inputs.pacbio_reads.length !== 0)))
    in:
      # Only consumed by the `when` expression.
      run_flye: run_flye
      nanopore_reads: nanopore_reads
      pacbio_reads: pacbio_reads

      output_filename_prefix: identifier

      # consider adding:
      # --keep-haplotypes --no-alt-contigs -i 0 (when in metagenome mode)

      # Filtered reads from the long-read quality workflows.
      nano_raw: workflow_quality_nanopore/filtered_reads
      pacbio_raw: workflow_quality_pacbio/filtered_reads

      metagenome: metagenome
      deterministic: flye_deterministic
      genome_size: genome_size
      threads: threads
    out:
      - '00_assembly'
      - '10_consensus'
      - '20_repeat'
      - '30_contigger'
      - '40_polishing'
      - assembly
      - assembly_info
      - flye_log
      - params
      - assembly_graph

#############################################
#### Polishing of Flye assembly with Medaka
  medaka:
    label: Medaka polishing of assembly
    doc: Medaka for (ont reads) polishing of an assembled (flye) genome
    run: ../tools/medaka/medaka_consensus_py.cwl
    # Gate: Medaka and Flye both enabled and a non-empty Nanopore read list
    # supplied to the workflow.
    when: $(inputs.run_medaka && inputs.run_flye && inputs.nanopore_reads !== null && inputs.nanopore_reads.length !== 0)
    in:
      # Only consumed by the `when` expression.
      run_medaka: run_medaka
      run_flye: run_flye
      nanopore_reads: nanopore_reads

      # Flye draft polished with the filtered Nanopore reads.
      draft_assembly: flye/assembly
      reads: workflow_quality_nanopore/filtered_reads
      basecall_model: ont_basecall_model
      threads: threads
    out: [polished_assembly, gaps_in_draft_coords] # probs, calls_to_draft

############################################
#### Workflow PyPolca assembly polishing
  workflow_pypolca:
    label: Run PyPolCA assembly polishing
    doc: PyPolCA polishing of longreads assembly with illumina reads
    # Gate: PyPolCA enabled and quality-controlled Illumina forward reads
    # available (the QC output is null when the Illumina quality step was skipped).
    when: $(inputs.run_pypolca && inputs.forward_reads !== null && inputs.forward_reads.length !== 0)
    run: ../tools/pypolca/pypolca.cwl
    in:
      # Only consumed by the `when` expression above.
      run_pypolca: run_pypolca

      identifier: identifier
      # Polish the first assembly that exists, in this order of preference:
      # Medaka-polished, Flye, SPAdes.
      assembly:
        source:
        - medaka/polished_assembly
        - flye/assembly
        - spades_assembly/assembly
        pickValue: first_non_null

      forward_reads: workflow_quality_illumina/QC_forward_reads
      reverse_reads: workflow_quality_illumina/QC_reverse_reads
      threads: threads
    out: [polished_genome, vcf, report, log, logs_dir]

#############################################
#### Get post process assembly 
  # Selects the single assembly that the mapping and binning steps work on and
  # reports which one was chosen. The step has no `when`, so it always runs.
  get_assembly_to_use:
    label: Assembly choice
    doc: Get assembly choice
    run:
      class: ExpressionTool
      requirements:
        InlineJavascriptRequirement: {}
      inputs:
        # Optional user choice: null or one of the four assembly names.
        assembly_choice_string:
          type: 
            - "null"
            - type: enum
              symbols: ["spades", "medaka", "flye", "pypolca"]
        # Candidate assemblies; each is null when its producing step did not run.
        workflow_pypolca_polished_genome: File?
        medaka_polished_assembly: File?
        flye_assembly: File?
        spades_assembly: File?
        use_spades_scaffolds: boolean
      outputs:
        # Declared non-optional: the expression yields no File when the chosen
        # (or every) candidate is missing.
        # NOTE(review): presumably the run then fails on output validation - confirm.
        assembly: File
        # Chosen assembly name; for SPAdes it is suffixed with "_scaffolds" or "_contigs".
        assembly_choice: string
      # With an explicit choice the matching candidate is returned as-is.
      # Without one, the first non-null candidate wins in the key order of
      # assemblyOutputs: pypolca, medaka, flye, spades.
      expression: |
        ${
          // Define the mapping of user choices to assembly step outputs
          var assemblyOutputs = {
            "pypolca": inputs.workflow_pypolca_polished_genome, 
            "medaka": inputs.medaka_polished_assembly,
            "flye": inputs.flye_assembly,
            "spades": inputs.spades_assembly
          };
          // If the user has made a specific assembly choice, return the corresponding output
          // Otherwise, return the first non-null assembly output
          var assembly_to_use = inputs.assembly_choice_string ? assemblyOutputs[inputs.assembly_choice_string] : Object.values(assemblyOutputs).find(function(file) { return file !== null; });
          var assembly_choice = inputs.assembly_choice_string || Object.keys(assemblyOutputs).find(function(key) { return assemblyOutputs[key] !== null; });
          
          // If the user has chosen to use SPAdes, append "_scaffolds" or "_contigs" to the name
          if (assembly_choice == "spades" && inputs.use_spades_scaffolds) {
            assembly_choice = assembly_choice+"_scaffolds";
          } else if (assembly_choice == "spades" && !inputs.use_spades_scaffolds) {
            assembly_choice = assembly_choice+"_contigs";
          }
          return {
            'assembly': assembly_to_use,
            'assembly_choice': assembly_choice
          };
        }
    in:
      assembly_choice_string: assembly_choice
      workflow_pypolca_polished_genome: workflow_pypolca/polished_genome
      medaka_polished_assembly: medaka/polished_assembly
      flye_assembly: flye/assembly
      
      # spades_contigs and spades_scaffolds are not declared inputs of the
      # ExpressionTool above; they are only read by the valueFrom below.
      spades_contigs: spades/contigs
      spades_scaffolds: spades/scaffolds
      use_spades_scaffolds: use_spades_scaffolds
      # No source: the value is computed from the two SPAdes outputs above.
      spades_assembly:   
        valueFrom: |
          ${return inputs.use_spades_scaffolds ? inputs.spades_scaffolds : inputs.spades_contigs;}
    out:
      [assembly, assembly_choice]

##############################################
#### Illumina read mapping
  assembly_read_mapping_illumina:
    label: Minimap2
    doc: Illumina read mapping using Minimap2 on assembled scaffolds
    run: ../tools/minimap2/minimap2_to_sorted-bam_PE.cwl
    # Gate: quality-controlled Illumina forward reads exist AND either a BAM
    # output was requested or binning is enabled.
    when: |
      ${
         return (inputs.output_bam_file && inputs.forward_reads !== null && inputs.forward_reads.length !== 0) ||
         (inputs.binning && inputs.forward_reads !== null && inputs.forward_reads.length !== 0);
      }
    in:
      # Only consumed by the `when` expression.
      output_bam_file: output_bam_file
      binning: binning

      # The chosen assembly is the mapping reference; its name is appended to
      # the identifier as "<identifier>_<assembly_choice>".
      reference: get_assembly_to_use/assembly
      assembly_choice: get_assembly_to_use/assembly_choice
      identifier:
        source: identifier
        valueFrom: $(self+"_"+inputs.assembly_choice)

      forward_reads: workflow_quality_illumina/QC_forward_reads
      reverse_reads: workflow_quality_illumina/QC_reverse_reads
      # Constant minimap2 preset.
      preset:
        default: "sr"
      threads: threads
    out:
      - sorted_bam
#############################################
#### Reports per contig alignment statistics
  contig_read_counts:
    label: Samtools idxstats
    doc: Reports alignment summary statistics
    run: ../tools/samtools/samtools_idxstats.cwl
    # Gate: Illumina forward reads supplied to the workflow AND either a BAM
    # output was requested or binning is enabled.
    when: |
      ${
         return (inputs.output_bam_file && inputs.forward_reads !== null && inputs.forward_reads.length !== 0) ||
         (inputs.binning && inputs.forward_reads !== null && inputs.forward_reads.length !== 0);
      }
    in:
      # Only consumed by the `when` expression.
      output_bam_file: output_bam_file
      binning: binning
      forward_reads: illumina_forward_reads

      # Identifier becomes "<identifier>_<assembly_choice>".
      assembly_choice: get_assembly_to_use/assembly_choice
      identifier:
        source: identifier
        valueFrom: $(self+"_"+inputs.assembly_choice)

      # Sorted BAM produced by the Illumina read-mapping step.
      bam_file: assembly_read_mapping_illumina/sorted_bam
      threads: threads
    out:
      - contigReadCounts

#############################################
#### Binning workflow
  workflow_binning:
    label: Binning workflow
    doc: Binning workflow to create bins
    run: workflow_metagenomics_binning.cwl
    # Gate: binning enabled and Illumina forward reads supplied to the workflow.
    when: $(inputs.binning && inputs.forward_reads !== null && inputs.forward_reads.length !== 0)
    in:
      # Only consumed by the `when` expression.
      binning: binning
      forward_reads: illumina_forward_reads
      # run_GEM: run_GEM

      identifier: identifier
      threads: threads

      # Chosen assembly, its name and the Illumina reads mapped onto it.
      assembly: get_assembly_to_use/assembly
      assembly_choice: get_assembly_to_use/assembly_choice
      bam_file: assembly_read_mapping_illumina/sorted_bam

      # Binner selection and reference data.
      run_maxbin2: run_maxbin2
      run_semibin2: run_semibin2
      semibin2_environment: semibin2_environment
      busco_data: busco_data
      gtdbtk_data: gtdbtk_data

      # run_binspreader: run_binspreader
      # assembly_graph: 
      #   # assembly_choice is one of pypolca, medaka will use flye. Otherwise use spades.
      #   source:
      #     - flye/assembly_graph
      #     - spades/assembly_graph_with_scaffolds
      #   linkMerge: merge_flattened
      #   pickValue: first_non_null

      # Bin annotation options.
      annotate_bins: annotate_bins
      annotate_unbinned: annotate_unbinned
      bakta_db: bakta_db
      skip_bakta_crispr: skip_bakta_crispr
      run_interproscan: run_interproscan
      interproscan_directory: interproscan_directory
      interproscan_applications: interproscan_applications
      run_eggnog: run_eggnog
      eggnog_dbs: eggnog_dbs
      run_kofamscan: run_kofamscan
      kofamscan_limit_sapp: kofamscan_limit_sapp

    out:
      - bins
      - binette_output
      - maxbin2_output
      - semibin_output
      - metabat2_output
      - gtdbtk_output
      - busco_output
      - bins_summary_table
      - bins_read_stats
      - eukrep_fasta
      - eukrep_stats_file
      - annotation_output
#############################################
#### GEM workflow
  # workflow_GEM:
  #   label: GEM workflow
  #   doc: CarveMe community genomescale metabolic models workflow from bins
  #   when: $(inputs.binning && inputs.run_GEM)
  #   run: workflow_metagenomics_GEM.cwl
  #   in:
  #     binning: binning
  #     run_smetana: run_smetana
  #     run_GEM: run_GEM
  #     smetana_solver: smetana_solver
  #     memote_solver: memote_solver
  #     carveme_solver: carveme_solver
  #     mediadb: mediadb
  #     gapfill: gapfill
  #     identifier: identifier
  #     bins: workflow_binning/bins
  #     threads: threads
  #   out: [carveme_gems_folder,protein_fasta_folder,memote_folder,smetana_output,gemstats_out]

#############################################

# OUTPUT FOLDER PREPARATION #

#############################################
#### Filtered reads output folder (reads + reports)
  keep_readfilter_files_to_folder:
    # Variant that bundles the filtered reads together with the QC report
    # folders; only runs when keep_filtered_reads is set.
    label: Read filtering output folder
    doc: Preparation of read filtering output files to a specific output folder
    run: ../tools/expressions/files_to_folder.cwl
    when: $(inputs.keep_filtered_reads)
    in:
      # Referenced by the `when` expression above.
      keep_filtered_reads: keep_filtered_reads

      # Null sources are removed by pickValue before the lists are passed on.
      files:
        source:
          - workflow_quality_illumina/QC_forward_reads
          - workflow_quality_illumina/QC_reverse_reads
          - workflow_quality_nanopore/filtered_reads
          - workflow_quality_pacbio/filtered_reads
        linkMerge: merge_flattened
        pickValue: all_non_null
      folders:
        source:
          - workflow_quality_illumina/reports_folder
          - workflow_quality_nanopore/reports_folder
          - workflow_quality_pacbio/reports_folder
        linkMerge: merge_flattened
        pickValue: all_non_null
      destination:
        valueFrom: $("read_filtering_and_classification")
    out:
      - results
#############################################
#### Filtered reads output folder (only reports)
  readfilter_files_to_folder:
    # Reports-only variant: runs when keep_filtered_reads equals false and
    # passes only the QC report folders (no read files).
    label: Read filtering output folder
    doc: Preparation of read filtering reports specific output folder
    run: ../tools/expressions/files_to_folder.cwl
    when: $(inputs.keep_filtered_reads == false)
    in:
      # Referenced by the `when` expression above.
      keep_filtered_reads: keep_filtered_reads

      # Null sources are removed by pickValue before the list is passed on.
      folders:
        source:
          - workflow_quality_illumina/reports_folder
          - workflow_quality_nanopore/reports_folder
          - workflow_quality_pacbio/reports_folder
        linkMerge: merge_flattened
        pickValue: all_non_null
      destination:
        valueFrom: $("read_filtering_and_classification")
    out:
      - results

#############################################
#### SPAdes Output folder
  spades_files_to_folder:
    # Bundles the compressed SPAdes output plus SPAdes params/log/config files
    # under the destination "spades_assembly"; only runs when run_spades is set.
    label: SPADES output to folder
    doc: Preparation of SPAdes output files to a specific output folder
    run: ../tools/expressions/files_to_folder.cwl
    when: $(inputs.run_spades)
    in:
      # Referenced by the `when` expression above.
      run_spades: run_spades

      files:
        source:
          - compress_spades/outfile
          - spades/params
          - spades/log
          - spades/internal_config
          - spades/internal_dataset
        linkMerge: merge_flattened
        pickValue: all_non_null
      destination:
        valueFrom: $("spades_assembly")
    out:
      - results
#############################################
#### Flye output folder
  flye_files_to_folder:
    # Bundles the Flye step outputs under the destination "flye_assembly";
    # only runs when run_flye is set.
    label: Flye output folder
    doc: Preparation of Flye output files to a specific output folder
    run: ../tools/expressions/files_to_folder.cwl
    when: $(inputs.run_flye)
    in:
      # Referenced by the `when` expression above.
      run_flye: run_flye

      files:
        source:
          - flye/assembly
          - flye/assembly_info
          - flye/flye_log
          - flye/params
          - flye/assembly_graph
        linkMerge: merge_flattened
        pickValue: all_non_null
      # folders:
        # source: [workflow_flye/00_assembly, workflow_flye/10_consensus, workflow_flye/20_repeat, workflow_flye/30_contigger, workflow_flye/40_polishing]
        # linkMerge: merge_flattened
      destination:
        valueFrom: $("flye_assembly")
    out:
      - results

#############################################
#### Medaka output folder
  medaka_files_to_folder:
    # Bundles the Medaka step outputs under the destination
    # "medaka_assembly_polishing"; only runs when run_medaka is set and
    # nanopore_reads is present (not null and not an empty list).
    label: Medaka output folder
    doc: Preparation of Medaka output files to a specific output folder
    run: ../tools/expressions/files_to_folder.cwl
    when: $(inputs.run_medaka && inputs.nanopore_reads !== null && inputs.nanopore_reads.length !== 0)
    in:
      # Referenced by the `when` expression above.
      run_medaka: run_medaka
      nanopore_reads: nanopore_reads

      files:
        source:
          - medaka/polished_assembly
          - medaka/gaps_in_draft_coords
          # , workflow_medaka/probs, workflow_medaka/calls_to_draft
        linkMerge: merge_flattened
        pickValue: all_non_null
      destination:
        valueFrom: $("medaka_assembly_polishing")
    out:
      - results

#############################################
#### PyPolca files to folder
  pypolca_files_to_folder:
    # Bundles the PyPolCA sub-workflow outputs under the destination
    # "pypolca_polished_assembly"; only runs when run_pypolca is set and
    # Illumina forward reads are present (not null and not an empty list).
    label: PyPolca output folder
    doc: Preparation of PyPolCA output files to a specific output folder
    run: ../tools/expressions/files_to_folder.cwl
    when: $(inputs.run_pypolca && inputs.forward_reads !== null && inputs.forward_reads.length !== 0)
    in:
      # Referenced by the `when` expression above.
      run_pypolca: run_pypolca
      forward_reads: illumina_forward_reads

      files:
        source:
          - workflow_pypolca/polished_genome
          - workflow_pypolca/vcf
          - workflow_pypolca/report
          - workflow_pypolca/log
        linkMerge: merge_flattened
        pickValue: all_non_null
      folders:
        # Kept as a one-element list so linkMerge/pickValue still apply.
        source:
          - workflow_pypolca/logs_dir
        linkMerge: merge_flattened
        pickValue: all_non_null
      destination:
        valueFrom: $("pypolca_polished_assembly")
    out:
      - results

#############################################
#### BAM file pass-through and combined assembly steps output folder
  output_bamfile:
    label: Output bam file
    doc: Step needed to output bam file because there is an option to.
    # Pass-through gate: forwards the Illumina read-mapping BAM to the assembly
    # output folder only when the user asked for it via output_bam_file.
    # The condition intentionally does NOT include `binning`: the mapping step
    # also runs for binning, and gating on it here would emit the BAM even with
    # output_bam_file set to false, making the option ineffective.
    # The forward_reads check is kept because bam_in is a required File and the
    # mapping step yields null when no Illumina reads are supplied.
    when: $(inputs.output_bam_file && inputs.forward_reads !== null && inputs.forward_reads.length !== 0)
    run:
      class: ExpressionTool
      requirements:
        InlineJavascriptRequirement: {}
      inputs:
        bam_in: File
      outputs:
        bam_out: File
      # Identity expression: returns the input file unchanged.
      expression: |
       ${ return {'bam_out': inputs.bam_in}; }
    in:
      # Referenced by the `when` expression above.
      output_bam_file: output_bam_file
      forward_reads: illumina_forward_reads
      bam_in: assembly_read_mapping_illumina/sorted_bam
    out: 
      [bam_out]

  assembly_files_to_folder:
    # Builds the combined "assembly" folder from the per-tool output folders
    # plus the optional BAM file and contig read counts. This step has no
    # `when`; null sources are removed by pickValue all_non_null.
    doc: Preparation of the combined assembly step outputs to a specific output folder
    label: Assembly output folder
    run: ../tools/expressions/files_to_folder.cwl
    in:
      files:
        source:
          - output_bamfile/bam_out
          - contig_read_counts/contigReadCounts
        linkMerge: merge_flattened
        pickValue: all_non_null
      folders:
        source:
          - spades_files_to_folder/results
          - flye_files_to_folder/results
          - medaka_files_to_folder/results
          - pypolca_files_to_folder/results
        linkMerge: merge_flattened
        pickValue: all_non_null
      destination:
        valueFrom: $("assembly")
    out:
      [results]
#############################################
#### Binning output folder
  binning_files_to_folder:
    # Bundles the workflow_binning outputs under the destination "binning";
    # runs whenever binning is set.
    # NOTE(review): workflow_binning additionally requires non-empty Illumina
    # forward reads, so with binning set and no Illumina reads every source
    # below is null - confirm files_to_folder copes with empty inputs.
    label: Binning output to folder
    doc: Preparation of binning output files and folders to a specific output folder
    run: ../tools/expressions/files_to_folder.cwl
    when: $(inputs.binning)
    in:
      # Referenced by the `when` expression above.
      binning: binning

      folders:
        source:
          - workflow_binning/binette_output
          - workflow_binning/metabat2_output
          - workflow_binning/maxbin2_output
          - workflow_binning/semibin_output
          - workflow_binning/gtdbtk_output
          - workflow_binning/busco_output
          - workflow_binning/annotation_output
        linkMerge: merge_flattened
        pickValue: all_non_null
      files:
        source:
          - workflow_binning/bins_read_stats
          - workflow_binning/bins_summary_table
          - workflow_binning/eukrep_fasta
          - workflow_binning/eukrep_stats_file
        linkMerge: merge_flattened
        pickValue: all_non_null
      destination:
        valueFrom: $("binning")
    out:
      - results
#############################################
#### GEMs output folder
  # GEM_files_to_folder:
  #   doc: Preparation of GEM workflow output files and folders to a specific output folder
  #   label: GEM workflow output to folder
  #   when: $(inputs.binning && inputs.run_GEM)
  #   run: ../tools/expressions/files_to_folder.cwl
  #   in:
  #     binning: binning

  #     run_GEM: run_GEM

  #     folders:
  #       source: [workflow_GEM/carveme_gems_folder, workflow_GEM/protein_fasta_folder, workflow_GEM/memote_folder]
  #       linkMerge: merge_flattened
  #       pickValue: all_non_null
  #     files:
  #       source: [workflow_GEM/smetana_output, workflow_GEM/gemstats_out]
  #       linkMerge: merge_flattened
  #       pickValue: all_non_null
  #     destination:
  #       valueFrom: $("metaGEM")
  #   out:
  #     [results]
#############################################

# Workflow metadata using schema.org terms; the `s:` prefix is bound in
# $namespaces at the end of this block.
s:author:
  - class: s:Person
    s:name: Jasper Koehorst
    s:email: mailto:jasper.koehorst@wur.nl
    s:identifier: https://orcid.org/0000-0001-8172-8981
  - class: s:Person
    s:name: Bart Nijsse
    s:email: mailto:bart.nijsse@wur.nl
    s:identifier: https://orcid.org/0000-0001-9524-5964
  - class: s:Person
    s:name: Changlin Ke
    s:email: mailto:changlin.ke@wur.nl
    s:identifier: https://orcid.org/0009-0001-1350-5644

# Provenance and licensing.
s:codeRepository: https://gitlab.com/m-unlock/cwl
s:citation: https://m-unlock.nl
s:copyrightHolder: "UNLOCK - Unlocking Microbial Potential"
s:license: https://spdx.org/licenses/Apache-2.0
# Dates are quoted so they stay strings rather than being parsed as dates.
s:dateCreated: "2025-05-05"
s:dateModified: "2025-08-04"

$namespaces:
  s: https://schema.org/