Configuration File
To download a configuration file template, run the pipeline with the --get_config
parameter. Using a config file makes your command line a lot cleaner and more concise.
# get config
nextflow run fmalmeida/mpgap --get_config
# run with config
nextflow run fmalmeida/mpgap -c [path-to-config]
Default configuration
/*
* Configuration File to run fmalmeida/mpgap pipeline.
*/
params {
/*
* Input parameter
*/
// Path to YAML samplesheet file.
// Please read the documentation https://mpgap.readthedocs.io/en/latest/samplesheet.html to know how to create a samplesheet file.
input = null
// What is the organism type of the input? Bacteria / Eukaryote / Fungus
// This changes the parameters for gene detection and genome assessment for Quast and BUSCO.
organism = 'bacteria'
/*
* Output parameters
*/
// Output folder name
output = "output"
tracedir = "${params.output}/pipeline_info"
/*
* Resources parameters
*/
// Memory allocation for pilon polish.
// Values in Gb. Default 50G. 50G has proven to be enough in most cases.
// This setting is crucial because, without enough memory, pilon will crash and will not correct your assembly.
pilon_memory_limit = 50
pilon_polish_rounds = 4 // how many rounds should be performed?
/*
* General parameters
*
* These parameters will set the default for all samples.
* However, they can also be set inside the YAML. If this happens,
* the pipeline will use the value inside the YAML to overwrite
* the parameter for that specific sample.
*
* Please read the documentation https://mpgap.readthedocs.io/en/latest/samplesheet.html to know more about the samplesheet file.
*/
// This parameter only needs to be set if the software chosen is Canu, wtdbg2 or Haslr. It is optional for Flye.
// It is an estimate of the size of the genome. Common suffixes are allowed, for example, 3.7m or 2.8g
genome_size = null
// Select the appropriate value to pass to wtdbg2 to assemble input.
// Options are: "ont" for Nanopore reads, "rs" for PacBio RSII, "sq" for PacBio Sequel, "ccs" for PacBio CCS reads.
// By default, if not given, the pipeline will use the value "ont" if nanopore reads are used and "sq" if pacbio reads are used
wtdbg2_technology = null
// Select the appropriate shasta config to use for assembly
// Since shasta v0.8 (Oct/2021) this parameter is mandatory.
// You can check availability at: https://paoloshasta.github.io/shasta/Configurations.html
shasta_config = "Nanopore-Oct2021"
// Tells the pipeline to interpret the long reads as "corrected" long reads.
// This will activate (if available) the options for corrected or even high
// quality (hq) reads in the assemblers.
// Be cautious when using this parameter. If your reads are not corrected|hq, and
// you use this parameter, you will probably not generate any contigs.
corrected_longreads = false
high_quality_longreads = false
// This parameter below (hybrid_strategy) is to select the hybrid strategies adopted by the pipeline.
// Read the documentation https://mpgap.readthedocs.io/en/latest/manual.html to know more about the hybrid strategies.
//
// Whenever using this parameter, it is also possible to polish the longreads-only assemblies with Nanopolish,
// Medaka or VariantCaller (Arrow) before the polishing with shortreads (using Pilon). For that it is necessary to set
// the right parameters: pacbio_bam and nanopolish_fast5 (files given only inside YAML) or medaka_model.
hybrid_strategy = 1
// Default medaka model used for polishing nanopore long reads assemblies.
// Please read their manual https://github.com/nanoporetech/medaka to know more about the available models.
medaka_model = "r941_min_high_g360"
// This parameter sets the max number of haplotypes to be considered by nanopolish.
// Sometimes the pipeline may crash because too much variation was found, exceeding the limit
nanopolish_max_haplotypes = 1000
// BUSCO dataset. The pipeline runs BUSCO after assemblies and the user can select one of the
// available BUSCO datasets listed in their website: https://busco.ezlab.org/busco_userguide.html#running-busco
//
// If blank, bacteria_odb10 will be used
//
// If unsure you can set the param to 'auto' which will tell BUSCO to automatically select the most
// appropriate one (it takes a little more time and space).
busco_lineage = null
/*
* Advanced parameters
*
* Controlling the execution of assemblers and other tools.
* It must be set as true to skip the software and false to use it.
* It is also possible to pass additional parameters to them.
* Additional parameters must be in quotes and separated by spaces.
*/
quast_additional_parameters = null // Give additional parameters to Quast while assessing assembly metrics.
// Must be given as shown in Quast manual. E.g. " --large --eukaryote ".
skip_raw_assemblies_polishing = false // This will make the pipeline not polish raw assemblies on hybrid strategy 2.
// For example, if a sample is assembled with flye and polished with medaka,
// by default, both assemblies will be passed to pilon so you can compare them.
// If you don't need this comparison and don't want to polish the raw assembly,
// use this parameter.
skip_spades = false // Hybrid and shortreads only assemblies
spades_additional_parameters = null // Must be given as shown in Spades manual. E.g. " --meta --plasmids "
skip_shovill = false // Paired shortreads only assemblies
shovill_additional_parameters = null // Must be given as shown in Shovill manual. E.g. " --depth 15 "
// The pipeline already executes shovill with spades, skesa and megahit, so please, do not use it with shovill's ``--assembler`` parameter.
skip_unicycler = false // Hybrid and shortreads only assemblies
unicycler_additional_parameters = null // Must be given as shown in Unicycler manual. E.g. " --mode conservative --no_correct "
skip_megahit = false // Shortreads only assemblies
megahit_additional_parameters = null // Must be given as shown in Megahit manual. E.g. " --presets meta-large "
skip_haslr = false // Hybrid assemblies
haslr_additional_parameters = null // Must be given as shown in Haslr manual. E.g. " --cov-lr 30 "
skip_canu = false // Longreads only assemblies
canu_additional_parameters = null // Must be given as shown in Canu manual. E.g. " correctedErrorRate=0.075 corOutCoverage=200 "
skip_flye = false // Longreads only assemblies
flye_additional_parameters = null // Must be given as shown in Flye manual. E.g. " --meta --iterations 4 "
skip_raven = false // Longreads only assemblies
raven_additional_parameters = null // Must be given as shown in Raven manual. E.g. " --polishing-rounds 4 "
skip_wtdbg2 = false // Longreads only assemblies
wtdbg2_additional_parameters = null // Must be given as shown in wtdbg2 manual. E.g. " --tidy-reads 5000 "
skip_shasta = false // Nanopore longreads only assemblies
shasta_additional_parameters = null // Must be given as shown in shasta manual. E.g. " --Reads.minReadLength 5000 "
skip_hifiasm = false // Longreads only assemblies
hifiasm_additional_parameters = null // Must be given as shown in Hifiasm manual. E.g. " --ul ul.fq.gz "
skip_pilon = false // Skip pilon polisher when performing hybrid assembly strategy 2
skip_polypolish = false // Skip polypolish polisher when performing hybrid assembly strategy 2
/*
* Resources controlling parameters
*
* Here are some parameters that allow the user to better tune the resources used by the pipeline.
*
* The start_asm_{mem,cpus} parameters tell the pipeline how much memory and how many cpus the assembly
* modules and quast should request in the first try. This is essential for bigger genomes in order
* to avoid failing the first try due to lack of memory and then running again (automatically)
* using all the max values allowed with the max_{memory,cpus} parameters.
*
* The max_memory and max_cpus parameters tell the pipeline the maximum amount of
* these items that is allowed per job. The pipeline starts by requesting less mem&cpus than
* what is defined by these params, and, in case the first try fails, it then maxes out the job
* to use the maximum number you allowed.
*
* The max_time parameter defines how long a single job is allowed to run.
*/
// starting values for the assembly jobs (and quast) to ask for in the very first try
start_asm_mem = 20.GB
start_asm_cpus = 6
// maximum values to be used on automatic second try in case of lack of memory (all jobs)
max_memory = 40.GB
max_cpus = 10
max_time = '40.h'
}