task PreprocessIntervals { File intervals File ref_fasta File ref_fasta_fai File ref_fasta_dict # Determine output filename String filename = select_first([intervals, "wgs"]) String base_filename = basename(filename, ".interval_list") command <<< set -e gatk PreprocessIntervals \ ${"-L " + intervals} \ --sequence-dictionary ${ref_fasta_dict} \ --reference ${ref_fasta} \ --padding 0 \ --bin-length 100000 \ --interval-merging-rule OVERLAPPING_ONLY \ --output ${base_filename}.preprocessed.interval_list >>> output { File preprocessed_intervals = "${base_filename}.preprocessed.interval_list" } } task AnnotateIntervals { File intervals File ref_fasta File ref_fasta_fai File ref_fasta_dict command <<< set -e gatk AnnotateIntervals \ -L ${intervals} \ --reference ${ref_fasta} \ --feature-query-lookahead 1000000 \ --interval-merging-rule OVERLAPPING_ONLY \ --output annotated_intervals.tsv >>> output { File annotated_intervals = "annotated_intervals.tsv" } } task CollectCounts { File intervals File bam File bam_idx File ref_fasta File ref_fasta_fai File ref_fasta_dict # Sample name is derived from the bam filename String base_filename = basename(bam, ".bam") String counts_filename = "${base_filename}.counts.hdf5" command <<< set -e gatk CollectReadCounts \ -L ${intervals} \ --input ${bam} \ --reference ${ref_fasta} \ --format HDF5 \ --interval-merging-rule OVERLAPPING_ONLY \ --output ${counts_filename} >>> output { String entity_id = base_filename File counts = counts_filename } } task CollectAllelicCounts { File common_sites File bam File bam_idx File ref_fasta File ref_fasta_fai File ref_fasta_dict # Sample name is derived from the bam filename String base_filename = basename(bam, ".bam") String allelic_counts_filename = "${base_filename}.allelicCounts.tsv" command <<< set -e gatk CollectAllelicCounts \ -L ${common_sites} \ --input ${bam} \ --reference ${ref_fasta} \ --minimum-base-quality 20 \ --output ${allelic_counts_filename} >>> output { String entity_id = base_filename File allelic_counts = allelic_counts_filename } } task ScatterIntervals { File interval_list Int num_intervals_per_scatter String base_filename = basename(interval_list, ".interval_list") command <<< set -e grep @ ${interval_list} > header.txt grep -v @ ${interval_list} > all_intervals.txt split -l ${num_intervals_per_scatter} --numeric-suffixes all_intervals.txt ${base_filename}.scattered. for i in ${base_filename}.scattered.*; do cat header.txt $i > $i.interval_list; done >>> output { Array[File] scattered_interval_lists = glob("${base_filename}.scattered.*.interval_list") } } task PostprocessGermlineCNVCalls { String entity_id Array[File] gcnv_calls_tars Array[File] gcnv_model_tars File contig_ploidy_calls_tar Array[String] allosomal_contigs Int ref_copy_number_autosomal_contigs Int sample_index String genotyped_intervals_vcf_filename = "genotyped-intervals-${entity_id}.vcf.gz" String genotyped_segments_vcf_filename = "genotyped-segments-${entity_id}.vcf.gz" Boolean allosomal_contigs_specified = defined(allosomal_contigs) && length(select_first([allosomal_contigs, []])) > 0 String dollar = "$" #WDL workaround for using array[@], see https://github.com/broadinstitute/cromwell/issues/1819 command <<< set -e # untar calls to CALLS_0, CALLS_1, etc directories and build the command line gcnv_calls_tar_array=(${sep=" " gcnv_calls_tars}) calls_args="" for index in ${dollar}{!gcnv_calls_tar_array[@]}; do gcnv_calls_tar=${dollar}{gcnv_calls_tar_array[$index]} mkdir CALLS_$index tar xzf $gcnv_calls_tar -C CALLS_$index calls_args="$calls_args --calls-shard-path CALLS_$index" done # untar models to MODEL_0, MODEL_1, etc directories and build the command line gcnv_model_tar_array=(${sep=" " gcnv_model_tars}) model_args="" for index in ${dollar}{!gcnv_model_tar_array[@]}; do gcnv_model_tar=${dollar}{gcnv_model_tar_array[$index]} mkdir MODEL_$index tar xzf $gcnv_model_tar -C MODEL_$index model_args="$model_args --model-shard-path MODEL_$index" done mkdir extracted-contig-ploidy-calls tar xzf ${contig_ploidy_calls_tar} -C extracted-contig-ploidy-calls allosomal_contigs_args="--allosomal-contig ${sep=" --allosomal-contig " allosomal_contigs}" gatk PostprocessGermlineCNVCalls \ $calls_args \ $model_args \ ${true="$allosomal_contigs_args" false="" allosomal_contigs_specified} \ --autosomal-ref-copy-number ${ref_copy_number_autosomal_contigs} \ --contig-ploidy-calls extracted-contig-ploidy-calls \ --sample-index ${sample_index} \ --output-genotyped-intervals ${genotyped_intervals_vcf_filename} \ --output-genotyped-segments ${genotyped_segments_vcf_filename} >>> output { File genotyped_intervals_vcf = genotyped_intervals_vcf_filename File genotyped_segments_vcf = genotyped_segments_vcf_filename } }