workflow RBCWorkflow { File uBAM File ref_dict File ref_fasta File ref_fasta_extracted File ref_fasta_index File ref_alt File ref_sa File ref_amb File ref_bwt File ref_ann File ref_pac File hapmap File hapmap_idx File mills File mills_idx File dbsnp_vcf File dbsnp_vcf_index File calling_interval_list File target_interval_list File bait_interval_list Int haplotype_scatter_count Int break_bands_at_multiples_of Array[File] known_indels_sites_vcfs Array[File] known_indels_sites_indices String batch_base_file_name call GetBwaVersion call CollectQualityYieldMetrics { input: input_bam = uBAM, metrics_filename = batch_base_file_name + ".unmapped.quality_yield_metrics", } call SamToFastqAndBwaMemAndMba { input: input_bam = uBAM, bwa_version = GetBwaVersion.bwa_version, output_bam_basename = batch_base_file_name + ".aligned.unsorted", ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, ref_alt = ref_alt, ref_sa = ref_sa, ref_amb = ref_amb, ref_bwt = ref_bwt, ref_ann = ref_ann, ref_pac = ref_pac } call CollectUnsortedReadgroupBamQualityMetrics { input: input_bam = SamToFastqAndBwaMemAndMba.output_aligned_bam, output_bam_prefix = batch_base_file_name + ".readgroup" } call MarkDuplicates { input: input_bams = SamToFastqAndBwaMemAndMba.output_aligned_bam, output_bam_basename = batch_base_file_name + ".aligned.unsorted.duplicates_marked", metrics_filename = batch_base_file_name + ".duplicate_metrics" } call SortSam { input: input_bam = MarkDuplicates.output_bam, output_bam_basename = batch_base_file_name + ".aligned.duplicate_marked.sorted" } call indel_realignment { input: input_bam = SortSam.output_bam, input_bam_index = SortSam.output_bam_index, known_indels_sites_vcfs = known_indels_sites_vcfs, known_indels_sites_indices = known_indels_sites_indices, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, output_bam_basename = batch_base_file_name + ".indel_realigned" } call CreateSequenceGroupingTSV { input: ref_dict = ref_dict } Int num_of_bqsr_scatters = length(CreateSequenceGroupingTSV.sequence_grouping) Int potential_bqsr_divisor = num_of_bqsr_scatters - 10 Int bqsr_divisor = if potential_bqsr_divisor > 1 then potential_bqsr_divisor else 1 scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { call BaseRecalibrator { input: input_bam = indel_realignment.output_bam, input_bam_index = indel_realignment.output_bam_index, recalibration_report_filename = batch_base_file_name + ".recal_data.csv", sequence_group_interval = subgroup, dbsnp_vcf = dbsnp_vcf, dbsnp_vcf_index = dbsnp_vcf_index, known_indels_sites_vcfs = known_indels_sites_vcfs, known_indels_sites_indices = known_indels_sites_indices, ref_dict = ref_dict, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, bqsr_scatter = bqsr_divisor } } call GatherBqsrReports { input: input_bqsr_reports = BaseRecalibrator.recalibration_report, output_report_filename = batch_base_file_name + ".recal_data.csv" } scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping_with_unmapped) { call ApplyBQSR { input: input_bam = indel_realignment.output_bam, input_bam_index = indel_realignment.output_bam_index, output_bam_basename = batch_base_file_name, recalibration_report = GatherBqsrReports.output_bqsr_report, sequence_group_interval = subgroup, ref_dict = ref_dict, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, bqsr_scatter = bqsr_divisor } } call GatherSortedBamFiles { input: input_bams = ApplyBQSR.recalibrated_bam, output_bam_basename = batch_base_file_name } call CollectReadgroupBamQualityMetrics { input: input_bam = GatherSortedBamFiles.output_bam, input_bam_index = GatherSortedBamFiles.output_bam_index, output_bam_prefix = batch_base_file_name + ".readgroup", ref_dict = ref_dict, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index } call CollectAggregationMetrics { input: input_bam = GatherSortedBamFiles.output_bam, input_bam_index = GatherSortedBamFiles.output_bam_index, output_bam_prefix = batch_base_file_name, ref_dict = ref_dict, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index } call CalculateReadGroupChecksum { input: input_bam = GatherSortedBamFiles.output_bam, input_bam_index = GatherSortedBamFiles.output_bam_index, read_group_md5_filename = batch_base_file_name + ".bam.read_group_md5" } call ScatterIntervalList { input: interval_list = calling_interval_list, scatter_count = haplotype_scatter_count, break_bands_at_multiples_of = break_bands_at_multiples_of } Int potential_hc_divisor = ScatterIntervalList.interval_count - 20 Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 scatter (index in range(ScatterIntervalList.interval_count)) { call HaplotypeCaller { input: input_bam = GatherSortedBamFiles.output_bam, input_bam_index = GatherSortedBamFiles.output_bam_index, interval_list = ScatterIntervalList.out[index], vcf_basename = batch_base_file_name, ref_dict = ref_dict, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, hc_scatter = hc_divisor, make_gvcf = true } call iHaplotypeCaller { input: input_bam = GatherSortedBamFiles.output_bam, input_bam_index = GatherSortedBamFiles.output_bam_index, interval_list = ScatterIntervalList.out[index], vcf_basename = batch_base_file_name + ".individual", ref_dict = ref_dict, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, hc_scatter = hc_divisor, make_gvcf = false } } call MergeVCFs { input: input_vcfs = HaplotypeCaller.output_vcf, input_vcfs_indexes = HaplotypeCaller.output_vcf_index, output_vcf_name = batch_base_file_name + ".g.vcf.gz" } call iMergeVCFs { input: input_vcfs = iHaplotypeCaller.output_vcf, input_vcfs_indexes = iHaplotypeCaller.output_vcf_index, output_vcf_name = batch_base_file_name + ".individual.vcf.gz" } call ValidateGVCF { input: input_vcf = MergeVCFs.output_vcf, input_vcf_index = MergeVCFs.output_vcf_index, dbsnp_vcf = dbsnp_vcf, dbsnp_vcf_index = dbsnp_vcf_index, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, calling_interval_list = calling_interval_list } call iValidateGVCF { input: input_vcf = iMergeVCFs.output_vcf, input_vcf_index = iMergeVCFs.output_vcf_index, dbsnp_vcf = dbsnp_vcf, dbsnp_vcf_index = dbsnp_vcf_index, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, calling_interval_list = calling_interval_list } call CollectGvcfCallingMetrics { input: input_vcf = MergeVCFs.output_vcf, input_vcf_index = MergeVCFs.output_vcf_index, metrics_basename = batch_base_file_name, dbsnp_vcf = dbsnp_vcf, dbsnp_vcf_index = dbsnp_vcf_index, ref_dict = ref_dict, evaluation_interval_list = calling_interval_list } call iCollectGvcfCallingMetrics { input: input_vcf = iMergeVCFs.output_vcf, input_vcf_index = iMergeVCFs.output_vcf_index, metrics_basename = batch_base_file_name + ".individual", dbsnp_vcf = dbsnp_vcf, dbsnp_vcf_index = dbsnp_vcf_index, ref_dict = ref_dict, evaluation_interval_list = calling_interval_list } call CollectHsMetrics { input: input_bam = GatherSortedBamFiles.output_bam, input_bam_index = GatherSortedBamFiles.output_bam_index, metrics_filename = batch_base_file_name + ".hybrid_selection_metrics", ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, target_interval_list = target_interval_list, bait_interval_list = bait_interval_list } call Hardfiltering { input: input_vcf = iMergeVCFs.output_vcf, input_vcf_index = iMergeVCFs.output_vcf_index, ref_fasta = ref_fasta_extracted, ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, vcf_basename = batch_base_file_name } output { File quality_yield_metrics = CollectQualityYieldMetrics.quality_yield_metrics File SamToFastqAndBwaMemAndMba_bam = SamToFastqAndBwaMemAndMba.output_aligned_bam File SamToFastqAndBwaMemAndMba_stderr = SamToFastqAndBwaMemAndMba.bwa_stderr_log File CollectUnsortedReadgroupBamQualityMetrics_base_distribution_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_pdf File CollectUnsortedReadgroupBamQualityMetrics_base_distribution_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.base_distribution_by_cycle_metrics File CollectUnsortedReadgroupBamQualityMetrics_insert_size_histogram_pdf = CollectUnsortedReadgroupBamQualityMetrics.insert_size_histogram_pdf File CollectUnsortedReadgroupBamQualityMetrics_insert_size_metrics = CollectUnsortedReadgroupBamQualityMetrics.insert_size_metrics File CollectUnsortedReadgroupBamQualityMetrics_quality_by_cycle_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_pdf File CollectUnsortedReadgroupBamQualityMetrics_quality_by_cycle_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_by_cycle_metrics File CollectUnsortedReadgroupBamQualityMetrics_quality_distribution_pdf = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_pdf File CollectUnsortedReadgroupBamQualityMetrics_quality_distribution_metrics = CollectUnsortedReadgroupBamQualityMetrics.quality_distribution_metrics File MarkDuplicates_metrics = MarkDuplicates.duplicate_metrics File BQSR_reports = GatherBqsrReports.output_bqsr_report File final_output_bam = GatherSortedBamFiles.output_bam File final_output_bam_index = GatherSortedBamFiles.output_bam_index File aQC_aggregation_alignment_summary_metrics = CollectAggregationMetrics.alignment_summary_metrics File aQC_aggregation_bait_bias_detail_metrics = CollectAggregationMetrics.bait_bias_detail_metrics File aQC_aggregation_bait_bias_summary_metrics = CollectAggregationMetrics.bait_bias_summary_metrics File aQC_aggregation_gc_bias_detail_metrics = CollectAggregationMetrics.gc_bias_detail_metrics File aQC_aggregation_gc_bias_pdf = CollectAggregationMetrics.gc_bias_pdf File aQC_aggregation_gc_bias_summary_metrics = CollectAggregationMetrics.gc_bias_summary_metrics File aQC_aggregation_insert_size_histogram_pdf = CollectAggregationMetrics.insert_size_histogram_pdf File aQC_aggregation_insert_size_metrics = CollectAggregationMetrics.insert_size_metrics File aQC_aggregation_pre_adapter_detail_metrics = CollectAggregationMetrics.pre_adapter_detail_metrics File aQC_aggregation_pre_adapter_summary_metrics = CollectAggregationMetrics.pre_adapter_summary_metrics File aQC_aggregation_quality_distribution_pdf = CollectAggregationMetrics.quality_distribution_pdf File aQC_aggregation_quality_distribution_metrics = CollectAggregationMetrics.quality_distribution_metrics File aQC_aggregation_error_summary_metrics = CollectAggregationMetrics.error_summary_metrics File aQC_RG_CheckSum_md5_file = CalculateReadGroupChecksum.md5_file File HaplotypeCalling_gVCF_output_vcf = MergeVCFs.output_vcf File HaplotypeCalling_gVCF_output_vcf_index = MergeVCFs.output_vcf_index File gvcf_summary_metrics = CollectGvcfCallingMetrics.summary_metrics File gvcf_detail_metrics = CollectGvcfCallingMetrics.detail_metrics File hybrid_selection_metrics = CollectHsMetrics.metrics File iHaplotypeCalling_gVCF_output_vcf = iMergeVCFs.output_vcf File iHaplotypeCalling_gVCF_output_vcf_index = iMergeVCFs.output_vcf_index File ivcf_summary_metrics = iCollectGvcfCallingMetrics.summary_metrics File ivcf_detail_metrics = iCollectGvcfCallingMetrics.detail_metrics File hardfiltered_individual_vcf = Hardfiltering.hardfiltered_vcf File hardfiltered_vcf_individual_index = Hardfiltering.hardfiltered_vcf_index } } task GetBwaVersion { command { bwa 2>&1 | \ grep -e '^Version' | \ sed 's/Version: //' } output { String bwa_version = read_string(stdout()) } } task CollectQualityYieldMetrics { File input_bam String metrics_filename command { java -Xms2G -Xmx20G -jar ~/OPERATION/RedCellNGS/tools/picard/picard/picard.jar \ CollectQualityYieldMetrics \ INPUT=${input_bam} \ OQ=true \ OUTPUT=${metrics_filename} } output { File quality_yield_metrics = "${metrics_filename}" } } task SamToFastqAndBwaMemAndMba { File input_bam File ref_dict File ref_fasta File ref_fasta_index File ref_alt File ref_sa File ref_amb File ref_bwt File ref_ann File ref_pac String output_bam_basename String bwa_version command { java -Xms2G -Xmx20G -jar ~/OPERATION/RedCellNGS/tools/picard/picard/picard.jar \ SamToFastq \ INPUT=${input_bam} \ FASTQ=/dev/stdout \ INTERLEAVE=true \ NON_PF=true | \ bwa mem -K 100000000 -p -v 3 -t 25 -Y ${ref_fasta} /dev/stdin - 2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \ java -Xms2G -Xmx20G -jar ~/OPERATION/RedCellNGS/tools/picard/picard/picard.jar \ MergeBamAlignment \ VALIDATION_STRINGENCY=SILENT \ EXPECTED_ORIENTATIONS=FR \ ATTRIBUTES_TO_RETAIN=X0 \ ATTRIBUTES_TO_REMOVE=NM \ ATTRIBUTES_TO_REMOVE=MD \ ALIGNED_BAM=/dev/stdin \ UNMAPPED_BAM=${input_bam} \ OUTPUT=${output_bam_basename}.bam \ REFERENCE_SEQUENCE=${ref_fasta} \ PAIRED_RUN=true \ SORT_ORDER="unsorted" \ IS_BISULFITE_SEQUENCE=false \ ALIGNED_READS_ONLY=false \ CLIP_ADAPTERS=false \ MAX_RECORDS_IN_RAM=2000000 \ ADD_MATE_CIGAR=true \ MAX_INSERTIONS_OR_DELETIONS=-1 \ PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ PROGRAM_RECORD_ID="bwamem" \ PROGRAM_GROUP_VERSION="${bwa_version}" \ PROGRAM_GROUP_COMMAND_LINE="bwa mem -K 100000000 -p -v 3 -t 10 -Y" \ PROGRAM_GROUP_NAME="bwamem" \ UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ ALIGNER_PROPER_PAIR_FLAGS=true \ UNMAP_CONTAMINANT_READS=true \ ADD_PG_TAG_TO_READS=false } output { File output_aligned_bam = "${output_bam_basename}.bam" File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log" } } task CollectUnsortedReadgroupBamQualityMetrics { File input_bam String output_bam_prefix command { java -Xms2G -Xmx30G -jar ~/OPERATION/RedCellNGS/tools/picard/picard/picard.jar \ CollectMultipleMetrics \ INPUT=${input_bam} \ OUTPUT=${output_bam_prefix} \ ASSUME_SORTED=true \ PROGRAM=null \ PROGRAM=CollectBaseDistributionByCycle \ PROGRAM=CollectInsertSizeMetrics \ PROGRAM=MeanQualityByCycle \ PROGRAM=QualityScoreDistribution \ METRIC_ACCUMULATION_LEVEL=null \ METRIC_ACCUMULATION_LEVEL=ALL_READS touch ${output_bam_prefix}.insert_size_metrics touch ${output_bam_prefix}.insert_size_histogram.pdf } output { File base_distribution_by_cycle_pdf = "${output_bam_prefix}.base_distribution_by_cycle.pdf" File base_distribution_by_cycle_metrics = "${output_bam_prefix}.base_distribution_by_cycle_metrics" File insert_size_histogram_pdf = "${output_bam_prefix}.insert_size_histogram.pdf" File insert_size_metrics = "${output_bam_prefix}.insert_size_metrics" File quality_by_cycle_pdf = "${output_bam_prefix}.quality_by_cycle.pdf" File quality_by_cycle_metrics = "${output_bam_prefix}.quality_by_cycle_metrics" File quality_distribution_pdf = "${output_bam_prefix}.quality_distribution.pdf" File quality_distribution_metrics = "${output_bam_prefix}.quality_distribution_metrics" } } task MarkDuplicates { File input_bams String output_bam_basename String metrics_filename command { java -Xms4G -Xmx50G -jar ~/OPERATION/RedCellNGS/tools/picard/picard/picard.jar \ MarkDuplicates \ INPUT=${sep=' INPUT=' input_bams} \ OUTPUT=${output_bam_basename}.bam \ METRICS_FILE=${metrics_filename} \ VALIDATION_STRINGENCY=SILENT \ OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ ASSUME_SORT_ORDER="queryname" \ CLEAR_DT="false" \ ADD_PG_TAG_TO_READS=false } output { File output_bam = "${output_bam_basename}.bam" File duplicate_metrics = "${metrics_filename}" } } task SortSam { File input_bam String output_bam_basename command { java -Xms4G -Xmx50G -jar ~/OPERATION/RedCellNGS/tools/picard/picard/picard.jar \ SortSam \ INPUT=${input_bam} \ OUTPUT=${output_bam_basename}.bam \ SORT_ORDER="coordinate" \ CREATE_INDEX=true \ CREATE_MD5_FILE=true \ MAX_RECORDS_IN_RAM=300000 } output { File output_bam = "${output_bam_basename}.bam" File output_bam_index = "${output_bam_basename}.bai" File output_bam_md5 = "${output_bam_basename}.bam.md5" } } task indel_realignment { File input_bam File input_bam_index Array[File] known_indels_sites_vcfs Array[File] known_indels_sites_indices File ref_fasta File ref_fasta_index File ref_dict String output_bam_basename command { java -jar ~/OPERATION/RedCellNGS/tools/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar \ -T RealignerTargetCreator \ -R ${ref_fasta} \ -o indel_realignment.intervals \ -known ${sep=" -known " known_indels_sites_vcfs} \ -I ${input_bam} java -jar ~/OPERATION/RedCellNGS/tools/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar \ -T IndelRealigner \ -R ${ref_fasta} \ -I ${input_bam} \ -known ${sep=" -known " known_indels_sites_vcfs} \ -targetIntervals indel_realignment.intervals \ -o ${output_bam_basename}.bam } output { File output_bam = "${output_bam_basename}.bam" File output_bam_index = "${output_bam_basename}.bai" } } task CreateSequenceGroupingTSV { File ref_dict command <<< python </dev/null python3 <