| 144 | | This workflow aligns reads per lane and chromosome, including: |
| 145 | | * re-alignment to prevent false SNP calls caused by indels (using known indels) |
| 146 | | * markduplicates to prevent false coverage caused by PCR errors (per library = lane) |
| 147 | | * base quality recalibration to correct for false low scores caused by true variation |
| 148 | | |
| 149 | | Workflow Inputs: |
| 150 | | * lane.1.fq.gz - raw reads for lane, pair end 1 |
| 151 | | * lane.2.fq.gz - raw reads for lane, pair end 2 |
| 152 | | * genome.chr.fasta - reference genome split on chromosome |
| 153 | | * genome.chr.realign.intervals - targets for realignment per chromosome |
| 154 | | * genome.chr.dbsnpXYZ.rod - known SNP variants, here from dbSNP |
| 155 | | * genome.chr.indelsXYZ.vcf - known indels, here from 1KG |
| 156 | | |
| 157 | | Workflow outputs: |
| 158 | | * lane.chr.1.sai - alignment index for first pair |
| 159 | | * lane.chr.2.sai - alignment index for second pair |
| 160 | | * lane.chr.sam - alignment map for the aligned read pairs |
| 161 | | * lane.chr.bam - alignment map in binary format |
| 162 | | * lane.chr.sorted.bam - sorted alignment map |
| 163 | | * lane.chr.sorted.bai - sorted alignment index |
| 164 | | * lane.chr.dedup.bam - marked duplicate PCR elements |
| 165 | | * lane.chr.dedup.metrics - metrics describing deduplication |
| 166 | | * lane.chr.realigned.bam - realigned based on known indels |
| 167 | | * lane.chr.matefixed.bam - fixed the mate pair ends |
| 168 | | * lane.chr.covariate_table.csv - table of countcovariates output for recalibration |
| 169 | | * lane.chr.recal.bam - alignment map with recalibrated quality scores |
| 170 | | |
| 171 | | === align === |
| 172 | | Align each end of paired end. |
| 173 | | |
| 174 | | ||tool: ||bwa-align || |
| 175 | | ||inputs: ||chr.fasta, lane.1.fq.gz, lane.2.fq.gz || |
| 176 | | ||outputs: ||lane.chr.1.sai, lane.chr.2.sai || |
| 177 | | ||docs: ||http://bio-bwa.sourceforge.net/bwa.shtml || |
| 178 | | |
| 179 | | === align-pe === |
| 180 | | Align the pairs as one |
| 181 | | |
| 182 | | ||tool: ||bwa sampe || |
| 183 | | ||inputs: ||chr.fasta [[BR]] lane.1.fq.gz [[BR]] lane.2.fq.gz [[BR]] lane.chr.1.sai [[BR]] lane.chr.2.sai || |
| 184 | | ||outputs: ||lane.chr.sam || |
| 185 | | ||docs: ||http://bio-bwa.sourceforge.net/bwa.shtml || |
| 186 | | |
| 187 | | === sam-to-bam === |
| 188 | | Convert sam to bam |
| 189 | | |
| 190 | | ||tool: ||samtools view || |
| 191 | | ||inputs: ||lane.chr.sam || |
| 192 | | ||outputs: ||lane.chr.bam || |
| 193 | | ||docs: ||http://samtools.sourceforge.net/samtools.shtml || |
| 194 | | |
| 195 | | (Question: can this not index and sort?) |
| 196 | | |
| 197 | | === sam-sort === |
| 198 | | Sort bam file on coordinate |
| 199 | | |
| 200 | | ||tool: ||samtools sort || |
| 201 | | ||inputs: ||lane.chr.bam || |
| 202 | | ||outputs: ||lane.chr.sorted.bam || |
| 203 | | ||docs: ||http://samtools.sourceforge.net/samtools.shtml || |
| 204 | | |
| 205 | | === sam-index === |
| 206 | | Index bam file for quicker access |
| 207 | | |
| 208 | | ||tool: ||samtools index || |
| 209 | | ||inputs: ||lane.chr.sorted.bam || |
| 210 | | ||outputs: ||lane.chr.sorted.bai || |
| 211 | | ||docs: ||http://samtools.sourceforge.net/samtools.shtml || |
| 212 | | |
| 213 | | === !MarkDuplicates === |
| 214 | | Mark duplicate PCR fragments to be filtered in analysis |
| 215 | | |
| 216 | | ||tool: ||MarkDuplicates.jar || |
| 217 | | ||inputs: ||lane.chr.sorted.bam || |
| 218 | | ||outputs: ||lane.chr.dedup.bam [[BR]] lane.chr.dedup.metrics || |
| 219 | | ||docs: ||http://picard.sourceforge.net/command-line-overview.shtml#MarkDuplicates || |
| 220 | | |
| 221 | | === !IndelRealigner-!KnownsOnly === |
| 222 | | Improve the alignment using known indel information (will reduce false SNP calls) |
| 223 | | |
| 224 | | ||tool: ||GenomeAnalysisTK.jar -T IndelRealigner || |
| 225 | | ||inputs: ||lane.chr.dedup.bam [[BR]] genome.chr.realign.intervals [[BR]] genome.chr.dbsnpXYZ.rod [[BR]] genome.chr.indelsXYZ.vcf || |
| 226 | | ||outputs: ||lane.chr.realigned.bam || |
| 227 | | ||docs: ||http://www.broadinstitute.org/gsa/wiki/index.php/Local_realignment_around_indels#Running_the_Indel_Realigner_only_at_known_sites || |
| 228 | | |
| 229 | | === !FixMateInformation === |
| 230 | | Fix the paired-end information as a consequence of the realignment. |
| 231 | | |
| 232 | | ||tool: ||FixMateInformation.jar || |
| 233 | | ||inputs: ||lane.chr.realigned.bam || |
| 234 | | ||outputs: ||lane.chr.matefixed.bam || |
| 235 | | ||docs: ||http://picard.sourceforge.net/command-line-overview.shtml#FixMateInformation [[BR]] http://www.broadinstitute.org/gsa/wiki/index.php/Local_realignment_around_indels#Fixing_Mate_Pairs || |
| 238 | | |
| 239 | | === !CountCovariates === |
| 240 | | Count covariates, such as machine cycle and bp position, to be used as basis for quality recalibration. |
| 241 | | Optionally: plot the results to pdf using AnalyzeCovariates |
| 242 | | |
| 243 | | ||tool: ||GenomeAnalysisTK.jar -T CountCovariates, AnalyzeCovariates.jar || |
| 244 | | ||inputs: ||lane.chr.matefixed.bam [[BR]] genome.chr.dbsnpXYZ.rod || |
| 245 | | ||outputs: ||lane.chr.covariate_table.csv || |
| 246 | | ||docs: ||http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration#CountCovariates [[BR]] http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration#AnalyzeCovariates.jar || |
| 249 | | |
| 250 | | === !TableRecalibration === |
| 251 | | Recalibrate quality scores based on the covariate table |
| 252 | | ||tool: ||GenomeAnalysisTK.jar -T TableRecalibration || |
| 253 | | ||inputs: ||lane.chr.matefixed.bam [[BR]] lane.chr.covariate_table.csv [[BR]] chr.fasta || |
| 254 | | ||outputs: ||lane.chr.recal.bam || |
| 255 | | ||docs: ||http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration#TableRecalibration || |
| 256 | | |
| 257 | | === Repeat: sam-sort, sam-index, countcovariates === |
| 258 | | See steps above for commands and docs. |
| 259 | | |
| 260 | | ||inputs: ||lane.chr.recal.bam || |
| 261 | | ||outputs: ||lane.chr.recal.sorted.bam, lane.chr.recal.sorted.bam.bai, lane.chr.recal.covariate_table.csv || |
| 262 | | |
| 263 | | Discussion: |
| 264 | | > Why do we need to sort and index after recalibration? Does it mess up the order of things? |
| 265 | | |