144 | | This workflow aligns reads per lane and chromosome, including: |
145 | | * re-alignment to prevent false SNP calls caused by indels (using known indels) |
146 | | * markduplicates to prevent false coverage caused by PCR errors (per library = lane) |
147 | | * base quality recalibration to correct for false low scores caused by true variation |
148 | | |
149 | | Workflow Inputs: |
150 | | * lane.1.fq.gz - raw reads for lane, pair end 1 |
151 | | * lane.2.fq.gz - raw reads for lane, pair end 2 |
152 | | * genome.chr.fasta - reference genome split on chromosome |
153 | | * genome.chr.realign.intervals - targets for realignment per chromosome |
154 | | * genome.chr.dbsnpXYZ.rod - known snp variants, here from dbSNP |
155 | | * genome.chr.indelsXYZ.vcf - known indels, here from 1KG |
156 | | |
157 | | Workflow outputs: |
158 | | * lane.chr.1.sai - alignment index for first pair |
159 | | * lane.chr.2.sai - alignment index for second pair |
160 | | * lane.chr.sam - alignment map for the paired reads (SAM text format) |
161 | | * lane.chr.bam - alignment map in binary format |
162 | | * lane.chr.sorted.bam - sorted alignment map |
163 | | * lane.chr.sorted.bai - sorted alignment index |
164 | | * lane.chr.dedup.bam - marked duplicate PCR elements |
165 | | * lane.chr.dedup.metrics - metrics describing deduplication |
166 | | * lane.chr.realigned.bam - realigned based on known indels |
167 | | * lane.chr.matefixed.bam - fixed the mate pair ends |
168 | | * lane.chr.covariate_table.csv - table of countcovariates output for recalibration |
169 | | * lane.chr.recal.bam - alignment map with recalibrated quality scores |
170 | | |
171 | | === align === |
172 | | Align each end of the paired-end reads separately. |
173 | | |
174 | | ||tool: ||bwa-align || |
175 | | ||input: ||chr.fasta, lane.1.fq.gz, lane.2.fq.gz || |
176 | | ||output: ||lane.chr.1.sai, lane.chr.2.sai || |
177 | | ||docs: ||http://bio-bwa.sourceforge.net/bwa.shtml || |
178 | | |
179 | | === align-pe === |
180 | | Align the pairs as one |
181 | | |
182 | | ||tool: ||bwa sampe || |
183 | | ||inputs: ||chr.fasta [[BR]] lane.1.fq.gz [[BR]] lane.2.fq.gz [[BR]] lane.chr.1.sai [[BR]] lane.chr.2.sai || |
184 | | ||outputs: ||lane.chr.sam || |
185 | | ||docs: ||http://bio-bwa.sourceforge.net/bwa.shtml || |
186 | | |
187 | | === sam-to-bam === |
188 | | Convert sam to bam |
189 | | |
190 | | ||tool: ||samtools view || |
191 | | ||inputs: ||lane.chr.sam || |
192 | | ||outputs: ||lane.chr.bam || |
193 | | ||docs: ||http://samtools.sourceforge.net/samtools.shtml || |
194 | | |
195 | | (Question: can this not index and sort?) |
196 | | |
197 | | === sam-sort === |
198 | | Sort bam file on coordinate |
199 | | |
200 | | ||tool: ||samtools sort || |
201 | | ||inputs: ||lane.chr.bam || |
202 | | ||outputs: ||lane.chr.sorted.bam || |
203 | | ||docs: ||http://samtools.sourceforge.net/samtools.shtml || |
204 | | |
205 | | === sam-index === |
206 | | Index bam file for quicker access |
207 | | |
208 | | ||tool: ||samtools index || |
209 | | ||inputs: ||lane.chr.sorted.bam || |
210 | | ||outputs: ||lane.chr.sorted.bai || |
211 | | ||docs: ||http://samtools.sourceforge.net/samtools.shtml || |
212 | | |
213 | | === !MarkDuplicates === |
214 | | Mark duplicate PCR fragments to be filtered in analysis |
215 | | |
216 | | ||tool: ||MarkDuplicates.jar || |
217 | | ||inputs: ||lane.chr.sorted.bam || |
218 | | ||outputs: ||lane.chr.dedup.bam [[BR]] lane.chr.dedup.metrics || |
219 | | ||docs: ||http://picard.sourceforge.net/command-line-overview.shtml#MarkDuplicates || |
220 | | |
221 | | === !IndelRealigner-!KnownsOnly === |
222 | | Improve the alignment using known indel information (will reduce false SNP calls) |
223 | | |
224 | | ||tool: ||GenomeAnalysisTK.jar -T IndelRealigner || |
225 | | ||inputs: ||lane.chr.dedup.bam [[BR]] genome.chr.realign.intervals [[BR]] genome.chr.dbsnpXYZ.rod [[BR]] genome.chr.indelsXYZ.vcf || |
226 | | ||outputs: ||lane.chr.realigned.bam || |
227 | | ||docs: ||http://www.broadinstitute.org/gsa/wiki/index.php/Local_realignment_around_indels#Running_the_Indel_Realigner_only_at_known_sites || |
228 | | |
229 | | === !FixMateInformation === |
230 | | Fix the paired end information as a consequence of the realignment. |
231 | | |
232 | | ||tool: ||FixMateInformation.jar || |
233 | | ||inputs: ||lane.chr.realigned.bam || |
234 | | ||outputs: ||lane.chr.matefixed.bam || |
235 | | ||docs: ||http://picard.sourceforge.net/command-line-overview.shtml#FixMateInformation [[BR]] http://www.broadinstitute.org/gsa/wiki/index.php/Local_realignment_around_indels#Fixing_Mate_Pairs || |
238 | | |
239 | | === !CountCovariates === |
240 | | Count covariates, such as machine cycle and bp position, to be used as basis for quality recalibration. |
241 | | Optionally: plot the results to pdf using AnalyzeCovariates |
242 | | |
243 | | ||tool: ||GenomeAnalysisTK.jar -T CountCovariates, AnalyzeCovariates.jar || |
244 | | ||inputs: ||lane.chr.matefixed.bam [[BR]] genome.chr.dbsnpXYZ.rod || |
245 | | ||outputs: ||lane.chr.covariate_table.csv || |
246 | | ||docs: ||http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration#CountCovariates [[BR]] http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration#AnalyzeCovariates.jar || |
249 | | |
250 | | === !TableRecalibration === |
251 | | Recalibrate quality scores based on the covariate table |
252 | | ||tool: ||GenomeAnalysisTK.jar -T TableRecalibration || |
253 | | ||inputs: ||lane.chr.matefixed.bam [[BR]] lane.chr.covariate_table.csv [[BR]] chr.fasta || |
254 | | ||outputs: ||lane.chr.recal.bam || |
255 | | ||docs: ||http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration#TableRecalibration || |
256 | | |
257 | | === Repeat: sam-sort, sam-index, countcovariates === |
258 | | See steps above for commands and docs. |
259 | | |
260 | | ||inputs: ||lane.chr.recal.bam || |
261 | | ||outputs: ||lane.chr.recal.sorted.bam, lane.chr.recal.sorted.bam.bai, lane.chr.recal.covariate_table.csv || |
262 | | |
263 | | Discussion: |
264 | | > Why do we need to sort and index after recalibration? Does it mess up the order of things? |
265 | | |