Benchmark Recipe for GATK* Best Practices Pipeline Deployment

Deploying GATK Best Practices Pipeline

Below are the script files that accompany, and are documented in, the paper "Infrastructure for Deploying GATK Best Practices Pipeline".


GATK Best Practices Pipeline_README

The following scripts are designed to take the same arguments so that test runs stay consistent. For a single-threaded baseline analysis with no optimizations, run the data_collection_gatk_best_practices.pl script with NumThreads set to “1”.

For both thread level and process level parallelism analysis, use the data_collection_gatk_best_practices_optimized.pl script with NumThreads as proposed in the paper. ______

Data Collection Script for GATK Best Practices Pipeline (wgs_end2end_data_collection_gatk_best_practices.pl)

#!/usr/bin/perl
#
# Baseline data-collection driver for the GATK Best Practices pipeline.
#
# Runs, in order: BwaMem, SortSam, MarkDuplicates, RealignerTargetCreator,
# IndelRealigner, BaseRecalibrator, PrintReads and HaplotypeCaller.  Every
# command line is appended to a "<user>_<sample>_<N>T_processing.log" file and
# tool output is appended to "<user>_<sample>_<N>T_output.log", both inside
# TempOutputDirectory.  When the "profiling" argument is true, the external
# collect_stats tool is started before and killed after every stage.
#
# Usage:
#   SampleName NumThreads InputDataDirectory TempOutputDirectory profiling
#   [collectstatspath interval stats]

use strict;
use warnings;

if (scalar(@ARGV) < 5) {
    die("Usage: SampleName NumThreads InputDataDirectory TempOutputDirectory profiling \n[if profiling is enabled, then the following is required]: collectstatspath interval stats \n");
}

my $sample     = $ARGV[0];
my $numThreads = $ARGV[1];
my $inDataDir  = $ARGV[2];
my $tmpDir     = "".$ARGV[3];
my $profiling  = $ARGV[4];    #by default profiling is turned ON if invoked from the workflow profiler

# Every path below is built by plain string concatenation, so both
# directories must end with a slash; add one if the caller left it off.
$inDataDir .= '/' unless $inDataDir =~ m{/\z};
$tmpDir    .= '/' unless $tmpDir    =~ m{/\z};

# arguments for collect_stats
my $collectstatspath = $ARGV[5];
my $interval         = $ARGV[6];    # by default sampling interval is 30s from the workflow profiler.
my $stats            = $ARGV[7];

#my $numLanes =$ARGV[1];
my $called       = "$0 @ARGV";
my $numLanes     = 0;
my $sampleprefix = $sample.'_'.$numThreads.'T';

# The published script interpolates $version into the log header but never
# defines it; set a real value here if script releases are tracked.
my $version = 'unspecified';

# INPUT FASTQ FILES
#OTHER FORMATS FOR FQ: "_1.fastq.gz;
#"_1.fastq";
# Parenthesised so that BOTH variables are lexicals ("my $a, $b" only
# declares the first one).
my ($fqFile1, $fqFile2);
$fqFile1 = $inDataDir.$sample."_1.fq";
$fqFile2 = $inDataDir.$sample."_2.fq";

# Pipeline executables and its directories
### SPECIFY PATH IN THE VARIABLES BELOW ###
my $toolsDir                = '/PATH/TO/TOOLS_DIR';
my $homosapiensrefgenomeDir = '/PATH/TO/REF';

# TOOLS
my $bwaDir    = "$toolsDir/bwa";
my $bwa       = "$bwaDir/bwa";
my $gatkDir   = "$toolsDir/gatk-protected/target";
my $gatk      = "$gatkDir/GenomeAnalysisTK.jar";
my $picardDir = "$toolsDir/picard/dist";
my $picard    = "$picardDir/picard.jar";

# HOMOSAPIENSREFGENOME
my $refgenomeFastaFile = "$homosapiensrefgenomeDir/Homo_sapiens_assembly19.fasta";
my $refgenomeBwtFile   = "$homosapiensrefgenomeDir/Homo_sapiens_assembly19.fasta.bwt";
my $dbSNPvcf           = "$homosapiensrefgenomeDir/Homo_sapiens_assembly19.dbsnp.vcf";
my $dbSNPindel         = "$homosapiensrefgenomeDir/Homo_sapiens_assembly19.known_indels.vcf";

# EXOME TARGET INTERVALS
my $exome_targets_intervals = "$homosapiensrefgenomeDir/nexterarapidcapture_exome_uniqueintervals.bed";

unless (-d $inDataDir) {
    die("Error: The InputDataDirectory $inDataDir doesn't exist\n");
}
unless (-d $tmpDir) {
    die("Error: The TempOutputDirectory $tmpDir doesn't exist\n");
}

# Output names for each stage of the pipeline
my $baseName             = $sample;
my $baseNameLane         = $baseName.'_'.$numLanes.'L_'.$numThreads.'T';
my $bwamem_samFile       = $tmpDir.$baseNameLane.".sam";
my $sort_bamFile         = $tmpDir.$baseName."_sorted.bam";
my $duplicateMetricsFile = $tmpDir.$baseName."_dup.metrics";
my $bamDupRemFile        = $tmpDir.$baseName."_dupRem.bam";
my $bamRealignFile       = $tmpDir.$baseName."_realign.bam";
my $realnInterval        = $tmpDir.$baseName."_realn.intervals";
my $finalBam             = $tmpDir.$baseName."_final.bam";
my $HCvcf                = $tmpDir.$baseName."_HaplotypeCaller.vcf";
my $genomeImportFile     = $refgenomeFastaFile.".fai";

#ADD the relevant platform
my $readGroupHeader = "\@RG\\tID:$baseNameLane\\tLB:$baseName\\tSM:$baseName\\tPL:PLATFORM";
my $recalOut        = $tmpDir .$baseName."_recal.grp";

my $dryRun = 0;
my $pwd    = `pwd`;
chomp $pwd;
my $host = `hostname`;
chomp $host;
my $uname = `whoami`;
chomp $uname;
my $runningTime = time;

my $commandsfile = $tmpDir.$uname."_".$sampleprefix."_processing.log";
my $outputfile   = $tmpDir.$uname."_".$sampleprefix."_output.log";

# Three-argument open on a lexical handle, and fail loudly: without the
# command log the run leaves no record of what was executed.
open(my $LOG, '>', $commandsfile)
    or die("Error: cannot open command log $commandsfile: $!\n");

print {$LOG} "#$called (version $version) in $pwd on $host.\n";
print {$LOG} "#Started at ".`date +"%F %T"`."\n";
print {$LOG} "#temporary files created in $tmpDir\n";

my $procTime = time;
my $procFlag = 0;

# run_and_log(COMMAND [, SKIP])
#
# Appends COMMAND (with its output redirection) to the command log, runs it
# through the shell unless SKIP is true or $dryRun is set, and dies if the
# command returns a non-zero status.  From the second call onwards it also
# logs the wall-clock time elapsed since the previous call.
sub run_and_log {
    my $command   = $_[0];
    my $execute   = !$dryRun;
    my $exitValue = 0;    #a command we don't run is considered successful
    my $redirect;

    if (@_ > 1) {
        $execute = !$_[1];
    }

    #several of the programs like to output to STDERR, so we link that to log file
    $redirect = "1>>$outputfile 2>&1";
    #that is because we redirect STDOUT in many cases, so let's not mess with it.
    $redirect = "2>>$outputfile" if $command =~ m/>/;
    # $redirect = "" if $command =~ m/>/;

    $command = $command." ".$redirect;

    if ($procFlag == 0) {
        # First command: nothing has run yet, so there is no duration to report.
        $procFlag++;
    }
    else {
        $procTime = time - $procTime;
        printf {$LOG} "#Processing Time %02d:%02d:%02d\n",
            int($procTime / 3600), int(($procTime % 3600) / 60), int($procTime % 60);
        $procTime = time;
    }

    print {$LOG} "#not run\n" if !$execute;
    print {$LOG} "#".`date +"%F %T"`;
    print {$LOG} $command."\n";

    $exitValue = system($command) if $execute;

    #necessary if we use `` instead of system()
    #$exitValue = $? >>8;

    ##If the command failed, we want to stop it here.
    if ($exitValue != 0) {
        my $error = "Command failed with return value $exitValue : $command \n";
        print {$LOG} $error;
        close $LOG;
        die $error;
    }
}

# Start_profiling(TAG): launch collect_stats in the background for the stage
# named TAG.  Does nothing unless profiling was requested.
sub Start_profiling {
    my ($tag) = @_;
    if ($profiling) {
        system("$collectstatspath $stats -d $interval -td $tmpDir -n $sampleprefix -tag $tag -l 5 -u 1 -s 600 &");
    }
}

# Stop_Profiling(): ask collect_stats to kill every collector it started.
# Does nothing unless profiling was requested.
sub Stop_Profiling {
    if ($profiling) {
        system("$collectstatspath --kill-all");
    }
}

# run_stage(TAG, COMMAND): one pipeline stage - start profiling, announce the
# stage on STDOUT, run the command, stop profiling, then pause for 60 seconds
# before the next stage begins.
sub run_stage {
    my ($stage_tag, $command) = @_;
    Start_profiling($stage_tag);
    print "$stage_tag\n";
    run_and_log($command);
    Stop_Profiling();
    sleep(60);
}

run_stage('BwaMem',
    "$bwa mem -t $numThreads -Ma -R \'$readGroupHeader\' $refgenomeFastaFile $fqFile1 $fqFile2 > $bwamem_samFile");

run_stage('SortSam',
    "java -Xmx8g -jar $picard SortSam I=$bwamem_samFile O=$sort_bamFile SO=coordinate CREATE_INDEX=true");

run_stage('MarkDuplicates',
    "java -Xmx8g -jar $picard MarkDuplicates I=$sort_bamFile O=$bamDupRemFile M=$duplicateMetricsFile CREATE_INDEX=true TMP_DIR=$tmpDir");

run_stage('RealignerTargetCreator',
    "java -Xmx8g -jar $gatk -T RealignerTargetCreator -nt $numThreads -R $refgenomeFastaFile -o $realnInterval -known:indels,vcf $dbSNPindel -I $bamDupRemFile");

run_stage('IndelRealigner',
    "java -Xmx8g -Djava.io.tmpdir=$tmpDir -jar $gatk -T IndelRealigner -R $refgenomeFastaFile -targetIntervals $realnInterval -known:indels,vcf $dbSNPindel -I $bamDupRemFile -o $bamRealignFile --filter_bases_not_stored");

run_stage('BaseRecalibrator',
    "java -Xmx4g -jar $gatk -T BaseRecalibrator -I $bamRealignFile -R $refgenomeFastaFile -knownSites:mask,vcf $dbSNPvcf -o $recalOut");

run_stage('PrintReads',
    "java -Xmx8g -jar $gatk -T PrintReads -R $refgenomeFastaFile -I $bamRealignFile -BQSR $recalOut -o $finalBam");

run_stage('HaplotypeCaller',
    "java -Xmx8g -jar $gatk -T HaplotypeCaller -R $refgenomeFastaFile -I $finalBam -o $HCvcf -ERC GVCF --variant_index_type LINEAR --variant_index_parameter 128000");

$runningTime = time - $runningTime;
printf {$LOG} "#done in %02d:%02d:%02d\n",
    int($runningTime / 3600), int(($runningTime % 3600) / 60), int($runningTime % 60);
close $LOG;

exit 0;

Data Collection Script for GATK Best Practices Pipeline Optimized (wgs_end2end_data_collection_gatk_best_practices_optimized.pl)

#!/usr/bin/perl
#
# Optimized data-collection driver for the GATK Best Practices pipeline.
#
# Runs, in order: BwaMem, SortSam, MarkDuplicates, RealignerTargetCreator,
# IndelRealigner, BaseRecalibrator, PrintReads and HaplotypeCaller.  The last
# four stages are launched through Queue.jar with "-jobRunner CMPShell" and a
# per-stage Scala script; after each of them the Queue working files are
# removed.  Every command line is appended to a
# "<user>_<sample>_<N>T_processing.log" file and tool output is appended to
# "<user>_<sample>_<N>T_output.log", both inside TempOutputDirectory.  When
# the "profiling" argument is true, the external collect_stats tool is started
# before and killed after every stage.
#
# Usage:
#   SampleName NumThreads InputDataDirectory TempOutputDirectory profiling
#   [collectstatspath interval stats]

use strict;
use warnings;
use Cwd qw(getcwd);

if (scalar(@ARGV) < 5) {
    die("Usage: SampleName NumThreads InputDataDirectory TempOutputDirectory profiling \n[if profiling is enabled, then the following is required]: collectstatspath interval stats \n");
}

my $sample     = $ARGV[0];
my $numThreads = $ARGV[1];
my $inDataDir  = $ARGV[2];
my $tmpDir     = "".$ARGV[3];
my $profiling  = $ARGV[4];    #by default profiling is turned ON if invoked from the workflow profiler

# Every path below is built by plain string concatenation, so both
# directories must end with a slash; add one if the caller left it off.
$inDataDir .= '/' unless $inDataDir =~ m{/\z};
$tmpDir    .= '/' unless $tmpDir    =~ m{/\z};

# arguments for collect_stats
my $collectstatspath = $ARGV[5];
my $interval         = $ARGV[6];    # by default sampling interval is 30s from the workflow profiler.
my $stats            = $ARGV[7];

#my $numLanes =$ARGV[1];
my $called       = "$0 @ARGV";
my $numLanes     = 0;
my $sampleprefix = $sample.'_'.$numThreads.'T';

# The published script interpolates $version into the log header but never
# defines it; set a real value here if script releases are tracked.
my $version = 'unspecified';

# INPUT FASTQ FILES
#OTHER FORMATS FOR FQ: "_1.fastq.gz;
#"_1.fastq";
# Parenthesised so that BOTH variables are lexicals ("my $a, $b" only
# declares the first one).
my ($fqFile1, $fqFile2);
$fqFile1 = $inDataDir.$sample."_1.fq";
$fqFile2 = $inDataDir.$sample."_2.fq";

# Pipeline executables and its directories
### SPECIFY PATH IN THE VARIABLES BELOW ###
my $toolsDir = '/PATH/TO/TOOLS_DIR';
#SCALA_SCRIPTS: Scripts to be passed to Queue.jar
my $QueueBroadBestPracticesDir = '/PATH/TO/SCALA_SCRIPTS';
my $homosapiensrefgenomeDir    = '/PATH/TO/REF';

# TOOLS
my $bwaDir     = "$toolsDir/bwa";
my $bwa        = "$bwaDir/bwa";
my $gatkDir    = "$toolsDir/gatk-protected/target";
my $gatk       = "$gatkDir/GenomeAnalysisTK.jar";
my $gatk_queue = "$QueueBroadBestPracticesDir/Queue.jar";
my $picardDir  = "$toolsDir/picard/dist";
my $picard     = "$picardDir/picard.jar";

# HOMOSAPIENSREFGENOME
my $refgenomeFastaFile = "$homosapiensrefgenomeDir/Homo_sapiens_assembly19.fasta";
my $refgenomeBwtFile   = "$homosapiensrefgenomeDir/Homo_sapiens_assembly19.fasta.bwt";
my $dbSNPvcf           = "$homosapiensrefgenomeDir/Homo_sapiens_assembly19.dbsnp.vcf";
my $dbSNPindel         = "$homosapiensrefgenomeDir/Homo_sapiens_assembly19.known_indels.vcf";

# QUEUE
my $nfs_ExampleIndelRealigner   = "$QueueBroadBestPracticesDir/ExampleIndelRealigner.scala";
my $nfs_ExampleBaseRecalibrator = "$QueueBroadBestPracticesDir/ExampleBaseRecalibrator.scala";
my $nfs_ExamplePrintReads       = "$QueueBroadBestPracticesDir/ExamplePrintReads.scala";
my $nfs_ExampleHaplotypeCaller  = "$QueueBroadBestPracticesDir/ExampleHaplotypeCaller.scala";

# REMOVE QUEUE RELATED FOLDERS AFTER COMPLETION
# getcwd() is used instead of shelling out: "cwd" is not a shell command, and
# an un-chomped backtick result would put a newline into the rm targets below.
my $cwd = getcwd();
print "$cwd\n";
my $queueDir = $cwd . "/.queue";
print "$queueDir\n";
my $jobReport  = $cwd . "/*.jobreport.txt";
my $ExampleDir = $tmpDir . "/Example*";

unless (-d $inDataDir) {
    die("Error: The InputDataDirectory $inDataDir doesn't exist\n");
}
unless (-d $tmpDir) {
    die("Error: The TempOutputDirectory $tmpDir doesn't exist\n");
}

# Output file names for each stage of the pipeline
my $baseName             = $sample;
my $baseNameLane         = $baseName.'_'.$numLanes.'L_'.$numThreads.'T';
my $bwamem_samFile       = $tmpDir.$baseNameLane.".sam";
my $sort_bamFile         = $tmpDir.$baseName."_sorted.bam";
my $duplicateMetricsFile = $tmpDir.$baseName."_dup.metrics";
my $bamDupRemFile        = $tmpDir.$baseName."_dupRem.bam";
my $bamRealignFile       = $tmpDir.$baseName."_realign.bam";
my $realnInterval        = $tmpDir.$baseName."_realn.intervals";
my $finalBam             = $tmpDir.$baseName."_final.bam";
my $HCvcf                = $tmpDir.$baseName."_HaplotypeCaller.vcf";
my $genomeImportFile     = $refgenomeFastaFile.".fai";

#ADD the relevant platform
my $readGroupHeader = "\@RG\\tID:$baseNameLane\\tLB:$baseName\\tSM:$baseName\\tPL:PLATFORM";
my $recalOut        = $tmpDir .$baseName."_recal.grp";

my $dryRun = 0;
my $pwd    = `pwd`;
chomp $pwd;
my $host = `hostname`;
chomp $host;
my $uname = `whoami`;
chomp $uname;
my $runningTime = time;

my $commandsfile = $tmpDir.$uname."_".$sampleprefix."_processing.log";
my $outputfile   = $tmpDir.$uname."_".$sampleprefix."_output.log";

# Three-argument open on a lexical handle, and fail loudly: without the
# command log the run leaves no record of what was executed.
open(my $LOG, '>', $commandsfile)
    or die("Error: cannot open command log $commandsfile: $!\n");

print {$LOG} "#$called (version $version) in $pwd on $host.\n";
print {$LOG} "#Started at ".`date +"%F %T"`."\n";
print {$LOG} "#temporary files created in $tmpDir\n";

my $procTime = time;
my $procFlag = 0;

# run_and_log(COMMAND [, SKIP])
#
# Appends COMMAND (with its output redirection) to the command log, runs it
# through the shell unless SKIP is true or $dryRun is set, and dies if the
# command returns a non-zero status.  From the second call onwards it also
# logs the wall-clock time elapsed since the previous call.
sub run_and_log {
    my $command   = $_[0];
    my $execute   = !$dryRun;
    my $exitValue = 0;    #a command we don't run is considered successful
    my $redirect;

    if (@_ > 1) {
        $execute = !$_[1];
    }

    #several of the programs like to output to STDERR, so we link that to log file
    $redirect = "1>>$outputfile 2>&1";
    #that is because we redirect STDOUT in many cases, so let's not mess with it.
    $redirect = "2>>$outputfile" if $command =~ m/>/;
    # $redirect = "" if $command =~ m/>/;

    $command = $command." ".$redirect;

    if ($procFlag == 0) {
        # First command: nothing has run yet, so there is no duration to report.
        $procFlag++;
    }
    else {
        $procTime = time - $procTime;
        printf {$LOG} "#Processing Time %02d:%02d:%02d\n",
            int($procTime / 3600), int(($procTime % 3600) / 60), int($procTime % 60);
        $procTime = time;
    }

    print {$LOG} "#not run\n" if !$execute;
    print {$LOG} "#".`date +"%F %T"`;
    print {$LOG} $command."\n";

    $exitValue = system($command) if $execute;

    #necessary if we use `` instead of system()
    #$exitValue = $? >>8;

    ##If the command failed, we want to stop it here.
    if ($exitValue != 0) {
        my $error = "Command failed with return value $exitValue : $command \n";
        print {$LOG} $error;
        close $LOG;
        die $error;
    }
}

# Start_profiling(TAG): launch collect_stats in the background for the stage
# named TAG.  Does nothing unless profiling was requested.
sub Start_profiling {
    my ($tag) = @_;
    if ($profiling) {
        system("$collectstatspath $stats -d $interval -td $tmpDir -n $sampleprefix -tag $tag -l 5 -u 1 -s 600 &");
    }
}

# Stop_Profiling(): ask collect_stats to kill every collector it started.
# Does nothing unless profiling was requested.
sub Stop_Profiling {
    if ($profiling) {
        system("$collectstatspath --kill-all");
    }
}

# run_stage(TAG, COMMAND [, CLEAN_QUEUE])
#
# One pipeline stage: start profiling, announce the stage on STDOUT, run the
# command, stop profiling, optionally delete the Queue working files
# (.queue directory, job reports, Example* directories), then pause for
# 60 seconds before the next stage begins.
sub run_stage {
    my ($stage_tag, $command, $clean_queue) = @_;
    Start_profiling($stage_tag);
    print "$stage_tag\n";
    run_and_log($command);
    Stop_Profiling();
    system("rm -rf $queueDir; rm -r $jobReport; rm -rf $ExampleDir") if $clean_queue;
    sleep(60);
}

run_stage('BwaMem',
    "$bwa mem -t $numThreads -Ma -R \'$readGroupHeader\' $refgenomeFastaFile $fqFile1 $fqFile2 > $bwamem_samFile");

run_stage('SortSam',
    "java -Dsamjdk.try_use_intel_deflater=true -jar $picard SortSam I=$bwamem_samFile O=$sort_bamFile SO=coordinate CREATE_INDEX=true TMP_DIR=$tmpDir");

run_stage('MarkDuplicates',
    "java -Dsamjdk.try_use_intel_deflater=true -jar $picard MarkDuplicates I=$sort_bamFile O=$bamDupRemFile M=$duplicateMetricsFile CREATE_INDEX=true TMP_DIR=$tmpDir");

run_stage('RealignerTargetCreator',
    "java -jar $gatk -T RealignerTargetCreator -nt $numThreads -R $refgenomeFastaFile -o $realnInterval -known:indels,vcf $dbSNPindel -I $bamDupRemFile");

# The remaining stages go through Queue.jar and leave working files behind,
# so each one is followed by the Queue clean-up.
run_stage('IndelRealigner',
    "java -Djava.io.tmpdir=$tmpDir -jar $gatk_queue -R $refgenomeFastaFile -I $bamDupRemFile -indels $dbSNPindel -S $nfs_ExampleIndelRealigner -l DEBUG -run -jobRunner CMPShell",
    1);

run_stage('BaseRecalibrator',
    "java -Djava.io.tmpdir=$tmpDir -jar $gatk_queue -I $bamRealignFile -R $refgenomeFastaFile -D $dbSNPvcf -S $nfs_ExampleBaseRecalibrator -l DEBUG -run -jobRunner CMPShell",
    1);

run_stage('PrintReads',
    "java -Djava.io.tmpdir=$tmpDir -jar $gatk_queue -R $refgenomeFastaFile -I $bamRealignFile -B $recalOut -l DEBUG -S $nfs_ExamplePrintReads -run -jobRunner CMPShell",
    1);

run_stage('HaplotypeCaller',
    "java -Djava.io.tmpdir=$tmpDir -jar $gatk_queue -R $refgenomeFastaFile -I $finalBam -l DEBUG -S $nfs_ExampleHaplotypeCaller -run -jobRunner CMPShell",
    1);

$runningTime = time - $runningTime;
printf {$LOG} "#done in %02d:%02d:%02d\n",
    int($runningTime / 3600), int(($runningTime % 3600) / 60), int($runningTime % 60);
close $LOG;

exit 0;
# _____



Steps to run Queue using CMPShell:

1) Create a 'cmpshell' directory containing CMPShellJobRunner.scala and CMPShellJobManager.scala, and add it here: scala/src/org/broadinstitute/sting/queue/engine/cmpshell
2) Recompile Queue.jar so that it can use 'CMPShell'.
3) Pass 'CMPShell' as the argument to -jobRunner when running Queue.jar.


package org.broadinstitute.sting.queue.engine.shell

import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.engine.{RunnerStatus, CommandLineJobRunner}
import java.util.Date
import org.broadinstitute.sting.utils.Utils
import org.broadinstitute.sting.utils.runtime.{ProcessSettings, OutputStreamSettings, ProcessController}
import java.util.concurrent.{Executors, ExecutorService}
import Array._

/**
 * Runs a Queue job by invoking its job script over ssh on the host returned
 * by Utils.resolveHostname(), from a task submitted to a shared executor.
 *
 * @param function      Command to run.
 * @param pool          Executor that the job's launching thread is submitted to.
 * @param creationCount Sequence number handed over by the job manager; not
 *                      read anywhere inside this class.
 */
class CMPShellJobRunner(val function: CommandLineFunction, pool: ExecutorService, creationCount: Int) extends CommandLineJobRunner {
  // Controller on the thread that started the job
  private var controller: ProcessController = null

  /**
   * Builds the command line "ssh <host> [<environment names> ;] sh <jobScript>",
   * marks the job RUNNING and hands the actual execution to the pool.  The job
   * ends up DONE when the process exits with status 0 and FAILED otherwise.
   */
  def start() {
    var executionMachineName = Utils.resolveHostname()
    val commandSep = Array(";")
    val sshCommandLine = Array("ssh", executionMachineName)
    val scriptCommandLine = Array("sh", jobScript.getAbsolutePath)
    var inputCommandLine = Array[String]()
    if (!function.jobEnvironmentNames.isEmpty) {
      // Environment names are joined and re-split on spaces, then terminated
      // with ";" so that the job script runs as a separate remote command.
      inputCommandLine = function.jobEnvironmentNames.mkString("").split(" ")
      inputCommandLine = concat(inputCommandLine, commandSep);
    }
    val commandLine = concat(sshCommandLine, inputCommandLine, scriptCommandLine)

    val stdoutSettings = new OutputStreamSettings
    val stderrSettings = new OutputStreamSettings
    // With no dedicated error file, stderr is merged into stdout.
    val mergeError = (function.jobErrorFile == null)

    stdoutSettings.setOutputFile(function.jobOutputFile, true)
    if (function.jobErrorFile != null)
      stderrSettings.setOutputFile(function.jobErrorFile, true)
    if (logger.isDebugEnabled) {
      stdoutSettings.printStandard(true)
      stderrSettings.printStandard(true)
    }

    val processSettings = new ProcessSettings(
      commandLine, mergeError, function.commandDirectory, null,
      null, stdoutSettings, stderrSettings)

    updateStatus(RunnerStatus.RUNNING)
    val runIt = new Thread(
      new Runnable {
        def run() {
          getRunInfo.exechosts = executionMachineName
          getRunInfo.startTime = new Date()

          controller = ProcessController.getThreadLocal
          val exitStatus = controller.exec(processSettings).getExitValue
          getRunInfo.doneTime = new Date()
          updateStatus(if (exitStatus == 0) RunnerStatus.DONE else RunnerStatus.FAILED)
        }
      })

    pool.execute(runIt)
  }

  /** Always reports that the status was updated; the worker thread sets the real status itself. */
  def updateJobStatus() = { true }

  /**
   * Possibly invoked from a thread, and
   * stop the controller from the originating thread
   */
  def tryStop() {
    // Assumes that after being set the job may be
    // reassigned but will not be reset back to null
    if (controller != null) {
      try {
        controller.tryDestroy()
      } catch {
        case e: Throwable =>
          logger.error("Unable to kill shell job: " + function.description)
      }
    }
  }
}

// CMPShellJobManager.scala
package org.broadinstitute.sting.queue.engine.shell

import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.queue.engine.CommandLineJobManager
import java.util.concurrent.{Executors, ExecutorService}

/**
 * Job manager for the 'CMPShell' job runner: creates CMPShellJobRunner
 * instances that all share one fixed-size thread pool.
 */
class CMPShellJobManager extends CommandLineJobManager[CMPShellJobRunner] {
  // Shared executor; created in init() and shut down in exit().
  protected var pool: ExecutorService = null

  // Number of runners created since the last init().
  var creationCount: Int = 0

  def runnerType = classOf[CMPShellJobRunner]

  /** Creates a runner for the given function, passing it the shared pool and its sequence number. */
  def create(function: CommandLineFunction) = {
    creationCount += 1
    new CMPShellJobRunner(function, pool, creationCount)
  }

  /** Resets the creation counter and allocates a pool of 64 threads. */
  override def init() {
    creationCount = 0
    pool = Executors.newFixedThreadPool(64)
  }

  /** Shuts the thread pool down. */
  override def exit() {
    pool.shutdown()
  }

  /** Returns the subset of runners whose updateJobStatus() reports true. */
  override def updateStatus(runners: Set[CMPShellJobRunner]) =
    runners.filter(runner => runner.updateJobStatus())

  /** Asks every runner to stop its process. */
  override def tryStop(runners: Set[CMPShellJobRunner]) {
    for (runner <- runners) runner.tryStop()
  }
}

For more information:
• Intel Life Sciences code optimizations: www.intel.com/healthcare/optimizecode
• GATK optimizations & reference architecture: http://www.intel.com/content/www/us/en/healthcare-it/solutions/genomicscode-gatk.html
• GATK Best Practices Guide: https://www.broadinstitute.org/gatk/guide/best-practices.php

