diff --git a/.github/workflows/gatk-tests.yml b/.github/workflows/gatk-tests.yml index 5d2e7f5f497..15a550e859e 100644 --- a/.github/workflows/gatk-tests.yml +++ b/.github/workflows/gatk-tests.yml @@ -345,12 +345,6 @@ jobs: echo "Running M2 WDL"; bash scripts/m2_cromwell_tests/run_m2_wdl.sh; - - name: "CNN_WDL_TEST" - if: ${{ matrix.wdlTest == 'RUN_CNN_WDL' }} - run: | - echo "Running CNN WDL"; - bash scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh; - - name: "VCF_SITE_LEVEL_FILTERING_WDL_TEST" if: ${{ matrix.wdlTest == 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' }} run: | diff --git a/Dockerfile b/Dockerfile index 6349f4088fa..65abb0cb939 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.3.0 +ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.3.1 # stage 1 for constructing the GATK zip FROM ${BASE_DOCKER} AS gradleBuild diff --git a/build.gradle b/build.gradle index 8ee831b3c33..c9a174dc825 100644 --- a/build.gradle +++ b/build.gradle @@ -12,16 +12,18 @@ plugins { id 'maven-publish' id 'signing' // id "jacoco" - id "de.undercouch.download" version "5.4.0" //used for downloading GSA lib - id "com.github.johnrengelman.shadow" version "8.1.1" //used to build the shadow and sparkJars - id "com.github.ben-manes.versions" version "0.12.0" //used for identifying dependencies that need updating - id 'com.palantir.git-version' version '0.5.1' //version helper - id 'org.sonatype.gradle.plugins.scan' version '2.6.1' // scans for security vulnerabilities in our dependencies + id "de.undercouch.download" version "5.6.0" //used for downloading GSA lib + id "com.gradleup.shadow" version "8.3.3" //used to build the shadow and sparkJars + id 'com.palantir.git-version' version '3.1.0' //version helper + id 'org.sonatype.gradle.plugins.scan' version '2.8.3' // scans for security vulnerabilities in our dependencies } import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar +import com.github.jengelman.gradle.plugins.shadow.transformers.AppendingTransformer + import java.time.format.DateTimeFormatter +import java.util.stream.Collectors application { mainClass = "org.broadinstitute.hellbender.Main" @@ -36,8 +38,8 @@ startScripts { } } -task downloadGsaLibFile(type: Download) { - src 'http://cran.r-project.org/src/contrib/gsalib_2.2.1.tar.gz' +tasks.register('downloadGsaLibFile', Download) { + src 'https://cran.r-project.org/src/contrib/gsalib_2.2.1.tar.gz' dest "src/main/resources/org/broadinstitute/hellbender/utils/R/gsalib.tar.gz" overwrite false } @@ -57,8 +59,8 @@ repositories { mavenLocal() } -final htsjdkVersion = System.getProperty('htsjdk.version','4.1.1') -final picardVersion = System.getProperty('picard.version','3.2.0') +final htsjdkVersion = System.getProperty('htsjdk.version','4.1.3') +final picardVersion = System.getProperty('picard.version','3.3.0') final barclayVersion = System.getProperty('barclay.version','5.0.0') final sparkVersion = System.getProperty('spark.version', '3.5.0') final hadoopVersion = System.getProperty('hadoop.version', '3.3.6') @@ -68,7 +70,7 @@ final bigQueryVersion = System.getProperty('bigQuery.version', '2.35.0') final bigQueryStorageVersion = System.getProperty('bigQueryStorage.version', '2.47.0') final guavaVersion = System.getProperty('guava.version', '32.1.3-jre') final log4j2Version = System.getProperty('log4j2Version', '2.17.1') -final testNGVersion = '7.0.0' +final testNGVersion = '7.7.0' final googleCloudNioDependency = 'com.google.cloud:google-cloud-nio:0.127.8' @@ -91,7 +93,7 @@ def 
checkForLFSStubFiles(targetFolder) { } def targetFiles = fileTree(dir: targetFolder) return targetFiles.any() { f -> - final byte[] actualBytes = readBytesFromFile(f, lfsStubFileHeader.length()); + final byte[] actualBytes = readBytesFromFile(f, lfsStubFileHeader.length()) return new String(actualBytes, "UTF-8") == lfsStubFileHeader } } @@ -104,7 +106,7 @@ def resolveLargeResourceStubFiles(largeResourcesFolder, buildPrerequisitesMessag def retCode = gitLFSExecCommand.execute().waitFor() if (retCode.intValue() != 0) { throw new GradleException("Execution of \"$gitLFSExecCommand\" failed with exit code: $retCode. " + - " git-lfs is required to build GATK but may not be installed. $buildPrerequisitesMessage"); + " git-lfs is required to build GATK but may not be installed. $buildPrerequisitesMessage") } return retCode } catch (IOException e) { @@ -163,13 +165,13 @@ if (versionOverridden) { println "Version number overridden as " + version } -configurations.all { +configurations.configureEach { resolutionStrategy { // the snapshot folder contains a dev version of guava, we don't want to use that. force 'com.google.guava:guava:' + guavaVersion // force the htsjdk version so we don't get a different one transitively force 'com.github.samtools:htsjdk:' + htsjdkVersion - force 'com.google.protobuf:protobuf-java:3.23.4' + force 'com.google.protobuf:protobuf-java:3.25.5' // force testng dependency so we don't pick up a different version via GenomicsDB force 'org.testng:testng:' + testNGVersion force 'org.broadinstitute:barclay:' + barclayVersion @@ -187,9 +189,9 @@ configurations.all { configurations*.exclude group: 'junit', module: 'junit' } -tasks.withType(JavaCompile) { - options.compilerArgs = ['-proc:none', '-Xlint:all', '-Werror', '-Xdiags:verbose'] - options.encoding = 'UTF-8' +tasks.withType(JavaCompile).configureEach { + options.compilerArgs = ['-proc:none', '-Xlint:all', '-Werror', '-Xdiags:verbose'] + options.encoding = 'UTF-8' } sourceSets { @@ -273,7 +275,7 @@ dependencies { implementation 'org.hipparchus:hipparchus-stat:2.0' implementation 'org.apache.commons:commons-collections4:4.4' implementation 'org.apache.commons:commons-vfs2:2.9.0' - implementation 'org.apache.commons:commons-configuration2:2.9.0' + implementation 'org.apache.commons:commons-configuration2:2.10.1' constraints { implementation('org.apache.commons:commons-text') { version { @@ -285,7 +287,7 @@ dependencies { implementation 'org.apache.httpcomponents:httpclient:4.5.13' implementation 'commons-beanutils:commons-beanutils:1.9.4' - implementation 'commons-io:commons-io:2.7' + implementation 'commons-io:commons-io:2.14.0' implementation 'org.reflections:reflections:0.9.10' implementation 'it.unimi.dsi:fastutil:7.0.13' @@ -437,9 +439,9 @@ run { test { // transform the list test configuration --add-opens (which must include both the runtime and test args) into // command line argument format - final testJVMAddOpens = new ArrayList<>(); - testJVMAddOpens.addAll(runtimeAddOpens); - testJVMAddOpens.addAll(testAddOpens); + final testJVMAddOpens = new ArrayList<>() + testJVMAddOpens.addAll(runtimeAddOpens) + testJVMAddOpens.addAll(testAddOpens) final testConfigurationJVMArgs = testJVMAddOpens.stream() .flatMap(openSpec -> ['--add-opens', openSpec].stream()) .toList() @@ -488,17 +490,17 @@ def createGatkSymlinks(destinationDir, archiveFile, suffix, baseJarName, seconda logger.info("build for version:" + version) group = 'org.broadinstitute' -tasks.withType(Jar) { +tasks.withType(Jar).configureEach { // transform the list of 
--add-opens directives into manifest format, which requires only the source // package (unlike the command line equivalent, in the manifest the "ALL-UNNAMED" target is implied // and can't be included in the manifest entry syntax) final manifestAddOpens = runtimeAddOpens.stream() .map(o -> o.substring(0, (o.length() - "ALL-UNNAMED".length()) - 1)) - .collect(java.util.stream.Collectors.joining(' ')) + .collect(Collectors.joining(' ')) manifest { attributes 'Implementation-Title': 'The Genome Analysis Toolkit (GATK)', 'Implementation-Version': archiveVersion.get(), - 'Toolkit-Short-Name' : 'GATK', + 'Toolkit-Short-Name': 'GATK', 'Main-Class': application.mainClass, 'Picard-Version': picardVersion, 'htsjdk-Version': htsjdkVersion, @@ -509,10 +511,10 @@ tasks.withType(Jar) { } wrapper { - gradleVersion = '8.2.1' + gradleVersion = '8.10.2' } -tasks.withType(ShadowJar) { +tasks.withType(ShadowJar).configureEach { from(project.sourceSets.main.output) archiveBaseName = project.name + '-package' mergeServiceFiles() @@ -524,7 +526,7 @@ tasks.withType(ShadowJar) { // Suggested by the akka devs to make sure that we do not get the spark configuration error. // http://doc.akka.io/docs/akka/snapshot/general/configuration.html#When_using_JarJar__OneJar__Assembly_or_any_jar-bundler - transform(com.github.jengelman.gradle.plugins.shadow.transformers.AppendingTransformer) { + transform(AppendingTransformer) { resource = 'reference.conf' } } @@ -543,9 +545,9 @@ shadowJar { } } -task localJar{ dependsOn shadowJar } +tasks.register('localJar') { dependsOn shadowJar } -task sparkJar(type: ShadowJar) { +tasks.register('sparkJar', ShadowJar) { group = "Shadow" description = "Create a combined jar of project and runtime dependencies that excludes provided spark dependencies" configurations = [project.configurations.sparkConfiguration] @@ -559,7 +561,7 @@ task sparkJar(type: ShadowJar) { } // A jar that only contains the test classes and resources (to be extracted for testing) -task shadowTestClassJar(type: ShadowJar){ +tasks.register('shadowTestClassJar', ShadowJar) { group = "Shadow" from sourceSets.test.output description = "Create a jar that packages the compiled test classes" @@ -567,19 +569,19 @@ task shadowTestClassJar(type: ShadowJar){ } // A minimal jar that only contains the extra dependencies needed for running the tests -task shadowTestJar(type: ShadowJar){ +tasks.register('shadowTestJar', ShadowJar) { dependsOn 'compileTestUtilsJava', 'processTestUtilsResources' group = "Shadow" description = " A minimal jar that only contains the extra dependencies needed for running the tests that arent packaged in the main shadow jar" from { - (project.configurations.testRuntimeClasspath - project.configurations.runtimeClasspath ).collect { + (project.configurations.testRuntimeClasspath - project.configurations.runtimeClasspath).collect { it.isDirectory() ? it : it.getName().endsWith(".jar") ? 
zipTree(it) : it } } archiveClassifier = "testDependencies" } -task collectBundleIntoDir(type: Copy) { +tasks.register('collectBundleIntoDir', Copy) { dependsOn shadowJar, sparkJar, 'condaEnvironmentDefinition', 'gatkTabComplete', 'gatkDoc' doFirst { @@ -605,11 +607,11 @@ task collectBundleIntoDir(type: Copy) { from("scripts/sv", { into("scripts/sv") }) from("scripts/cnv_wdl/", { into("scripts/cnv_wdl") }) from("scripts/mutect2_wdl/", { into("scripts/mutect2_wdl") }) - from("scripts/dataproc-cluster-ui", { into("scripts/")}) + from("scripts/dataproc-cluster-ui", { into("scripts/") }) into "$buildDir/bundle-files-collected" } -task bundle(type: Zip) { +tasks.register('bundle', Zip) { dependsOn collectBundleIntoDir zip64 true @@ -639,17 +641,18 @@ task bundle(type: Zip) { //} //} -task jacocoTestReport { +//This is a stub, so we don't have to change our github action targets while we've disabled jacoco +tasks.register('jacocoTestReport') { dependsOn test } -task condaStandardEnvironmentDefinition(type: Copy) { +tasks.register('condaStandardEnvironmentDefinition', Copy) { from "scripts" into buildDir include gatkCondaTemplate rename { file -> gatkCondaYML } - expand(["condaEnvName":"gatk", - "condaEnvDescription" : "Conda environment for GATK Python Tools"]) + expand(["condaEnvName" : "gatk", + "condaEnvDescription": "Conda environment for GATK Python Tools"]) doLast { logger.lifecycle("Created standard Conda environment yml file: $gatkCondaYML") } @@ -657,12 +660,12 @@ task condaStandardEnvironmentDefinition(type: Copy) { // Create GATK conda environment yml file from the conda enc template -task condaEnvironmentDefinition() { +tasks.register('condaEnvironmentDefinition') { dependsOn 'pythonPackageArchive', 'condaStandardEnvironmentDefinition' } // Create the Python package archive file -task pythonPackageArchive(type: Zip) { +tasks.register('pythonPackageArchive', Zip) { inputs.dir "src/main/python/org/broadinstitute/hellbender/" outputs.file pythonPackageArchiveName doFirst { @@ -685,36 +688,39 @@ task pythonPackageArchive(type: Zip) { // NOTE: This CREATES a local conda environment; but does not *activate* it. The environment must // be activated manually in the shell from which GATK will be run. 
// -task localDevCondaEnv(type: Exec) { +tasks.register('localDevCondaEnv', Exec) { dependsOn 'condaEnvironmentDefinition' inputs.file("$buildDir/$pythonPackageArchiveName") workingDir "$buildDir" commandLine "conda", "env", "create", "--yes", "-f", gatkCondaYML } -task localDevCondaUpdate(type: Exec) { +// update the conda environment without completely rebuilding +// this may be faster +tasks.register('localDevCondaUpdate', Exec) { dependsOn 'condaEnvironmentDefinition' inputs.file("$buildDir/$pythonPackageArchiveName") workingDir "$buildDir" commandLine "conda", "env", "update", "-f", gatkCondaYML } -task javadocJar(type: Jar, dependsOn: javadoc) { +tasks.register('javadocJar', Jar) { + dependsOn javadoc archiveClassifier = 'javadoc' from "$docBuildDir/javadoc" } -task sourcesJar(type: Jar) { +tasks.register('sourcesJar', Jar) { from sourceSets.main.allSource archiveClassifier = 'sources' } -task testUtilsJar(type: Jar){ +tasks.register('testUtilsJar', Jar) { archiveBaseName = "$project.name-test-utils" from sourceSets.testUtils.output } -tasks.withType(Javadoc) { +tasks.withType(Javadoc).configureEach { // do this for all javadoc tasks, including gatkDoc options.addStringOption('Xdoclint:none') options.addStringOption('encoding', 'UTF-8') @@ -729,7 +735,7 @@ javadoc { } -task testUtilsJavadoc(type: Javadoc) { +tasks.register('testUtilsJavadoc', Javadoc) { // This is a hack to disable the java default javadoc lint until we fix the html formatting // We only want to do this for the javadoc task, not gatkDoc options.addStringOption('Xdoclint:none', '-quiet') @@ -739,27 +745,29 @@ task testUtilsJavadoc(type: Javadoc) { include '**/*.java' } -task testUtilsJavadocJar(type: Jar, dependsOn: testUtilsJavadoc){ +tasks.register('testUtilsJavadocJar', Jar) { + dependsOn testUtilsJavadoc archiveBaseName = "$project.name-test-utils" archiveClassifier = 'javadoc' from "$docBuildDir/testUtilsJavadoc" } -task testUtilsSourcesJar(type: Jar){ +tasks.register('testUtilsSourcesJar', Jar) { archiveBaseName = "$project.name-test-utils" archiveClassifier = 'sources' from sourceSets.testUtils.allSource } // Generate GATK Online Doc -task gatkDoc(type: Javadoc, dependsOn: classes) { +tasks.register('gatkDoc', Javadoc) { + dependsOn classes final File gatkDocDir = new File("$docBuildDir/gatkdoc") doFirst { // make sure the output folder exists or we can create it if (!gatkDocDir.exists() && !gatkDocDir.mkdirs()) { throw new GradleException(String.format("Failure creating folder (%s) for GATK doc output in task (%s)", gatkDocDir.getAbsolutePath(), - it.name)); + it.name)) } copy { from('src/main/resources/org/broadinstitute/hellbender/utils/helpTemplates') @@ -784,7 +792,7 @@ task gatkDoc(type: Javadoc, dependsOn: classes) { outputs.dir(gatkDocDir) options.destinationDirectory(gatkDocDir) - options.addStringOption("settings-dir", "src/main/resources/org/broadinstitute/hellbender/utils/helpTemplates"); + options.addStringOption("settings-dir", "src/main/resources/org/broadinstitute/hellbender/utils/helpTemplates") if (project.hasProperty('phpDoc')) { // use -PphpDoc to generate .php file extensions, otherwise rely on default of .html final String phpExtension = "php" @@ -796,14 +804,15 @@ task gatkDoc(type: Javadoc, dependsOn: classes) { } // Generate GATK Bash Tab Completion File -task gatkTabComplete(type: Javadoc, dependsOn: classes) { +tasks.register('gatkTabComplete', Javadoc) { + dependsOn classes final File tabCompletionDir = new File("$docBuildDir/tabCompletion") doFirst { // make sure the output folder 
exists or we can create it if (!tabCompletionDir.exists() && !tabCompletionDir.mkdirs()) { throw new GradleException(String.format("Failure creating folder (%s) for GATK tab completion output in task (%s)", tabCompletionDir.getAbsolutePath(), - it.name)); + it.name)) } } // Include the Picard source jar, which contains various .R, .sh, .css, .html, .xml and .MF files and @@ -849,13 +858,14 @@ task gatkTabComplete(type: Javadoc, dependsOn: classes) { options.addStringOption("caller-post-arg-max-occurs", "1 1 1 1 1 1 1 1 1 1") } -def getWDLInputJSONTestFileNameFromWDLName(File wdlName) { +static def getWDLInputJSONTestFileNameFromWDLName(File wdlName) { String fileWithoutExt = wdlName.name.take(wdlName.name.lastIndexOf('.')) return new File (wdlName.getParentFile(), fileWithoutExt + "Inputs.json").getAbsolutePath() } // Generate GATK Tool WDL -task gatkWDLGen(type: Javadoc, dependsOn: classes) { +tasks.register('gatkWDLGen', Javadoc) { + dependsOn classes final File gatkWDLDir = new File("$docBuildDir/wdlGen") outputs.dir(gatkWDLDir) doFirst { @@ -863,7 +873,7 @@ task gatkWDLGen(type: Javadoc, dependsOn: classes) { if (!gatkWDLDir.exists() && !gatkWDLDir.mkdirs()) { throw new GradleException(String.format("Failure creating folder (%s) for GATK WDL output in task (%s)", gatkWDLDir.getAbsolutePath(), - it.name)); + it.name)) } copy { from('src/main/resources/org/broadinstitute/hellbender/utils/wdlTemplates/common.html') @@ -885,7 +895,7 @@ task gatkWDLGen(type: Javadoc, dependsOn: classes) { outputs.dir(gatkWDLDir) options.destinationDirectory(gatkWDLDir) - options.addStringOption("settings-dir", "src/main/resources/org/broadinstitute/hellbender/utils/wdlTemplates"); + options.addStringOption("settings-dir", "src/main/resources/org/broadinstitute/hellbender/utils/wdlTemplates") options.addStringOption("output-file-extension", "wdl") options.addStringOption("index-file-extension", "html") @@ -906,11 +916,11 @@ def execWDLValidation = { validateWDL -> } return retCode } catch (IOException e) { - throw new GradleException("An IOException occurred while attempting to execute the command $validateWDL.") + throw new GradleException("An IOException occurred while attempting to execute the command $validateWDL.", e) } } -task gatkValidateScriptsWdl() { +tasks.register('gatkValidateScriptsWdl') { doFirst { // running this task requires a local cromwell installation, with environment variables CROMWELL_JAR, // WOMTOOL_JAR set to the jar locations @@ -933,7 +943,8 @@ task gatkValidateScriptsWdl() { } } -task gatkValidateGeneratedWdl(dependsOn: [gatkWDLGen, shadowJar]) { +tasks.register('gatkValidateGeneratedWdl') { + dependsOn(gatkWDLGen, shadowJar) doFirst { // running this task requires a local cromwell installation, with environment variables CROMWELL_JAR, // WOMTOOL_JAR set to the jar locations @@ -1020,7 +1031,7 @@ signing { def basePomConfiguration = { packaging = 'jar' description = 'Development on GATK 4' - url = 'http://github.com/broadinstitute/gatk' + url = 'https://github.com/broadinstitute/gatk' scm { url = 'scm:git@github.com:broadinstitute/gatk.git' @@ -1093,8 +1104,8 @@ publish { } } -task installSpark{ dependsOn sparkJar } -task installAll{ dependsOn installSpark, installDist } +tasks.register('installSpark') { dependsOn sparkJar } +tasks.register('installAll') { dependsOn installSpark, installDist } installDist.dependsOn downloadGsaLibFile downloadGsaLibFile.dependsOn sourcesJar diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 00000000000..d7a34a5028f --- 
/dev/null +++ b/gradle.properties @@ -0,0 +1 @@ +org.gradle.jvmargs=-Xmx2g diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index 249e5832f09..033e24c4cdf 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 84a0b92f9af..df97d72b8b9 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.2.1-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.10.2-bin.zip +networkTimeout=10000 +validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew index a69d9cb6c20..fcb6fca147c 100755 --- a/gradlew +++ b/gradlew @@ -55,7 +55,7 @@ # Darwin, MinGW, and NonStop. # # (3) This script is generated from the Groovy template -# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt # within the Gradle project. # # You can find Gradle at https://github.com/gradle/gradle/. @@ -80,13 +80,10 @@ do esac done -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit - -APP_NAME="Gradle" +# This is normally unused +# shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD=maximum @@ -133,22 +130,29 @@ location of your Java installation." fi else JAVACMD=java - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." + fi fi # Increase the maximum file descriptors if we can. if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then case $MAX_FD in #( max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC3045 MAX_FD=$( ulimit -H -n ) || warn "Could not query maximum file descriptor limit" esac case $MAX_FD in #( '' | soft) :;; #( *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC3045 ulimit -n "$MAX_FD" || warn "Could not set maximum file descriptor limit to $MAX_FD" esac @@ -193,6 +197,10 @@ if "$cygwin" || "$msys" ; then done fi + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + # Collect all arguments for the java command; # * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of # shell script including quotes and variable substitutions, so put them in diff --git a/scripts/cnn_variant_cromwell_tests/README.md b/scripts/cnn_variant_cromwell_tests/README.md deleted file mode 100644 index c137747ab65..00000000000 --- a/scripts/cnn_variant_cromwell_tests/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# CNN Variant Automated Tests for WDL - -**This directory is for GATK devs only** - -This directory contains scripts for running CNN Variant WDL tests in the automated build environment. - -Please note that this only tests whether the WDL will complete successfully. diff --git a/scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh b/scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh deleted file mode 100644 index 18a4d824c3d..00000000000 --- a/scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash -l -set -e -#cd in the directory of the script in order to use relative paths -script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P ) -cd "$script_path" - -WORKING_DIR=/home/runner/work/gatk - -set -e -echo "Building docker image for CNN WDL tests (skipping unit tests)..." - -#assume Dockerfile is in root -echo "Building docker without running unit tests... =========" -cd $WORKING_DIR/gatk - -# IMPORTANT: This code is duplicated in the cnv and M2 WDL test. -if [ ! -z "$CI_PULL_REQUEST" ]; then - HASH_TO_USE=FETCH_HEAD - sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${CI_PULL_REQUEST}; - echo "using fetch head:"$HASH_TO_USE -else - HASH_TO_USE=${CI_COMMIT} - sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/; - echo "using travis commit:"$HASH_TO_USE -fi -echo "Docker build done ==========" - -cd $WORKING_DIR/gatk/scripts/ -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" cnn_variant_wdl/jsons/cnn_score_variants_travis.json >$WORKING_DIR/cnn_score_variants_travis.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" cnn_variant_wdl/jsons/cnn_score_variants_travis_1d.json >$WORKING_DIR/cnn_score_variants_travis_1d.json -sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" cnn_variant_wdl/jsons/cram2filtered_travis.json >$WORKING_DIR/cram2filtered_travis.json -echo "JSON FILES (modified) =======" -cat $WORKING_DIR/cnn_score_variants_travis.json -cat $WORKING_DIR/cnn_score_variants_travis_1d.json -cat $WORKING_DIR/cram2filtered_travis.json -echo "==================" - - -echo "Running CNN Score Variants WDL through cromwell" -ln -fs $WORKING_DIR/gatk/scripts/cnn_variant_wdl/cnn_score_variants.wdl -cd $WORKING_DIR/gatk/scripts/cnn_variant_wdl/ -java -jar $CROMWELL_JAR run cnn_score_variants.wdl -i $WORKING_DIR/cnn_score_variants_travis_1d.json -java -jar $CROMWELL_JAR run cnn_score_variants.wdl -i $WORKING_DIR/cnn_score_variants_travis.json -java -jar $CROMWELL_JAR run cram2filtered.wdl -i $WORKING_DIR/cram2filtered_travis.json diff --git a/scripts/cnn_variant_wdl/README.md b/scripts/cnn_variant_wdl/README.md deleted file mode 100644 index e3b19930d4b..00000000000 --- a/scripts/cnn_variant_wdl/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# gatk4-cnn-variant-filter - -### Purpose : -These workflows take advantage of GATK's CNN tool which uses a deep learning -approach to filter variants based on Convolutional Neural Networks. 
- -Please read the following post to learn more about the CNN tool: [Deep Learning in GATK4](https://github.com/broadinstitute/gatk-docs/blob/3333b5aacfd3c48a87b60047395e1febc98c21f9/blog-2012-to-2019/2017-12-21-Deep_learning_in_GATK4.md). - -### cram2filtered.wdl -This workflow takes an input CRAM/BAM to call variants with HaplotypeCaller -then filters the calls with the CNNScoreVariant neural net tool using the filtering model specified. - -The site-level scores are added to the `INFO` field of the VCF. The architecture arguments, -`info_key` and `tensor_type` arguments MUST be in agreement (e.g. 2D models must have -`tensor_type` of `read_tensor` and `info_key` of `CNN_2D`, 1D models must have `tensor_type` of -`reference` and `info_key` of `CNN_1D`). The `INFO` field key will be `CNN_1D` or `CNN_2D` -depending on the neural net architecture used for inference. The architecture arguments -specify pre-trained networks. New networks can be trained by the GATK tools: CNNVariantWriteTensors -and CNNVariantTrain. The CRAM could be generated by the [single-sample pipeline](https://github.com/gatk-workflows/gatk4-data-processing/blob/master/processing-for-variant-discovery-gatk4.wdl). -If you would like test to the workflow on a more representative example file, use the following -CRAM file as input and change the scatter count from 4 to 200: gs://gatk-best-practices/cnn-h38/NA12878_NA12878_IntraRun_1_SM-G947Y_v1.cram. - -#### Requirements/expectations : - - CRAM/BAM - - BAM Index (if input is BAM) - -#### Output : - - Filtered VCF and its index. - -### cram2model.wdl -This **optional** workflow is for advanced users who would like to train a CNN model for filtering variants for specific use cases (e.g. custom panels, non-human, or non-Illumina sequencing). - -#### Requirements/expectations : - - CRAM - - Truth VCF and its index - - Truth Confidence Interval Bed - -#### Output : - - Model HD5 - - Model JSON - - Model Plots PNG - -### run_happy.wdl -This **optional** evaluation and plotting workflow runs a filtering model against truth data (e.g. [NIST Genomes in a Bottle](https://github.com/genome-in-a-bottle/giab_latest_release), [Synthic Diploid Truth Set](https://github.com/lh3/CHM-eval/releases) ) and plots the accuracy. - -#### Requirements/expectations : - - File of VCF Files - - Truth VCF and its index - - Truth Confidence Interval Bed - -#### Output : - - Evaluation summary - - Plots - -### Important Notes : -- Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -- For help running workflows on the Google Cloud Platform or locally please -view the following tutorial [(How to) Execute Workflows from the gatk-workflows Git Organization](https://gatk.broadinstitute.org/hc/en-us/articles/360035530952). -- Please visit the [User Guide](https://gatk.broadinstitute.org/hc/en-us/categories/360002310591) site for further documentation on our workflows and tools. - -### Contact Us : -- The following material is provided by the Data Science Platforum group at the Broad Institute. Please direct any questions or concerns to one of our forum sites : [GATK](https://gatk.broadinstitute.org/hc/en-us/community/topics) or [Terra](https://support.terra.bio/hc/en-us/community/topics/360000500432). - -### LICENSING : - This script is released under the GATK source code license (Apache 2.0) (see LICENSE in - https://github.com/broadinstitute/gatk). Note however that the programs it calls may - be subject to different licenses. 
Users are responsible for checking that they are - authorized to run all programs before running this script. diff --git a/scripts/cnn_variant_wdl/cnn_score_variants.wdl b/scripts/cnn_variant_wdl/cnn_score_variants.wdl deleted file mode 100644 index 9b0abb4223f..00000000000 --- a/scripts/cnn_variant_wdl/cnn_score_variants.wdl +++ /dev/null @@ -1,113 +0,0 @@ -# The CNNScoreVariants tool annotates a VCF with scores from a Neural Net as part of a single-sample workflow. -# The site-level scores are added to the INFO field of the VCF. -# The architecture arguments, info_key and tensor type arguments MUST be in agreement -# (e.g. 2D models must have tensor_type of read_tensor and info_key CNN_2D, 1D models have tensor_type reference and info_key CNN_1D) -# The INFO field key will be "1D_CNN" or "2D_CNN" depending on the neural net architecture used for inference. -# The architecture arguments specify pre-trained networks. -# New networks can be trained by the GATK tools: CNNVariantWriteTensors and CNNVariantTrain -# The bam file and index are only required by 2D CNNs which take a read-level tensor_type such as "read_tensor". -# For 1D CNNs the tensor_type is typically "reference". -# Parallelization over sites is controlled by the scatter_count variable. - -import "cnn_variant_common_tasks.wdl" as CNNTasks - -workflow CNNScoreVariantsWorkflow { - File input_vcf # The VCF to annotate with scores - File input_vcf_index - File reference_fasta - File reference_dict - File reference_fasta_index - Array[File] resources # List of VCF file names of resources of known SNPs and INDELs, (e.g. mills, gnomAD) - Array[File] resources_index # List of VCF file indices of resources - File? bam_file # Bam (or HaplotypeCaller-generated "bamout") file from which input_vcf was called, required by read-level architectures - File? bam_file_index - File? architecture_json # Neural Net configuration for CNNScoreVariants - File? architecture_hd5 # Pre-Trained weights and architecture for CNNScoreVariants - String? tensor_type # Keyword indicating the shape of the input tensor (e.g. read_tensor, reference) - String info_key # The score key for the INFO field of the vcf (e.g. CNN_1D, CNN_2D) - String snp_tranches # Filtering threshold(s) for SNPs in terms of sensitivity to overlapping known variants in resources - String indel_tranches # Filtering threshold(s) for INDELs in terms of sensitivity to overlapping known variants in resources - String? filter_tranches_extra # Additional arguments for filter variant tranches - String output_prefix # Identifying string for this run which will be used to name output files (the gzipped VCF and, for the 2D CNN, bamout) - Int? inference_batch_size # Batch size for python in CNNScoreVariants - Int? transfer_batch_size # Batch size for java transfers to python in CNNScoreVariants - Int? intra_op_threads # Tensorflow threading within nodes - Int? inter_op_threads # Tensorflow threading between nodes - File? gatk_override - String gatk_docker - File calling_intervals - Int scatter_count - Int? preemptible_attempts - Int? cnn_task_mem_gb - Int? cnn_task_cpu - Int? 
mem_gb - - call CNNTasks.SplitIntervals { - input: - gatk_override = gatk_override, - scatter_count = scatter_count, - intervals = calling_intervals, - ref_fasta = reference_fasta, - ref_dict = reference_dict, - ref_fai = reference_fasta_index, - preemptible_attempts = preemptible_attempts, - gatk_docker = gatk_docker - } - - scatter (calling_interval in SplitIntervals.interval_files) { - - call CNNTasks.CNNScoreVariants { - input: - input_vcf = input_vcf, - input_vcf_index = input_vcf_index, - reference_fasta = reference_fasta, - reference_dict = reference_dict, - reference_fasta_index = reference_fasta_index, - bam_file = bam_file, - bam_file_index = bam_file_index, - architecture_json = architecture_json, - architecture_hd5 = architecture_hd5, - tensor_type = tensor_type, - inference_batch_size = inference_batch_size, - transfer_batch_size = transfer_batch_size, - intra_op_threads = intra_op_threads, - inter_op_threads = inter_op_threads, - output_prefix = output_prefix, - interval_list = calling_interval, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - mem_gb = cnn_task_mem_gb, - cpu = cnn_task_cpu - } - } - - call CNNTasks.MergeVCFs as MergeVCF_CNN { - input: - input_vcfs = CNNScoreVariants.cnn_annotated_vcf, - output_prefix = output_prefix, - preemptible_attempts = preemptible_attempts, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - call CNNTasks.FilterVariantTranches { - input: - input_vcf = MergeVCF_CNN.merged_vcf, - input_vcf_index = MergeVCF_CNN.merged_vcf_index, - resources = resources, - resources_index = resources_index, - output_prefix = output_prefix, - snp_tranches = snp_tranches, - indel_tranches = indel_tranches, - info_key = info_key, - extra_args = filter_tranches_extra, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - gatk_docker = gatk_docker - } - - output { - FilterVariantTranches.* - } -} diff --git a/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl b/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl deleted file mode 100644 index 3db1b5eacec..00000000000 --- a/scripts/cnn_variant_wdl/cnn_variant_common_tasks.wdl +++ /dev/null @@ -1,359 +0,0 @@ -task CNNScoreVariants { - File input_vcf - File input_vcf_index - File reference_fasta - File reference_dict - File reference_fasta_index - String output_prefix - File? bam_file - File? bam_file_index - File? architecture_json - File? architecture_hd5 - Int? inference_batch_size - Int? transfer_batch_size - Int? intra_op_threads - Int? inter_op_threads - String? tensor_type - - File interval_list - File? gatk_override - - # Runtime parameters - Int? mem_gb - String gatk_docker - Int? preemptible_attempts - Int? disk_space_gb - Int? 
cpu - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 6000 - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem / 2 - -command <<< - - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" \ - CNNScoreVariants \ - ${"-I " + bam_file} \ - ${"--read-index " + bam_file_index} \ - -R ${reference_fasta} \ - -V ${input_vcf} \ - -O ${output_prefix}_cnn_annotated.vcf.gz \ - -L ${interval_list} \ - ${"--architecture " + architecture_json} \ - ${"--tensor-type " + tensor_type} \ - ${"--inference-batch-size " + inference_batch_size} \ - ${"--transfer-batch-size " + transfer_batch_size} \ - ${"--intra-op-threads " + intra_op_threads} \ - ${"--inter-op-threads " + inter_op_threads} - ->>> - - runtime { - docker: "${gatk_docker}" - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) - zones: "us-central1-b" # Needs to be a zone that guarantees CPUs with AVX see (https://cloud.google.com/compute/docs/regions-zones/) - bootDiskSizeGb: "16" - } - - output { - Array[File] log = glob("gatkStreamingProcessJournal*") - File cnn_annotated_vcf = "${output_prefix}_cnn_annotated.vcf.gz" - File cnn_annotated_vcf_index = "${output_prefix}_cnn_annotated.vcf.gz.tbi" - } -} - -task RunHC4 { - File input_bam - File input_bam_index - File reference_fasta - File reference_dict - File reference_fasta_index - String output_prefix - File interval_list - String extra_args - File? gatk_override - - # Runtime parameters - Int? mem_gb - String gatk_docker - Int? preemptible_attempts - Int disk_space_gb - Int? cpu - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 8000 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" \ - HaplotypeCaller \ - -R ${reference_fasta} \ - -I ${input_bam} \ - --read-index ${input_bam_index} \ - -O ${output_prefix}_hc4.vcf.gz \ - -L ${interval_list} \ - -bamout ${output_prefix}_bamout.bam \ - ${extra_args} - } - - output { - File bamout = "${output_prefix}_bamout.bam" - File bamout_index = "${output_prefix}_bamout.bai" - File raw_vcf = "${output_prefix}_hc4.vcf.gz" - File raw_vcf_index = "${output_prefix}_hc4.vcf.gz.tbi" - } - runtime { - docker: "${gatk_docker}" - memory: machine_mem + " MB" - # Note that the space before SSD and HDD should be included. - disks: "local-disk " + sub(disk_space_gb, "\\..*", "") + " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) - bootDiskSizeGb: "16" - } -} - - -task FilterVariantTranches { - File input_vcf - File input_vcf_index - Array[File] resources - Array[File] resources_index - String output_prefix - String snp_tranches - String indel_tranches - String info_key - String? extra_args - File? gatk_override - - # Runtime parameters - Int? mem_gb - String gatk_docker - Int? preemptible_attempts - Int? disk_space_gb - Int? 
cpu - - String output_vcf = "${output_prefix}_cnn_filtered.vcf.gz" - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 16000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). - Int default_disk_space_gb = 200 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - -command <<< - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" \ - FilterVariantTranches \ - -V ${input_vcf} \ - --output ${output_vcf} \ - -resource ${sep=" -resource " resources} \ - -info-key ${info_key} \ - ${snp_tranches} \ - ${indel_tranches} \ - ${extra_args} ->>> - - runtime { - docker: "${gatk_docker}" - memory: machine_mem + " MB" - # Note that the space before HDD and HDD should be included. - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) - bootDiskSizeGb: "16" - } - - output { - File cnn_filtered_vcf = "${output_vcf}" - File cnn_filtered_vcf_index = "${output_vcf}.tbi" - } -} - -task SplitIntervals { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - String? split_intervals_extra_args - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space - Int? cpu - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 500 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" \ - SplitIntervals \ - -R ${ref_fasta} \ - ${"-L " + intervals} \ - -scatter ${scatter_count} \ - -O ./ \ - ${split_intervals_extra_args} - } - - runtime { - docker: "${gatk_docker}" - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) - bootDiskSizeGb: "16" - } - - output { - Array[File] interval_files = glob("*.interval_list") - } -} - -task MergeVCFs { - # inputs - Array[File] input_vcfs - String output_prefix - - File? gatk_override - - # runtime - String gatk_docker - Int? mem - Int? preemptible_attempts - Int? disk_space_gb - Int? 
cpu - - String output_vcf = "${output_prefix}_cnn_scored.vcf.gz" - - Int default_disk_space_gb = 100 - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3500 - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx${command_mem}m" MergeVcfs \ - -I ${sep=' -I ' input_vcfs} -O "${output_vcf}" - } - - runtime { - docker: "${gatk_docker}" - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) - bootDiskSizeGb: "16" - } - - output { - File merged_vcf = "${output_vcf}" - File merged_vcf_index = "${output_vcf}.tbi" - } -} - -task CramToBam { - File reference_fasta - File reference_fasta_index - File reference_dict - File cram_file - String output_prefix - - # Runtime parameters - Int? mem_gb - Int? preemptible_attempts - Int disk_space_gb - Int? cpu - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 16000 - - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - -command <<< - ls -ltr ${cram_file} ${reference_fasta} && - echo "ls (1): complete" && - samtools view -h -T ${reference_fasta} ${cram_file} | - samtools view -b -o ${output_prefix}.bam - && - echo "samtools view: complete" && - ls -ltr . && - echo "ls (2): complete" && - samtools index -b ${output_prefix}.bam && - echo "samtools index: complete" && - ls -ltr . && - echo "ls (3): complete" && - mv ${output_prefix}.bam.bai ${output_prefix}.bai && - echo "mv: complete" && - ls -ltr . && - echo "ls (4): complete" - >>> - runtime { - docker: "broadinstitute/gatk:gatkbase-3.2.0" - memory: machine_mem + " MB" - - # Note that the space before SSD and HDD should be included. - disks: "local-disk " + disk_space_gb + " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) - } - - output { - File output_bam = "${output_prefix}.bam" - File output_bam_index = "${output_prefix}.bai" - } -} - -task SamtoolsMergeBAMs { - Array[File] input_bams - String output_prefix - Int disk_space_gb - command { - samtools merge ${output_prefix}_bamout.bam ${sep=' ' input_bams} - samtools index ${output_prefix}_bamout.bam ${output_prefix}_bamout.bai - } - - output { - File bamout = "${output_prefix}_bamout.bam" - File bamout_index = "${output_prefix}_bamout.bai" - } - - runtime { - docker: "broadinstitute/gatk:gatkbase-3.2.0" - memory: "16 GB" - disks: "local-disk " + disk_space_gb + " HDD" - } -} diff --git a/scripts/cnn_variant_wdl/cram2filtered.wdl b/scripts/cnn_variant_wdl/cram2filtered.wdl deleted file mode 100755 index ed07fd4543d..00000000000 --- a/scripts/cnn_variant_wdl/cram2filtered.wdl +++ /dev/null @@ -1,158 +0,0 @@ -# This workflow takes an input CRAM to call variants with HaplotypeCaller -# Then filters the calls with the CNNVariant neural net tool -# The site-level scores are added to the INFO field of the VCF. -# The architecture arguments, info_key and tensor type arguments MUST be in agreement -# (e.g. 
2D models must have tensor_type of read_tensor and info_key CNN_2D, 1D models have tensor_type reference and info_key CNN_1D) -# The INFO field key will be "1D_CNN" or "2D_CNN" depending on the neural net architecture used for inference. -# The architecture arguments specify pre-trained networks. -# New networks can be trained by the GATK tools: CNNVariantWriteTensors and CNNVariantTrain -# The CRAM could be generated by the single-sample pipeline -# (https://github.com/gatk-workflows/gatk4-data-processing/blob/master/processing-for-variant-discovery-gatk4.wdl) -# Also accepts a BAM as the input file in which case a BAM index is required as well. - -import "cnn_variant_common_tasks.wdl" as CNNTasks - -workflow Cram2FilteredVcf { - File input_file # Aligned CRAM file or Aligned BAM files - File? input_file_index # Index for an aligned BAM file if that is the input, unneeded if input is a CRAM - File reference_fasta - File reference_dict - File reference_fasta_index - Array[File] resources # List of VCF file names of resources of known SNPs and INDELs, (e.g. mills, gnomAD) - Array[File] resources_index # List of VCF file indices of resources - File? architecture_json # Neural Net configuration for CNNScoreVariants - File? architecture_hd5 # Pre-Trained weights and architecture for CNNScoreVariants - Int? inference_batch_size # Batch size for python in CNNScoreVariants - Int? transfer_batch_size # Batch size for java in CNNScoreVariants - Int? intra_op_threads # Tensorflow threading within nodes - Int? inter_op_threads # Tensorflow threading between nodes - String output_prefix # Identifying string for this run will be used to name all output files - String? tensor_type # What kind of tensors the Neural Net expects (e.g. reference, read_tensor) - String info_key # The score key for the info field of the vcf (e.g. CNN_1D, CNN_2D) - String snp_tranches # Filtering threshold(s) for SNPs in terms of sensitivity to overlapping known variants in resources - String indel_tranches # Filtering threshold(s) for INDELs in terms of sensitivity to overlapping known variants in resources - File? gatk_override # GATK Jar file to over ride the one included in gatk_docker - String gatk_docker - File calling_intervals - Int scatter_count # Number of shards for parallelization of HaplotypeCaller and CNNScoreVariants - String extra_args # Extra arguments for HaplotypeCaller - - # Runtime parameters - Int? mem_gb - Int? preemptible_attempts - Float? disk_space_gb - Int? cpu - - Int? 
increase_disk_size - Int additional_disk = select_first([increase_disk_size, 20]) - Float ref_size = size(reference_fasta, "GB") + size(reference_fasta_index, "GB") + size(reference_dict, "GB") - - # Clunky check to see if the input is a BAM or a CRAM - if (basename(input_file) == basename(input_file, ".bam")){ - call CNNTasks.CramToBam { - input: - reference_fasta = reference_fasta, - reference_dict = reference_dict, - reference_fasta_index = reference_fasta_index, - cram_file = input_file, - output_prefix = output_prefix, - disk_space_gb = round(4*size(input_file, "GB") + ref_size + additional_disk), - preemptible_attempts = preemptible_attempts - } - } - - call CNNTasks.SplitIntervals { - input: - gatk_override = gatk_override, - scatter_count = scatter_count, - intervals = calling_intervals, - ref_fasta = reference_fasta, - ref_dict = reference_dict, - ref_fai = reference_fasta_index, - gatk_docker = gatk_docker, - disk_space = round(additional_disk + ref_size) - } - - String input_bam = select_first([CramToBam.output_bam, input_file]) - Float bam_size = size(input_bam, "GB") - - scatter (calling_interval in SplitIntervals.interval_files) { - call CNNTasks.RunHC4 { - input: - input_bam = input_bam, - input_bam_index = select_first([CramToBam.output_bam_index, input_file_index]), - reference_fasta = reference_fasta, - reference_dict = reference_dict, - reference_fasta_index = reference_fasta_index, - output_prefix = output_prefix, - interval_list = calling_interval, - gatk_docker = gatk_docker, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - extra_args = extra_args, - disk_space_gb = round(bam_size + ref_size + additional_disk) - } - - call CNNTasks.CNNScoreVariants { - input: - input_vcf = RunHC4.raw_vcf, - input_vcf_index = RunHC4.raw_vcf_index, - bam_file = RunHC4.bamout, - bam_file_index = RunHC4.bamout_index, - architecture_json = architecture_json, - architecture_hd5 = architecture_hd5, - reference_fasta = reference_fasta, - tensor_type = tensor_type, - inference_batch_size = inference_batch_size, - transfer_batch_size = transfer_batch_size, - intra_op_threads = intra_op_threads, - inter_op_threads = inter_op_threads, - reference_dict = reference_dict, - reference_fasta_index = reference_fasta_index, - output_prefix = output_prefix, - interval_list = calling_interval, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - mem_gb = mem_gb, - disk_space_gb = round((bam_size/scatter_count) + ref_size + additional_disk) - } - } - - call CNNTasks.MergeVCFs as MergeVCF_HC4 { - input: - input_vcfs = CNNScoreVariants.cnn_annotated_vcf, - output_prefix = output_prefix, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - gatk_docker = gatk_docker, - disk_space_gb = additional_disk - } - - call CNNTasks.FilterVariantTranches { - input: - input_vcf = MergeVCF_HC4.merged_vcf, - input_vcf_index = MergeVCF_HC4.merged_vcf_index, - resources = resources, - resources_index = resources_index, - output_prefix = output_prefix, - snp_tranches = snp_tranches, - indel_tranches = indel_tranches, - info_key = info_key, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - gatk_docker = gatk_docker, - disk_space_gb = additional_disk - } - - call CNNTasks.SamtoolsMergeBAMs { - input: - input_bams = RunHC4.bamout, - output_prefix = output_prefix, - disk_space_gb = round(bam_size + ref_size + additional_disk) - } - - output { - FilterVariantTranches.* - } -} diff 
--git a/scripts/cnn_variant_wdl/cram2model.wdl b/scripts/cnn_variant_wdl/cram2model.wdl deleted file mode 100755 index 7c932e7058a..00000000000 --- a/scripts/cnn_variant_wdl/cram2model.wdl +++ /dev/null @@ -1,242 +0,0 @@ -# CRAM to trained CNNVariant Model - -import "cnn_variant_common_tasks.wdl" as CNNTasks - -workflow Cram2TrainedModel { - File input_cram - File reference_fasta - File reference_dict - File reference_fasta_index - File truth_vcf - File truth_vcf_index - File truth_bed - String output_prefix - String tensor_type - Int epochs - File calling_intervals - Int scatter_count - String extra_args - - # Runtime parameters - File? gatk_override - String gatk_docker - Int? mem_gb - Int? preemptible_attempts - Int? disk_space_gb - Int? cpu - - Int? increase_disk_size - Int additional_disk = select_first([increase_disk_size, 20]) - Float ref_size = size(reference_fasta, "GB") + size(reference_fasta_index, "GB") + size(reference_dict, "GB") - - call CNNTasks.CramToBam { - input: - reference_fasta = reference_fasta, - reference_dict = reference_dict, - reference_fasta_index = reference_fasta_index, - cram_file = input_cram, - output_prefix = output_prefix, - disk_space_gb = disk_space_gb, - preemptible_attempts = preemptible_attempts - } - - call CNNTasks.SplitIntervals { - input: - scatter_count = scatter_count, - intervals = calling_intervals, - ref_fasta = reference_fasta, - ref_dict = reference_dict, - ref_fai = reference_fasta_index, - gatk_docker = gatk_docker, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts - } - - Float bam_size = size(CramToBam.output_bam, "GB") - - scatter (calling_interval in SplitIntervals.interval_files) { - call CNNTasks.RunHC4 { - input: - input_bam = CramToBam.output_bam, - input_bam_index = CramToBam.output_bam_index, - reference_fasta = reference_fasta, - reference_dict = reference_dict, - reference_fasta_index = reference_fasta_index, - output_prefix = output_prefix, - interval_list = calling_interval, - gatk_docker = gatk_docker, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - extra_args = extra_args, - disk_space_gb = round(bam_size + ref_size + additional_disk) - } - - call WriteTensors { - input: - input_vcf = RunHC4.raw_vcf, - input_vcf_index = RunHC4.raw_vcf_index, - input_bam = RunHC4.bamout, - input_bam_index = RunHC4.bamout_index, - truth_vcf = truth_vcf, - truth_vcf_index = truth_vcf_index, - truth_bed = truth_bed, - tensor_type = tensor_type, - reference_fasta = reference_fasta, - reference_dict = reference_dict, - reference_fasta_index = reference_fasta_index, - output_prefix = output_prefix, - interval_list = calling_interval, - gatk_docker = gatk_docker, - gatk_override = gatk_override, - preemptible_attempts = preemptible_attempts, - disk_space_gb = disk_space_gb - } - } - - call CNNTasks.MergeVCFs as MergeVCF_HC4 { - input: - input_vcfs = RunHC4.raw_vcf, - output_prefix = output_prefix, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - disk_space_gb = disk_space_gb - } - - call CNNTasks.SamtoolsMergeBAMs { - input: - input_bams = RunHC4.bamout, - output_prefix = output_prefix, - disk_space_gb = round(bam_size + ref_size + additional_disk) - } - - call TrainModel { - input: - tar_tensors = WriteTensors.tensors, - output_prefix = output_prefix, - tensor_type = tensor_type, - gatk_docker = gatk_docker, - gatk_override = gatk_override, - disk_space_gb = disk_space_gb, - epochs = epochs - } - - output { - 
MergeVCF_HC4.* - SamtoolsMergeBAMs.* - TrainModel.* - } - -} - -task WriteTensors { - File input_bam - File input_bam_index - File input_vcf - File input_vcf_index - File reference_fasta - File reference_dict - File reference_fasta_index - File truth_vcf - File truth_vcf_index - File truth_bed - String output_prefix - String tensor_type - File interval_list - - # Runtime parameters - String gatk_docker - File? gatk_override - Int? mem_gb - Int? preemptible_attempts - Int? disk_space_gb - Int? cpu - - Int default_ram_mb = 8000 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - mkdir "/root/tensors/" - - gatk --java-options "-Xmx${command_mem}m" \ - CNNVariantWriteTensors \ - -R ${reference_fasta} \ - -V ${input_vcf} \ - -truth-vcf ${truth_vcf} \ - -truth-bed ${truth_bed} \ - -tensor-type ${tensor_type} \ - -output-tensor-dir "/root/tensors/" \ - -bam-file ${input_bam} - - tar -czf "tensors.tar.gz" "/root/tensors/" - } - - output { - File tensors = "tensors.tar.gz" - } - runtime { - docker: "${gatk_docker}" - memory: machine_mem + " MB" - disks: "local-disk " + disk_space_gb + " SSD" - } - -} - -task TrainModel { - Array[File] tar_tensors - String output_prefix - String tensor_type - Int epochs - - # Runtime parameters - String gatk_docker - File? gatk_override - Int? mem_gb - Int? preemptible_attempts - Int? disk_space_gb - Int? cpu - - Int default_ram_mb = 8000 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - for tensors in ${sep=' ' tar_tensors} ; do - tar -xzf $tensors - done - - gatk --java-options "-Xmx${command_mem}m" \ - CNNVariantTrain \ - -input-tensor-dir "./tensors/" \ - -model-name ${output_prefix} \ - -image-dir "./" \ - -tensor-type ${tensor_type} \ - -epochs ${epochs} - } - - output { - File model_json = "${output_prefix}.json" - File model_hd5 = "${output_prefix}.hd5" - File roc_png = "per_class_roc_${output_prefix}.png" - File training_png = "metric_history_${output_prefix}.png" - } - - runtime { - docker: "${gatk_docker}" - #gpuType: "nvidia-tesla-k80" # This will require PAPI v2 and CUDA on VM - #gpuCount: 1 - #zones: ["us-central1-c"] - memory: machine_mem + " MB" - disks: "local-disk 400 SSD" - bootDiskSizeGb: "16" - } -} \ No newline at end of file diff --git a/scripts/cnn_variant_wdl/happy_plot.R b/scripts/cnn_variant_wdl/happy_plot.R deleted file mode 100644 index ca12915b23d..00000000000 --- a/scripts/cnn_variant_wdl/happy_plot.R +++ /dev/null @@ -1,79 +0,0 @@ -library(dplyr) -library(ggplot2) -library(reshape2) - -# Multiple plot function -# -# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) -# - cols: Number of columns in layout -# - layout: A matrix specifying the layout. If present, 'cols' is ignored. -# -# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), -# then plot 1 will go in the upper left, 2 will go in the upper right, and -# 3 will go all the way across the bottom. -# -multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { - library(grid) - - # Make a list from the ... 
arguments and plotlist - plots <- c(list(...), plotlist) - - numPlots = length(plots) - - # If layout is NULL, then use 'cols' to determine layout - if (is.null(layout)) { - # Make the panel - # ncol: Number of columns of plots - # nrow: Number of rows needed, calculated from # of cols - layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), - ncol = cols, nrow = ceiling(numPlots/cols)) - } - - if (numPlots==1) { - print(plots[[1]]) - - } else { - # Set up the page - grid.newpage() - pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) - - # Make each plot, in the correct location - for (i in 1:numPlots) { - # Get the i,j matrix positions of the regions that contain this subplot - matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) - - print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, - layout.pos.col = matchidx$col)) - } - } -} - -round_digits <- -2 -files <- list.files(pattern = "summary\\.csv$") -dlist <- lapply(files, read.csv) -names <- lapply(files, function(x) gsub("happy_", "", gsub(".summary.csv", "", x))) -dnamed <- mapply(cbind, dlist, "Name"=names, SIMPLIFY=F) -merged <- Reduce(function(...) merge(..., all=T), dnamed) - -names(merged) <- c( "Type", "Filter", "Total", "True Positives", "False Negatives", "QTotal", "False Positives", "Unknown", "Genotype Error", "Recall", "Precision", "NA", "F1 Score", "T TiTv" , "Q TiTv" , "T Het Hom" , "Q Het Hom", "Name") -melted <- melt(merged, id.vars=c("Name", "Filter", "Type")) - -metrics <- subset(melted, variable%in%c("Recall", "Precision", "F1 Score")) -p1 <- ggplot(metrics, aes(x=Name, y=value, color=Filter)) + - geom_point(stat="identity", position = position_jitter(w = 0.06, h = 0)) + - geom_text(aes(label=ifelse(Filter=="PASS", round(value, 3), "")), color="black", size=2.5, hjust=-0.4, vjust=0.5) + - geom_text(aes(label=ifelse(Filter!="PASS", round(value, 3), "")), color="darkgrey", size=2.5, hjust=1.6, vjust=0.5) + - facet_grid( variable ~ Type, scales="free_y" ) + - ylab("Metrics") + - theme(axis.text.x=element_text(angle=30, hjust = 1)) - -counts <- subset(melted, variable%in%c("True Positives", "False Negatives", "False Positives")) -p2 <- ggplot(counts, aes(x=Name, y=value, color=Filter)) + - geom_point(stat="identity", position = position_jitter(w = 0.06, h = 0)) + - facet_grid( variable ~ Type, scales="free_y" ) + - ylab("Counts") + - geom_text(aes(label=ifelse(Filter=="PASS", round(value, round_digits), "")), color="black", size=2.5, hjust=-0.4, vjust=0.5) + - geom_text(aes(label=ifelse(Filter!="PASS", round(value, round_digits), "")), color="darkgrey", size=2.5, hjust=1.6, vjust=0.5) + - theme(axis.text.x=element_text(angle=30, hjust = 1)) - -ggsave(plot=multiplot(p1, p2, cols=2), filename = 'metrics.png', width=4, height=3, units="in") \ No newline at end of file diff --git a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_b37.json b/scripts/cnn_variant_wdl/jsons/cnn_score_variants_b37.json deleted file mode 100755 index 1724eb39a67..00000000000 --- a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_b37.json +++ /dev/null @@ -1,30 +0,0 @@ -{ - "CNNScoreVariantsWorkflow.calling_intervals": "gs://broad-references/hg19/v0/wgs_calling_regions.v1.chr20.interval_list", - "CNNScoreVariantsWorkflow.gatk_docker": "broadinstitute/gatk", - "CNNScoreVariantsWorkflow.input_vcf": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/chr20_tiny_tf_python_gpu.vcf.gz", - "CNNScoreVariantsWorkflow.input_vcf_index": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/chr20_tiny_tf_python_gpu.vcf.gz.tbi", - 
"CNNScoreVariantsWorkflow.output_prefix": "g94982_b37_chr20_tiny", - "CNNScoreVariantsWorkflow.reference_fasta": "gs://broad-references/hg19/v0/Homo_sapiens_assembly19.fasta", - "CNNScoreVariantsWorkflow.reference_dict": "gs://broad-references/hg19/v0/Homo_sapiens_assembly19.dict", - "CNNScoreVariantsWorkflow.reference_fasta_index": "gs://broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai", - "CNNScoreVariantsWorkflow.resources" : [ - "gs://broad-references/hg19/v0/hapmap_3.3.b37.vcf.gz", - "gs://broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz", - "gs://broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.vcf.gz" - ], - "CNNScoreVariantsWorkflow.resources_index" : [ - "gs://broad-references/hg19/v0/hapmap_3.3.b37.vcf.gz.tbi", - "gs://broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz.tbi", - "gs://broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.vcf.gz.tbi" - ], - "CNNScoreVariantsWorkflow.inference_batch_size": "16", - "CNNScoreVariantsWorkflow.transfer_batch_size": "32", - "CNNScoreVariantsWorkflow.tensor_type": "reference", - "CNNScoreVariantsWorkflow.info_key": "CNN_1D", - "CNNScoreVariantsWorkflow.snp_tranches": " --snp-tranche 99.9 ", - "CNNScoreVariantsWorkflow.indel_tranches": " --indel-tranche 99.5 ", - "CNNScoreVariantsWorkflow.scatter_count": "2", - "CNNScoreVariantsWorkflow.cnn_task_mem_gb": "8", - "CNNScoreVariantsWorkflow.cnn_task_cpu": "2", - "CNNScoreVariantsWorkflow.preemptible_attempts": "20" -} diff --git a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis.json b/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis.json deleted file mode 100755 index 83439b5143d..00000000000 --- a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "CNNScoreVariantsWorkflow.bam_file": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895_bamout.bam", - "CNNScoreVariantsWorkflow.bam_file_index": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895_bamout.bai", - "CNNScoreVariantsWorkflow.calling_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/contig20_1m_10m.interval_list", - "CNNScoreVariantsWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNNScoreVariantsWorkflow.input_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895.vcf.gz", - "CNNScoreVariantsWorkflow.input_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895.vcf.gz.tbi", - "CNNScoreVariantsWorkflow.output_prefix": "g94982_b37_chr20_1m_895", - "CNNScoreVariantsWorkflow.reference_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta", - "CNNScoreVariantsWorkflow.reference_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict", - "CNNScoreVariantsWorkflow.reference_fasta_index": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai", - "CNNScoreVariantsWorkflow.resources": [ - "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf", - "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf" - ], - "CNNScoreVariantsWorkflow.resources_index": [ - "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf.idx", - 
"/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf.idx" - ], - "CNNScoreVariantsWorkflow.inference_batch_size": "1", - "CNNScoreVariantsWorkflow.transfer_batch_size": "2", - "CNNScoreVariantsWorkflow.intra_op_threads": 0, - "CNNScoreVariantsWorkflow.inter_op_threads": 0, - "CNNScoreVariantsWorkflow.tensor_type": "read_tensor", - "CNNScoreVariantsWorkflow.info_key": "CNN_2D", - "CNNScoreVariantsWorkflow.snp_tranches": " --snp-tranche 99.0 ", - "CNNScoreVariantsWorkflow.indel_tranches": " --indel-tranche 99.0 ", - "CNNScoreVariantsWorkflow.scatter_count": "2" -} diff --git a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis_1d.json b/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis_1d.json deleted file mode 100755 index dc1ec66f4f5..00000000000 --- a/scripts/cnn_variant_wdl/jsons/cnn_score_variants_travis_1d.json +++ /dev/null @@ -1,27 +0,0 @@ -{ - "CNNScoreVariantsWorkflow.bam_file": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895_bamout.bam", - "CNNScoreVariantsWorkflow.bam_file_index": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895_bamout.bai", - "CNNScoreVariantsWorkflow.calling_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/contig20_1m_10m.interval_list", - "CNNScoreVariantsWorkflow.gatk_docker": "__GATK_DOCKER__", - "CNNScoreVariantsWorkflow.input_vcf": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895.vcf.gz", - "CNNScoreVariantsWorkflow.input_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/g94982_b37_chr20_1m_895.vcf.gz.tbi", - "CNNScoreVariantsWorkflow.output_prefix": "g94982_b37_chr20_1m_895", - "CNNScoreVariantsWorkflow.reference_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta", - "CNNScoreVariantsWorkflow.reference_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict", - "CNNScoreVariantsWorkflow.reference_fasta_index": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai", - "CNNScoreVariantsWorkflow.resources": [ - "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf", - "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf" - ], - "CNNScoreVariantsWorkflow.resources_index": [ - "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.20.1M-10M.vcf.idx", - "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/Omni25_sites_1525_samples.b37.20.1M-10M.vcf.idx" - ], - "CNNScoreVariantsWorkflow.inference_batch_size": "1", - "CNNScoreVariantsWorkflow.transfer_batch_size": "2", - "CNNScoreVariantsWorkflow.tensor_type": "reference", - "CNNScoreVariantsWorkflow.info_key": "CNN_1D", - "CNNScoreVariantsWorkflow.snp_tranches": " --snp-tranche 99.9 ", - "CNNScoreVariantsWorkflow.indel_tranches": " --indel-tranche 99.5 ", - "CNNScoreVariantsWorkflow.scatter_count": "2" -} diff --git a/scripts/cnn_variant_wdl/jsons/cram2filtered.json b/scripts/cnn_variant_wdl/jsons/cram2filtered.json deleted file mode 100755 index 5272c4398fe..00000000000 --- a/scripts/cnn_variant_wdl/jsons/cram2filtered.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "Cram2FilteredVcf.input_file": "gs://gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram", - "Cram2FilteredVcf.reference_fasta": 
"gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "Cram2FilteredVcf.reference_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "Cram2FilteredVcf.reference_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "Cram2FilteredVcf.resources" : [ - "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", - "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" - ], - "Cram2FilteredVcf.resources_index" : [ - "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", - "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", - "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi" - ], - "Cram2FilteredVcf.output_prefix": "hg38_20k_na12878", - "Cram2FilteredVcf.info_key": "CNN_2D", - "Cram2FilteredVcf.snp_tranches": " --snp-tranche 99.9 ", - "Cram2FilteredVcf.indel_tranches": " --indel-tranche 99.5 ", - "Cram2FilteredVcf.tensor_type":"read_tensor", - "Cram2FilteredVcf.calling_intervals": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", - "Cram2FilteredVcf.gatk_docker": "broadinstitute/gatk", - "Cram2FilteredVcf.preemptible_attempts": 10, - "Cram2FilteredVcf.inference_batch_size": 8, - "Cram2FilteredVcf.transfer_batch_size": 32, - "Cram2FilteredVcf.mem_gb": 7, - "Cram2FilteredVcf.extra_args": "-stand-call-conf 0 -A Coverage -A ChromosomeCounts -A BaseQuality -A FragmentLength -A MappingQuality -A ReadPosition ", - "Cram2FilteredVcf.scatter_count": 4 -} diff --git a/scripts/cnn_variant_wdl/jsons/cram2filtered_travis.json b/scripts/cnn_variant_wdl/jsons/cram2filtered_travis.json deleted file mode 100755 index f7f07f50cd7..00000000000 --- a/scripts/cnn_variant_wdl/jsons/cram2filtered_travis.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "Cram2FilteredVcf.input_file": "/home/runner/work/gatk/gatk/src/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram", - "Cram2FilteredVcf.reference_fasta": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta", - "Cram2FilteredVcf.reference_dict": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.dict", - "Cram2FilteredVcf.reference_fasta_index": "/home/runner/work/gatk/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai", - "Cram2FilteredVcf.resources" : ["/home/runner/work/gatk/gatk/src/test/resources/large/dbsnp_138.b37.20.21.vcf"], - "Cram2FilteredVcf.resources_index" : ["/home/runner/work/gatk/gatk/src/test/resources/large/dbsnp_138.b37.20.21.vcf.idx"], - "Cram2FilteredVcf.output_prefix": "na12878_b37_20_21", - "Cram2FilteredVcf.info_key": "CNN_2D", - "Cram2FilteredVcf.snp_tranches": " --snp-tranche 99.9 ", - "Cram2FilteredVcf.indel_tranches": " --indel-tranche 99.5 ", - "Cram2FilteredVcf.tensor_type":"read_tensor", - "Cram2FilteredVcf.calling_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/VQSR/contig20_1m_10m.interval_list", - "Cram2FilteredVcf.gatk_docker": "__GATK_DOCKER__", - "Cram2FilteredVcf.preemptible_attempts": 0, - "Cram2FilteredVcf.inference_batch_size": 2, - "Cram2FilteredVcf.transfer_batch_size": 4, - "Cram2FilteredVcf.intra_op_threads": 0, - "Cram2FilteredVcf.inter_op_threads": 0, - "Cram2FilteredVcf.mem_gb": 7, - "Cram2FilteredVcf.extra_args": "-stand-call-conf 0 -A Coverage -A ChromosomeCounts -A BaseQuality -A FragmentLength -A MappingQuality -A ReadPosition ", - "Cram2FilteredVcf.scatter_count": 3 -} diff --git 
a/scripts/cnn_variant_wdl/jsons/cram2model.json b/scripts/cnn_variant_wdl/jsons/cram2model.json deleted file mode 100755 index a728ecf5b97..00000000000 --- a/scripts/cnn_variant_wdl/jsons/cram2model.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "Cram2TrainedModel.input_cram": "gs://broad-dsde-methods-sam/cnn-variant/bams/NA12878_PLUMBING.cram", - "Cram2TrainedModel.reference_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "Cram2TrainedModel.reference_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "Cram2TrainedModel.reference_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "Cram2TrainedModel.output_prefix": "plumbing_na12878", - "Cram2TrainedModel.tensor_type": "read_tensor", - "Cram2TrainedModel.truth_vcf": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz", - "Cram2TrainedModel.truth_vcf_index": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz.tbi", - "Cram2TrainedModel.truth_bed": "gs://broad-dsde-methods-sam/cnn-variant/beds/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed", - "Cram2TrainedModel.calling_intervals": "gs://broad-references/hg38/v0/wgs_calling_regions.hg38.interval_list", - "Cram2TrainedModel.extra_args": "-stand-call-conf 0 -A Coverage -A ChromosomeCounts -A BaseQuality -A FragmentLength -A MappingQuality -A ReadPosition ", - "Cram2TrainedModel.gatk_docker": "samfriedman/gatk:44dc3d18e0e204", - "Cram2TrainedModel.gatk_override": "gs://broad-dsde-methods-sam/cnn-variant/jars/sf_gatk2.jar", - "Cram2TrainedModel.preemptible_attempts": 0, - "Cram2TrainedModel.disk_space_gb": 300, - "Cram2TrainedModel.scatter_count": 2, - "Cram2TrainedModel.epochs": 36, - "Cram2TrainedModel.mem_gb": 7 -} diff --git a/scripts/cnn_variant_wdl/jsons/run_happy.json b/scripts/cnn_variant_wdl/jsons/run_happy.json deleted file mode 100644 index 491fc0571b7..00000000000 --- a/scripts/cnn_variant_wdl/jsons/run_happy.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "HappyWorkflow.vcf_files": [ - "gs://broad-dsde-methods/cromwell-execution-33/Cram2FilteredVcf/1f653862-42d9-470e-acdf-0f00e34263f1/call-FilterVariantTranches/nova_g947y_na12878_filtered.vcf.gz", - "gs://broad-dsde-methods/cromwell-execution-33/Cram2FilteredVcf/e8031a29-d788-4901-bb77-4f4ba542a024/call-FilterVariantTranches/nova_g947n_na12878_filtered.vcf.gz" - ], - "HappyWorkflow.reference_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "HappyWorkflow.reference_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "HappyWorkflow.reference_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "HappyWorkflow.truth_bed": "gs://broad-dsde-methods-sam/cnn-variant/beds/chr20_conf_1m_10m.bed", - "HappyWorkflow.truth_vcf": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz", - "HappyWorkflow.truth_vcf_index": "gs://broad-dsde-methods-sam/cnn-variant/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz.tbi", - "HappyWorkflow.rscript": "gs://broad-dsde-methods-sam/cnn-variant/scripts/happy_plot.R", - "HappyWorkflow.disk_space": "100", - "HappyWorkflow.cpu": "2", - "HappyWorkflow.mem_gb": "8", - "HappyWorkflow.preemptible_attempts": "10" -} \ No newline at end of file diff --git a/scripts/cnn_variant_wdl/jsons/variant_classifier_plots_na12878_hg38.json b/scripts/cnn_variant_wdl/jsons/variant_classifier_plots_na12878_hg38.json deleted file mode 100644 index 
22e060d7fa2..00000000000 --- a/scripts/cnn_variant_wdl/jsons/variant_classifier_plots_na12878_hg38.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "VariantClassifierPlots.call_vcf": "gs://broad-dsde-methods/cromwell-execution-33/Cram2FilteredVcf/1f653862-42d9-470e-acdf-0f00e34263f1/call-FilterVariantTranches/nova_g947y_na12878_filtered.vcf.gz", - "VariantClassifierPlots.call_vcf_index": "gs://broad-dsde-methods/cromwell-execution-33/Cram2FilteredVcf/1f653862-42d9-470e-acdf-0f00e34263f1/call-FilterVariantTranches/nova_g947y_na12878_filtered.vcf.gz.tbi", - "VariantClassifierPlots.call_sample": "SM-G947Y", - "VariantClassifierPlots.score_key": "CNN_2D", - "VariantClassifierPlots.truth_vcf": "gs://broad-dsde-methods/cnn-variant-score/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz", - "VariantClassifierPlots.truth_vcf_index": "gs://broad-dsde-methods/cnn-variant-score/vcfs/nist_na12878_giab_hg38_sd_fix.vcf.gz.tbi", - "VariantClassifierPlots.truth_sample": "HG001", - "VariantClassifierPlots.intervals": "gs://broad-dsde-methods/cnn-variant-score/beds/HG001_NA12878_GRCh38_GIAB_highconf.interval_list", - "VariantClassifierPlots.rscript": "gs://broad-dsde-methods/cnn-variant-score/scripts/vcf_analysis.R", - "VariantClassifierPlots.gatk_docker": "broadinstitute/gatk", - "VariantClassifierPlots.preemptible_attempts": "0" -} \ No newline at end of file diff --git a/scripts/cnn_variant_wdl/run_happy.wdl b/scripts/cnn_variant_wdl/run_happy.wdl deleted file mode 100644 index e7bcbf2c239..00000000000 --- a/scripts/cnn_variant_wdl/run_happy.wdl +++ /dev/null @@ -1,145 +0,0 @@ -# Run the hap.py VCF evaluation over input vcfs given a validated truth vcf and confidence region -workflow HappyWorkflow { - Array[File] vcf_files # VCF files to evaluate with hap.py - - File reference_fasta - File reference_dict - File reference_fasta_index - - File truth_vcf - File truth_vcf_index - File truth_bed - - File rscript - - Int? preemptible_attempts - Int? disk_space - Int? mem_gb - Int? cpu - - call RunHappy { - input: - vcf_files = vcf_files, - truth_vcf = truth_vcf, - truth_vcf_index = truth_vcf_index, - truth_bed = truth_bed, - reference_fasta = reference_fasta, - reference_dict = reference_dict, - reference_fasta_index = reference_fasta_index, - cpu = cpu, - mem_gb = mem_gb, - disk_space = disk_space, - preemptible_attempts = preemptible_attempts - } - - call RunHappyPlots{ - input: - happy_outputs = RunHappy.happy_outputs, - rscript = rscript, - cpu = cpu, - mem_gb = mem_gb, - disk_space = disk_space, - preemptible_attempts = preemptible_attempts - } - - output { - RunHappy.* - RunHappyPlots.* - } -} - -task RunHappy { - Array[File] vcf_files - - File reference_fasta - File reference_dict - File reference_fasta_index - - File truth_vcf - File truth_vcf_index - File truth_bed - - # Runtime parameters - Int? mem_gb - Int? preemptible_attempts - Int? disk_space - Int? 
cpu - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 16000 - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - command { - for vcf_file in ${sep=" " vcf_files}; do - vname=$(basename "$vcf_file") - /opt/hap.py/bin/hap.py \ - ${truth_vcf} \ - "$vcf_file" \ - -f ${truth_bed} \ - -r ${reference_fasta} \ - -o ./happy_"$vname" - done - } - - output { - Array[File] happy_outputs = glob("./happy_*") - } - - runtime { - docker: "pkrusche/hap.py" - - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) - } -} - -task RunHappyPlots { - Array[File] happy_outputs - File rscript - - # Runtime parameters - Int? mem_gb - Int? preemptible_attempts - Int? disk_space - Int? cpu - Boolean use_ssd = false - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 16000 - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - command { - for file in ${sep=" " happy_outputs}; do - mv "$file" ./ - done - find `pwd` - - Rscript ${rscript} - } - - output { - Array[File] plots = glob("*png") - } - - runtime { - docker: "rocker/tidyverse" - - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) - } -} - - diff --git a/scripts/cnn_variant_wdl/variant_classifier_plots.wdl b/scripts/cnn_variant_wdl/variant_classifier_plots.wdl deleted file mode 100644 index 10d291c06d1..00000000000 --- a/scripts/cnn_variant_wdl/variant_classifier_plots.wdl +++ /dev/null @@ -1,272 +0,0 @@ -# Run VCF evaluation over input vcf given a validated truth vcf and confidence region -workflow VariantClassifierPlots { - File call_vcf # VCF to be evaluated - File call_vcf_index # Index of VCF to be evaluated - String? call_sample - String score_key - - File? truth_vcf # Optional truth VCF. If provided, plot colors show true positives and - File? truth_vcf_index # true negatives in green with false positives in red and false negatives in yellow. - String? truth_sample # Otherwise, plot colors show filtered variants in red and passing variant in green. - - File? intervals - - File rscript - - String gatk_docker - File? gatk_override - - Int? preemptible_attempts - Int? disk_space - Int? mem_gb - Int? 
cpu - - if(defined(truth_vcf)){ - call MakeTables { - input: - call_vcf = call_vcf, - call_vcf_index = call_vcf_index, - call_sample = call_sample, - score_key = score_key, - truth_vcf = truth_vcf, - truth_vcf_index = truth_vcf_index, - truth_sample = truth_sample, - intervals = intervals, - gatk_docker = gatk_docker, - cpu = cpu, - mem_gb = mem_gb, - disk_space = disk_space, - preemptible_attempts = preemptible_attempts - } - - call MakePlots{ - input: - rscript = rscript, - call_table = MakeTables.call_table, - truth_table = MakeTables.truth_table, - score_key = score_key, - cpu = cpu, - mem_gb = mem_gb, - disk_space = disk_space, - preemptible_attempts = preemptible_attempts - } - - output { - MakeTables.* - MakePlots.* - } - } - - if(!defined(truth_vcf)){ - call MakeTableNoTruth { - input: - call_vcf = call_vcf, - call_vcf_index = call_vcf_index, - call_sample = call_sample, - score_key = score_key, - gatk_docker = gatk_docker, - cpu = cpu, - mem_gb = mem_gb, - disk_space = disk_space, - preemptible_attempts = preemptible_attempts - } - - call MakePlots as MakePlotsNoTruth { - input: - rscript = rscript, - call_table = MakeTableNoTruth.call_table, - score_key = score_key, - cpu = cpu, - mem_gb = mem_gb, - disk_space = disk_space, - preemptible_attempts = preemptible_attempts - } - - output { - MakeTableNoTruth.* - MakePlotsNoTruth.* - } - - } - -} - -task MakeTables { - File call_vcf - File call_vcf_index - String? call_sample - String score_key - - File? truth_vcf - File? truth_vcf_index - String? truth_sample - - File? intervals - - # Runtime parameters - String gatk_docker - File? gatk_override - Int? mem_gb - Int? preemptible_attempts - Int? disk_space - Int? cpu - - String call_table_name = basename(call_vcf) + ".table" - String sd_fix_vcf = "call_sd_fix.vcf" - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 16000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" \ - UpdateVcfSequenceDictionary \ - --INPUT=${call_vcf} \ - --OUTPUT=${sd_fix_vcf} \ - -SD=${truth_vcf} - - gatk IndexFeatureFile -F ${sd_fix_vcf} - - gatk --java-options "-Xmx${command_mem}m" \ - GenotypeConcordance \ - --CALL_VCF=${sd_fix_vcf} \ - ${"--CALL_SAMPLE=" + call_sample} \ - --TRUTH_VCF=${truth_vcf} \ - ${"--TRUTH_SAMPLE=" + truth_sample} \ - ${"--INTERVALS=" + intervals} \ - --OUTPUT_VCF=true \ - --IGNORE_FILTER_STATUS \ - -O=concordance - - gatk --java-options "-Xmx${command_mem}m" \ - VariantsToTable \ - -V ${sd_fix_vcf} \ - -F CHROM -F POS -F REF -F ALT -F FILTER -F ${score_key} \ - -F EVENTLENGTH -F AC -F MULTI-ALLELIC -F TRANSITION -F TYPE \ - --show-filtered \ - -O ${call_table_name} - - gatk --java-options "-Xmx${command_mem}m" \ - VariantsToTable \ - -V concordance.genotype_concordance.vcf.gz \ - -F CHROM -F POS -F REF -F ALT -F CONC_ST \ - -O truth.table - } - - output { - File call_table = "${call_table_name}" - File truth_table = "truth.table" - } - - runtime { - docker: gatk_docker - memory: machine_mem + " MB" - # Note that the space before SSD and HDD should be included. 
- disks: "local-disk " + default_disk_space_gb + " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) - zones: "us-east4-a" - bootDiskSizeGb: "16" - } -} - -task MakeTableNoTruth { - File call_vcf - File call_vcf_index - String? call_sample - String score_key - - # Runtime parameters - String gatk_docker - File? gatk_override - Int? mem_gb - Int? preemptible_attempts - Int? disk_space - Int? cpu - - String call_table_name = basename(call_vcf) + ".table" - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 16000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). - Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - command { - set -e - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" \ - VariantsToTable \ - -V ${call_vcf} \ - -F CHROM -F POS -F REF -F ALT -F FILTER -F ${score_key} \ - -F EVENTLENGTH -F AC -F MULTI-ALLELIC -F TRANSITION -F TYPE \ - --show-filtered \ - -O ${call_table_name} - } - - output { - File call_table = "${call_table_name}" - } - - runtime { - docker: gatk_docker - memory: machine_mem + " MB" - # Note that the space before SSD and HDD should be included. - disks: "local-disk " + default_disk_space_gb + " HDD" - preemptible: select_first([preemptible_attempts, 3]) - cpu: select_first([cpu, 1]) - zones: "us-east4-a" - bootDiskSizeGb: "16" - } -} - -task MakePlots { - File rscript - File call_table - File? truth_table - String score_key - - # Runtime parameters - Int? mem_gb - Int? preemptible_attempts - Int? disk_space - Int? cpu - - # You may have to change the following two parameter values depending on the task requirements - Int default_ram_mb = 16000 - # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). 
- Int default_disk_space_gb = 100 - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem_gb) then mem_gb *1000 else default_ram_mb - Int command_mem = machine_mem - 1000 - - command { - Rscript ${rscript} ${call_table} ${truth_table} ${score_key} - } - - output { - Array[File] plots = glob("*png") - } - - runtime { - docker: "rocker/tidyverse" - - memory: machine_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - cpu: select_first([cpu, 1]) - } -} diff --git a/scripts/cnn_variant_wdl/vcf_analysis.R b/scripts/cnn_variant_wdl/vcf_analysis.R deleted file mode 100644 index 0794906da74..00000000000 --- a/scripts/cnn_variant_wdl/vcf_analysis.R +++ /dev/null @@ -1,215 +0,0 @@ -#!/usr/bin/env Rscript - -library(tidyr) -library(dplyr) -library(ggplot2) -library(reshape2) - -args = commandArgs(trailingOnly=TRUE) -if (length(args) != 3) { - stop("We need 3 arguments: call_vcf_table concordance_vcf_table score_key") -} - -print("try to load VCF table.") -d <- read.table(args[1], header=TRUE) -print("try to load VCF Truth table.") -dt <- read.table(args[2], header=TRUE) -score_key <- args[3] -score_label <- paste(score_key, " LOD Score") -plot_title <- gsub( ".vcf.gz.table", "", basename(args[1])) -num_bins <- 50 -bin_by_quantile <- FALSE - -get_proportion <- function(d, num_bins, column_to_sum, quality_column) { - x <- rowsum(column_to_sum, quality_column, na.rm =T) - idx <- row.names(x) - - for (i in 1:num_bins) { - qsum <- sum(quality_column==as.numeric(idx[i])) - if (!is.na(x[i]) && qsum>0) { - x[i] <- x[i] / qsum - } - } - return(x[quality_column]) -} - -print("try to merge.") -d <- merge(d, dt, by=c("CHROM", "POS", "REF", "ALT")) -d$TP <- as.numeric(d$CONC_ST!="FP,TN" & d$CONC_ST!="FP" & d$CONC_ST!="EMPTY") -d$True_Positive <- d$CONC_ST!="FP,TN" & d$CONC_ST!="FP" & d$CONC_ST!="EMPTY" -d$Unfiltered <- d$FILTER == "PASS" | d$FILTER == "." -d$SNP <- d$EVENTLENGTH == 0 -d$ONE <- 1 -x <- rowsum(d$ONE, d$EVENTLENGTH) -d$EVENTLENGTH_SUM <- x[as.factor(d$EVENTLENGTH)] -d$Variant_Type <- paste(d$TYPE, as.factor(d$EVENTLENGTH<0)) -d$Truth_Status <- ifelse(d$True_Positive & d$Unfiltered, "True Positive", ifelse(d$True_Positive & !d$Unfiltered, "False Negative", ifelse(!d$True_Positive & d$Unfiltered, "False Positive", "True Negative"))) -statusColor <- c("True Positive" = "springgreen3", "True Negative" = "aquamarine4", "False Positive" = "red", "False Negative" = "orange") - -# All variant plots -print("Make all variant plots.") - -# Plot histogram of scores separately for SNPs and INDELs. -p1 <- ggplot(d, aes(get(score_key), color=SNP, fill=SNP)) + - scale_fill_discrete(name="Variant\nType", breaks=c("TRUE", "FALSE"), labels=c("SNPs", "INDELs")) + - geom_density(alpha=0.55) + - ggtitle(plot_title) + - xlab(score_label) + - guides(color=FALSE) - -# Violin plot of scores stratified by event length, including all insertions and deletions. 
-p2 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Truth_Status, shape=Variant_Type)) + - scale_color_manual(values=statusColor) + - scale_shape_discrete(name='', breaks=c("INDEL TRUE", "INDEL FALSE", "SNP FALSE"), labels=c("Deletion", "Insertion", "SNP")) + - geom_jitter(height = 0, width = 0.1, alpha=0.6) + - ggtitle(plot_title) + - ylab(score_label) + - xlab("Event Length: - Deletions, 0 SNPs, + Insertions") - -# Violin plot of scores stratified by event length, insertions and deletions smaller than 20 base pairs. -p3 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Truth_Status)) + xlim(-20, 20) + - scale_color_manual(values=statusColor) + - geom_jitter(height = 0, width = 0.1, alpha=0.4) + - geom_violin(color="grey", alpha=0) + - geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=2, angle=60) + - ggtitle(plot_title) + - ylab(score_label) + - xlab("Event Length: - Deletions, 0 SNPs, + Insertions") - -# Violin plot of scores stratified by event length, insertions and deletions smaller than 10 base pairs. -p4 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Truth_Status)) + xlim(-10, 10) + - scale_color_manual(values=statusColor) + - geom_jitter(height = 0, width = 0.2, alpha=0.4) + - geom_violin(color="grey", alpha=0) + - geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=3, angle=30) + - ylab(score_label) + - xlab("Event Length: - Deletions, 0 SNPs, + Insertions") - -# Violin plot of scores stratified by event length, insertions and deletions smaller than 5 base pairs. -p5 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Truth_Status)) + - scale_color_manual(values=statusColor) + xlim(-5, 5) + - geom_jitter(height = 0, width = 0.35, alpha=0.4) + - geom_violin(color="grey", alpha=0.0) + - geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=4, angle=30) + - ggtitle(plot_title) + - ylab(score_label) + - xlab("Event Length: - Deletions, 0 SNPs, + Insertions") - - -# SNP specific plots -print("Make SNP plots.") -snps <- subset(d, EVENTLENGTH == 0) -my_breaks <- ifelse(bin_by_quantile, quantile(snps[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins) -snps$QUALITY_BIN <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T, labels=F) -snps$QUALITY_BIN_RANGE <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T) -mine <- lapply(strsplit(sapply(levels(snps$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric) -df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T)) -q_means <- rowMeans(df) -snps$QUALITY_LOD <- q_means[snps$QUALITY_BIN] -snps$TPR_PREDICTION <- exp(snps$QUALITY_LOD) / (1 + exp(snps$QUALITY_LOD) ) - -x <- rowsum(snps$ONE, snps$QUALITY_BIN) -snps$BIN_SUM <- x[snps$QUALITY_BIN] -snps$TRANSVERSION <- as.numeric( abs(snps$TRANSITION)==0 ) -snps$TPR <- get_proportion(snps, num_bins, snps$TP, snps$QUALITY_BIN) -ti <- get_proportion(snps, num_bins, snps$TRANSITION, snps$QUALITY_BIN) -tv <- get_proportion(snps, num_bins, snps$TRANSVERSION, snps$QUALITY_BIN) -snps$TI_TV <- ti/tv - -# Plot transition transversion ratios as a function of score bins -p6 <- ggplot(snps, aes(x=get(score_key), y=TI_TV, group=QUALITY_BIN, color=Truth_Status, shape=TRANSITION==1)) + - scale_color_manual(values=statusColor) + - scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Transition", "Transversion")) + - geom_point() + - geom_line(color="grey") + 
- ggtitle("Transition Transversion Ratio per score bin") + - xlab(score_label) + - ylim(0, 4) - -# SNP calibration plot -p7 <- ggplot(snps, aes(x=TPR_PREDICTION, y=TPR, group=QUALITY_BIN, color=Truth_Status)) + - scale_color_manual(values=statusColor) + - geom_jitter(height = 0.01, width = 0.01, alpha=0.4) + - ggtitle(paste("SNP Calibration", plot_title)) + - ylim(0, 1) + xlim(0, 1) - - -# INDEL specific plots -print("Make INDEL plots.") -indels <- subset(d, EVENTLENGTH != 0) -my_breaks <- ifelse(bin_by_quantile, quantile(indels[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins) -indels$QUALITY_BIN <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T, labels=F) -indels$QUALITY_BIN_RANGE <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T) -mine <- lapply(strsplit(sapply(levels(indels$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric) -df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T)) -q_means <- rowMeans(df) -indels$QUALITY_LOD <- q_means[indels$QUALITY_BIN] -indels$TPR_PREDICTION <- exp(indels$QUALITY_LOD) / (1 + exp(indels$QUALITY_LOD)) -x <- rowsum(indels$ONE, indels$QUALITY_BIN) -indels$BIN_SUM <- x[indels$QUALITY_BIN] -indels$TPR <- get_proportion(indels, num_bins, indels$TP, indels$QUALITY_BIN) -indels$ONEBP <- as.numeric(abs(indels$EVENTLENGTH)==1) -indels$PROPORTION_ONEBP <- get_proportion(indels, num_bins, indels$ONEBP, indels$QUALITY_BIN) - -# Plot proportion of each socre bin that are 1 base pair Insertion or deletion -p8 <- ggplot(indels, aes(x=get(score_key), y=PROPORTION_ONEBP, group=QUALITY_BIN, color=Truth_Status, shape=EVENTLENGTH<0)) + - scale_color_manual(values=statusColor) + - scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Deletion", "Insertion")) + - geom_jitter(height = 0.005, width = 0.0, alpha=0.6) + - geom_line(color="grey") + - ggtitle("Proportion of 1bp INDELs per score bin") + - xlab(score_label) - -# INDEL calibration plot -p9 <- ggplot(indels, aes(x=TPR_PREDICTION, y=TPR, group=QUALITY_BIN, color=Truth_Status)) + - scale_color_manual(values=statusColor) + - geom_jitter(height = 0.01, width = 0.01, alpha=0.4) + - ggtitle(paste("INDEL Calibration", plot_title)) + - ylim(0, 1) + xlim(0, 1) - -# Multiple plot function -# -# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) -# - cols: Number of columns in layout -# - layout: A matrix specifying the layout. If present, 'cols' is ignored. -# -# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), -# then plot 1 will go in the upper left, 2 will go in the upper right, and -# 3 will go all the way across the bottom. -# -multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { - library(grid) - - # Make a list from the ... 
arguments and plotlist - plots <- c(list(...), plotlist) - - numPlots = length(plots) - - # If layout is NULL, then use 'cols' to determine layout - if (is.null(layout)) { - # Make the panel - # ncol: Number of columns of plots - # nrow: Number of rows needed, calculated from # of cols - layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), - ncol = cols, nrow = ceiling(numPlots/cols)) - } - - if (numPlots==1) { - print(plots[[1]]) - - } else { - # Set up the page - grid.newpage() - pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) - - # Make each plot, in the correct location - for (i in 1:numPlots) { - # Get the i,j matrix positions of the regions that contain this subplot - matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) - - print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, - layout.pos.col = matchidx$col)) - } - } -} -ggsave(plot=multiplot(p1,p2,p3,p4,p5,p6,p7,p8,p9, cols=2), filename = paste(plot_title, "_plots.png", sep=""), width=16, height=22) diff --git a/scripts/cnn_variant_wdl/vcf_analysis_no_truth.R b/scripts/cnn_variant_wdl/vcf_analysis_no_truth.R deleted file mode 100644 index c5dc8cc1039..00000000000 --- a/scripts/cnn_variant_wdl/vcf_analysis_no_truth.R +++ /dev/null @@ -1,193 +0,0 @@ -#!/usr/bin/env Rscript - -library(tidyr) -library(dplyr) -library(ggplot2) -library(reshape2) - -# ./gatk VariantsToTable -V /dsde/data/deep/vqsr/vcfs/illumina_na12878_platinum_scored_chr2.vcf.gz -F CHROM -F POS -F REF -F ALT -F FILTER -F G947_SITE_LABELLED_RRAB -F EVENTLENGTH -F AC -F MULTI-ALLELIC -F TRANSITION -F TYPE -O ~/Documents/illumin_chr2.table -#d <- read.table("illumin_chr2.table", header=TRUE) -#score_key <- "G947_SITE_LABELLED_RRAB" -#d <- read.table("g94982_chr20.table", header=TRUE) -#score_key <- "CNN_2D" -#d <- read.table("new_gnomad_22.table", header=TRUE) -#score_key <- "CNN_1D" - -args = commandArgs(trailingOnly=TRUE) -if (length(args) != 2) { - stop("We need 2 arguments: call_vcf_table score_key") -} - -print("try to load VCF table.") -d <- read.table(args[1], header=TRUE) -score_key <- args[2] -score_label <- paste(score_key, " LOD Score") -plot_title <- gsub(".vcf.gz.table", "", basename(args[1])) -num_bins <- 50 -bin_by_quantile <- FALSE - -get_proportion <- function(d, num_bins, column_to_sum, quality_column) { - x <- rowsum(column_to_sum, quality_column, na.rm =T) - idx <- row.names(x) - - for (i in 1:num_bins) { - qsum <- sum(quality_column==as.numeric(idx[i])) - if (!is.na(x[i]) && qsum>0) { - x[i] <- x[i] / qsum - } - } - return(x[quality_column]) -} - -d$SNP <- d$EVENTLENGTH == 0 -d$ONE <- 1 -x <- rowsum(d$ONE, d$EVENTLENGTH) -d$EVENTLENGTH_SUM <- x[as.factor(d$EVENTLENGTH)] -d$Unfiltered <- d$FILTER == "PASS" | d$FILTER == "." 
-d$Variant_Type <- paste(d$TYPE, as.factor(d$EVENTLENGTH<0)) - - -# All variant plots -print("Make all variant plots.") -p1 <- ggplot(d, aes(get(score_key), color=SNP, fill=SNP)) + - scale_fill_discrete(name="Variant\nType", breaks=c("TRUE", "FALSE"), labels=c("SNPs", "INDELs")) + - geom_density(alpha=0.55) + - ggtitle(plot_title) + - xlab(score_label) + - guides(color=FALSE) - -p2 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered, shape=Variant_Type)) + - scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + - scale_shape_discrete(name='', breaks=c("INDEL TRUE", "INDEL FALSE", "SNP FALSE"), labels=c("Deletion", "Insertion", "SNP")) + - geom_jitter(height = 0, width = 0.2, alpha=0.6) + - ggtitle(plot_title) + - ylab(score_label) + - xlab("Event Length: - Deletions, 0 SNPs, + Insertions") - -p3 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + xlim(-20, 20) + - scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + - geom_jitter(height = 0, width = 0.15, alpha=0.4) + - geom_violin(color="grey", alpha=0) + - geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=2, angle=60) + - ggtitle(plot_title) + - ylab(score_label) + - xlab("Event Length: - Deletions, 0 SNPs, + Insertions") - -p4 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + xlim(-10, 10) + - scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + - geom_jitter(height = 0, width = 0.2, alpha=0.4) + - geom_violin(color="grey", alpha=0) + - geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=3, angle=30) + - ggtitle(plot_title) + - ylab(score_label) + - xlab("Event Length: - Deletions, 0 SNPs, + Insertions") - -p5 <- ggplot(d, aes(x=EVENTLENGTH, y=get(score_key), group=EVENTLENGTH, color=Unfiltered)) + - scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + - xlim(-5, 5) + - geom_jitter(height = 0, width = 0.35, alpha=0.4) + - geom_violin(color="grey", alpha=0.0) + - geom_text(aes(x=EVENTLENGTH, y=14, label=EVENTLENGTH_SUM), color="grey30", size=4, angle=30) + - ggtitle(plot_title) + - ylab(score_label) + - xlab("Event Length: - Deletions, 0 SNPs, + Insertions") - - -# SNP specific plots -print("Make SNP plots.") -snps <- subset(d, EVENTLENGTH == 0) -my_breaks <- ifelse(bin_by_quantile, quantile(snps[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins) -snps$QUALITY_BIN <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T, labels=F) -snps$QUALITY_BIN_RANGE <- cut(snps[[score_key]], breaks=my_breaks, include.lowest=T) -mine <- lapply(strsplit(sapply(levels(snps$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric) -df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T)) -q_means <- rowMeans(df) -snps$QUALITY_LOD <- q_means[snps$QUALITY_BIN] - -x <- rowsum(snps$ONE, snps$QUALITY_BIN) -snps$BIN_SUM <- x[snps$QUALITY_BIN] -snps$TRANSVERSION <- as.numeric(abs(snps$TRANSITION)==0) -ti <- get_proportion(snps, num_bins, snps$TRANSITION, snps$QUALITY_BIN) -tv <- get_proportion(snps, num_bins, snps$TRANSVERSION, snps$QUALITY_BIN) -snps$TI_TV <- ti/tv - -p6 <- ggplot(snps, aes(x=get(score_key), y=TI_TV, group=QUALITY_BIN, color=Unfiltered, shape=TRANSITION==1)) + - scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + - scale_shape_discrete(name='', 
breaks=c("TRUE", "FALSE"), labels=c("Transition", "Transversion")) + - geom_point() + - geom_line(color="grey") + - xlab(score_label) + - ggtitle("Transition Transversion Ratio per score bin") + - ylim(0, 4) - - -# INDEL specific plots -print("Make INDEL plots.") -indels <- subset(d, EVENTLENGTH != 0) -my_breaks <- ifelse(bin_by_quantile, quantile(indels[[score_key]], probs = seq(0, 1, 1.0/num_bins), na.rm=T), num_bins) -indels$QUALITY_BIN <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T, labels=F) -indels$QUALITY_BIN_RANGE <- cut(indels[[score_key]], breaks=my_breaks, include.lowest=T) -mine <- lapply(strsplit(sapply(levels(indels$QUALITY_BIN_RANGE), function(x) substr(x, 2, nchar(x)-1)), ","), as.numeric) -df <- data.frame(matrix(unlist(mine), nrow=num_bins, byrow=T)) -q_means <- rowMeans(df) -indels$QUALITY_LOD <- q_means[indels$QUALITY_BIN] -x <- rowsum(indels$ONE, indels$QUALITY_BIN) -indels$BIN_SUM <- x[indels$QUALITY_BIN] -indels$ONEBP <- as.numeric(abs(indels$EVENTLENGTH)==1) -indels$PROPORTION_ONEBP <- get_proportion(indels, num_bins, indels$ONEBP, indels$QUALITY_BIN) - -p7 <- ggplot(indels, aes(x=get(score_key), y=PROPORTION_ONEBP, group=QUALITY_BIN, color=Unfiltered, shape=EVENTLENGTH<0)) + - scale_color_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Passed", "Filtered")) + - scale_shape_discrete(name='', breaks=c("TRUE", "FALSE"), labels=c("Deletion", "Insertion")) + - geom_jitter(height = 0.005, width = 0.0, alpha=0.6) + - geom_line(color="grey") + - ggtitle("Proportion of 1bp INDELs per score bin") + - xlab(score_label) - -# Multiple plot function -# -# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects) -# - cols: Number of columns in layout -# - layout: A matrix specifying the layout. If present, 'cols' is ignored. -# -# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE), -# then plot 1 will go in the upper left, 2 will go in the upper right, and -# 3 will go all the way across the bottom. -# -multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) { - library(grid) - - # Make a list from the ... 
arguments and plotlist - plots <- c(list(...), plotlist) - - numPlots = length(plots) - - # If layout is NULL, then use 'cols' to determine layout - if (is.null(layout)) { - # Make the panel - # ncol: Number of columns of plots - # nrow: Number of rows needed, calculated from # of cols - layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), - ncol = cols, nrow = ceiling(numPlots/cols)) - } - - if (numPlots==1) { - print(plots[[1]]) - - } else { - # Set up the page - grid.newpage() - pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) - - # Make each plot, in the correct location - for (i in 1:numPlots) { - # Get the i,j matrix positions of the regions that contain this subplot - matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) - - print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, - layout.pos.col = matchidx$col)) - } - } -} -ggsave(plot=multiplot(p1,p2,p3,p4,p5,p6,p7, cols=2), filename = paste(plot_title, "_plots.png", sep=""), width=16, height=20) - diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java b/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java index 8835707c556..bc7e7eb9f3e 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/CommandLineProgram.java @@ -28,6 +28,7 @@ import java.io.IOException; import java.net.InetAddress; import java.nio.file.*; +import java.nio.file.attribute.PosixFilePermission; import java.text.DecimalFormat; import java.time.Duration; import java.time.ZonedDateTime; @@ -167,6 +168,10 @@ public Object instanceMainPostParseArgs() { final Path p = tmpDir.toPath(); try { p.getFileSystem().provider().checkAccess(p, AccessMode.READ, AccessMode.WRITE); + + // Warn if there's anything that prevents execution in the tmp dir because some tools need that + tryToWriteAnExecutableFileAndWarnOnFailure(p); + System.setProperty("java.io.tmpdir", IOUtils.getAbsolutePathWithoutFileProtocol(p)); } catch (final AccessDeniedException | NoSuchFileException e) { // TODO: it may be that the program does not need a tmp dir @@ -494,4 +499,49 @@ public final CommandLineParser getCommandLineParser() { protected interface AutoCloseableNoCheckedExceptions extends AutoCloseable{ @Override void close(); } + + private void tryToWriteAnExecutableFileAndWarnOnFailure(final Path p) { + Path tempFilePath = null; + try { + // This test relies on the file system supporting posix file permissions + if(p.getFileSystem().supportedFileAttributeViews().contains("posix")) { + // Write an empty file to the tempdir + tempFilePath = Files.createTempFile(p, "gatk_exec_test", null); + // Add execute permissions + final Set executePermissions = EnumSet.of( + PosixFilePermission.OWNER_EXECUTE, + PosixFilePermission.GROUP_EXECUTE, + PosixFilePermission.OTHERS_EXECUTE + ); + final Set newPermissions = Files.getPosixFilePermissions(tempFilePath); + newPermissions.addAll(executePermissions); + + Files.setPosixFilePermissions(tempFilePath, newPermissions); + if(!Files.isExecutable(tempFilePath)) { + logger.warn( + "User has permissions to create executable files within the configured temporary directory, " + + "but cannot execute those files. It is possible the directory has been mounted using the " + + "'noexec' flag. This can cause issues for some GATK tools. 
You can specify a different " + + "directory using --tmp-dir" + ); + } + } + } catch(Exception e) { + logger.warn( + "Cannot create executable files within the configured temporary directory. It is possible " + + "this user does not have the proper permissions to execute files within this directory. " + + "This can cause issues for some GATK tools. You can specify a different directory using " + + "--tmp-dir" + ); + logger.debug(e); + } finally { + // Make sure we clean up the test file + try { + Files.deleteIfExists(tempFilePath); + } catch(Exception e) { + logger.warn("Failed to delete temp file for testing temp dir", e); + } + } + + } } diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/DeprecatedToolsRegistry.java b/src/main/java/org/broadinstitute/hellbender/cmdline/DeprecatedToolsRegistry.java index 2d9f7da9099..1d3f00147fb 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/DeprecatedToolsRegistry.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/DeprecatedToolsRegistry.java @@ -22,6 +22,14 @@ public class DeprecatedToolsRegistry { // Indicate version in which the tool disappeared, and recommended replacement in parentheses if applicable deprecatedTools.put("IndelRealigner", Pair.of("4.0.0.0", "Please use GATK3 to run this tool")); deprecatedTools.put("RealignerTargetCreator", Pair.of("4.0.0.0", "Please use GATK3 to run this tool")); + deprecatedTools.put("CNNScoreVariants", Pair.of("4.6.1.0", + "Please use the replacement tool NVScoreVariants instead, which produces virtually identical results")); + deprecatedTools.put("CNNVariantTrain", Pair.of("4.6.1.0", + "Please use a version of GATK prior to 4.6.1.0 to run this tool, " + + "or wait for the forthcoming Pytorch-based training tool for NVScoreVariants to be released")); + deprecatedTools.put("CNNVariantWriteTensors", Pair.of("4.6.1.0", + "Please use a version of GATK prior to 4.6.1.0 to run this tool, " + + "or wait for the forthcoming Pytorch-based training tool for NVScoreVariants to be released")); } /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java index 95d02fdd7c4..892571a6547 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/annotator/VariantAnnotator.java @@ -91,8 +91,8 @@ * --expression foo.FILTER * * - *

- * <h3>Caveat</h3>
- * <p>This tool outputs no annotations by default, all annotations/groups must be specified explicitly.</p>
+ * <h3>Caveats</h3>
+ * <p>This tool outputs no annotations by default, all annotations/groups must be specified explicitly. This tool accepts VCF format files only. Using GVCF files as input may result in unexpected behavior.</p>
  *
  * <h3>Special note on RankSumTestAnnotations</h3>
  * <p>RankSumAnnotations produced by this tool are not the same as those produced by the HaplotypeCaller. Without the
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaAlternateReferenceMaker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaAlternateReferenceMaker.java
index 2c3b38f0a9d..0406ad861d5 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaAlternateReferenceMaker.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/fasta/FastaAlternateReferenceMaker.java
@@ -41,6 +41,7 @@ *
  *     <li>If there are multiple variants that start at a site, it chooses one of them randomly.</li>
  *     <li>When there are overlapping indels (but with different start positions) only the first will be chosen.</li>
  *     <li>This tool works only for SNPs and for simple indels (but not for things like complex substitutions).</li>
+ *     <li>This tool works only with VCF files. Using GVCF files as input may result in unexpected behavior.</li>
  * </ul>
  *
  * <h3>Input</h3>
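For context on the temporary-directory check added to CommandLineProgram.java earlier in this patch: the new code writes a probe file into the configured tmp dir, adds execute permissions, and warns if the file still is not executable (typical of a directory mounted with 'noexec'), because some GATK tools need to run executables from the tmp dir. Below is a minimal standalone sketch of that probe; it assumes a POSIX-capable file system, and the class and method names are illustrative rather than part of the patch.

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.attribute.PosixFilePermission;
    import java.util.EnumSet;
    import java.util.Set;

    public final class TmpDirExecProbe {

        // Returns true if a file created in tmpDir can be marked executable and is then
        // actually executable. A 'noexec' mount typically accepts the permission change
        // but Files.isExecutable() still reports false.
        static boolean canCreateExecutableFiles(final Path tmpDir) throws IOException {
            if (!tmpDir.getFileSystem().supportedFileAttributeViews().contains("posix")) {
                return true; // cannot test without POSIX permissions; assume the directory is usable
            }
            final Path probe = Files.createTempFile(tmpDir, "gatk_exec_test", null);
            try {
                final Set<PosixFilePermission> permissions = Files.getPosixFilePermissions(probe);
                permissions.addAll(EnumSet.of(
                        PosixFilePermission.OWNER_EXECUTE,
                        PosixFilePermission.GROUP_EXECUTE,
                        PosixFilePermission.OTHERS_EXECUTE));
                Files.setPosixFilePermissions(probe, permissions);
                return Files.isExecutable(probe);
            } finally {
                Files.deleteIfExists(probe); // always clean up the probe file
            }
        }

        public static void main(final String[] args) throws IOException {
            final Path tmp = Path.of(System.getProperty("java.io.tmpdir"));
            System.out.println("tmp dir supports executable files: " + canCreateExecutableFiles(tmp));
        }
    }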

    diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java deleted file mode 100644 index 88e71d7df40..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java +++ /dev/null @@ -1,562 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.vqsr; - -import java.util.*; -import java.io.File; -import java.io.IOException; -import java.util.stream.Collectors; -import java.io.UnsupportedEncodingException; - -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; - -import org.broadinstitute.hellbender.engine.*; -import org.broadinstitute.barclay.argparser.*; -import org.broadinstitute.hellbender.engine.filters.*; -import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.utils.downsampling.ReadsDownsamplingIterator; -import org.broadinstitute.hellbender.utils.downsampling.ReservoirDownsampler; -import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter; -import org.broadinstitute.hellbender.utils.io.IOUtils; -import org.broadinstitute.hellbender.utils.io.Resource; -import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.utils.read.GATKRead; -import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; -import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.utils.runtime.AsynchronousStreamWriter; -import org.broadinstitute.hellbender.utils.python.StreamingPythonScriptExecutor; - -import picard.cmdline.programgroups.VariantFilteringProgramGroup; - -import com.intel.gkl.IntelGKLUtils; - -/** - * Annotate a VCF with scores from a Convolutional Neural Network (CNN). - * - * This tool streams variants and their reference context to a python program, - * which evaluates a pre-trained neural network on each variant. - * The default models were trained on single-sample VCFs. - * The default model should not be used on VCFs with annotations from joint call-sets. - * - * The neural network performs convolutions over the reference sequence surrounding the variant - * and combines those features with a multilayer perceptron on the variant annotations. - * - * 2D models convolve over aligned reads as well as the reference sequence, and variant annotations. - * 2D models require a SAM/BAM file as input and for the --tensor-type argument to be set - * to a tensor type which requires reads, as in the example below. - * - * Pre-trained 1D and 2D models are included in the distribution. - * It is possible to train your own models with the tools: - * {@link CNNVariantWriteTensors} and {@link CNNVariantTrain}. - * CNNVariantTrain will create a json architecture file and an hd5 weights file, which you can use with this tool. - * - * The advanced argument `info-annotation-keys` is available for models trained with different sets info field annotations. - * In order to do this you must first train your own model with the tools {@link CNNVariantWriteTensors} and {@link CNNVariantTrain}. 
    - * Otherwise, providing this argument with anything but the standard set of annotations will result in an error.
    - *
    - *
    - * <h3>1D Model with pre-trained architecture</h3>
    - *
    - * <pre>
    - * gatk CNNScoreVariants \
    - *   -V vcf_to_annotate.vcf.gz \
    - *   -R reference.fasta \
    - *   -O annotated.vcf
    - * </pre>
    - *
    - * <h3>2D Model with pre-trained architecture</h3>
    - *
    - * <pre>
    - * gatk CNNScoreVariants \
    - *   -I aligned_reads.bam \
    - *   -V vcf_to_annotate.vcf.gz \
    - *   -R reference.fasta \
    - *   -O annotated.vcf \
    - *   -tensor-type read-tensor
    - * </pre>
    - *
    - * <h3>1D Model with user-supplied architecture and weights:</h3>
    - *
    - * <pre>
    - * gatk CNNScoreVariants \
    - *   -V vcf_to_annotate.vcf.gz \
    - *   -R reference.fasta \
    - *   -O annotated.vcf \
    - *   -architecture path/to/my_model_folder/1dmodel.json
    - *   -weights path/to/my_model_folder/1dmodel.hd5
    - * </pre>
    - *
    - * <h3>2D Model with user-supplied model architecture and weights:</h3>
    - *
    - * <pre>
    - * gatk CNNScoreVariants \
    - *   -I aligned_reads.bam \
    - *   -V vcf_to_annotate.vcf.gz \
    - *   -R reference.fasta \
    - *   -O annotated.vcf \
    - *   -tensor-type read-tensor \
    - *   -architecture path/to/my_model_folder/2dmodel.json
    - *   -weights path/to/my_model_folder/2dmodel.hd5
    - * 
    - */ -@DeprecatedFeature -@DocumentedFeature -@CommandLineProgramProperties( - summary = CNNScoreVariants.USAGE_SUMMARY, - oneLineSummary = CNNScoreVariants.USAGE_ONE_LINE_SUMMARY, - programGroup = VariantFilteringProgramGroup.class -) - -public class CNNScoreVariants extends TwoPassVariantWalker { - private final static String NL = String.format("%n"); - static final String USAGE_ONE_LINE_SUMMARY = "Apply a Convolutional Neural Net to filter annotated variants"; - static final String USAGE_SUMMARY = "Annotate a VCF with scores from a Convolutional Neural Network (CNN)." + - "The CNN determines a Log Odds Score for each variant." + - "Pre-trained models (1D or 2D) are specified via the architecture argument." + - "1D models will look at the reference sequence and variant annotations." + - "2D models look at aligned reads, reference sequence, and variant annotations." + - "2D models require a BAM file as input as well as the tensor-type argument to be set."; - static final String DISABLE_AVX_CHECK_NAME = "disable-avx-check"; - static final String AVXREQUIRED_ERROR = "This tool requires AVX instruction set support by default due to its dependency on recent versions of the TensorFlow library.\n" + - " If you have an older (pre-1.6) version of TensorFlow installed that does not require AVX you may attempt to re-run the tool with the %s argument to bypass this check.\n" + - " Note that such configurations are not officially supported."; - - private static final int CONTIG_INDEX = 0; - private static final int POS_INDEX = 1; - private static final int REF_INDEX = 2; - private static final int ALT_INDEX = 3; - private static final int KEY_INDEX = 4; - private static final int FIFO_STRING_INITIAL_CAPACITY = 1024; - private static final int MAX_BATCH_SIZE_1D = 1024; - private static final int MAX_BATCH_SIZE_2D = 64; - - // These constants correspond to constants in the python code set in defines.py. They must be kept in sync. 
- private static final String DATA_VALUE_SEPARATOR = ","; // If changed make change in defines.py - private static final String DATA_TYPE_SEPARATOR = "\t"; // If changed make change in defines.py - private static final String ANNOTATION_SEPARATOR = ";"; // If changed make change in defines.py - private static final String ANNOTATION_SET_STRING = "=";// If changed make change in defines.py - - private List defaultAnnotationKeys = new ArrayList<>(Arrays.asList("MQ", "DP", "SOR", "FS", "QD", "MQRankSum", "ReadPosRankSum")); - - @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, - shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, - doc = "Output file") - private GATKPath outputFile; - - @Argument(fullName = "architecture", shortName = "architecture", doc = "Neural Net architecture configuration json file", optional = true) - private String architecture; - - @Argument(fullName = "weights", shortName = "weights", doc = "Keras model HD5 file with neural net weights.", optional = true) - private String weights; - - @Argument(fullName = "tensor-type", shortName = "tensor-type", doc = "Name of the tensors to generate, reference for 1D reference tensors and read_tensor for 2D tensors.", optional = true) - private TensorType tensorType = TensorType.reference; - - @Argument(fullName = "window-size", shortName = "window-size", doc = "Neural Net input window size", minValue = 0, optional = true) - private int windowSize = 128; - - @Argument(fullName = "read-limit", shortName = "read-limit", doc = "Maximum number of reads to encode in a tensor, for 2D models only.", minValue = 0, optional = true) - private int readLimit = 128; - - @Argument(fullName = "filter-symbolic-and-sv", shortName = "filter-symbolic-and-sv", doc = "If set will filter symbolic and and structural variants from the input VCF", optional = true) - private boolean filterSymbolicAndSV = false; - - @Advanced - @Argument(fullName="info-annotation-keys", shortName="info-annotation-keys", doc="The VCF info fields to send to python. 
This should only be changed if a new model has been trained which expects the annotations provided here.", optional=true) - private List annotationKeys = defaultAnnotationKeys; - - @Advanced - @Argument(fullName = "inference-batch-size", shortName = "inference-batch-size", doc = "Size of batches for python to do inference on.", minValue = 1, maxValue = 4096, optional = true) - private int inferenceBatchSize = 256; - - @Advanced - @Argument(fullName = "transfer-batch-size", shortName = "transfer-batch-size", doc = "Size of data to queue for python streaming.", minValue = 1, maxValue = 8192, optional = true) - private int transferBatchSize = 512; - - @Advanced - @Argument(fullName = "inter-op-threads", shortName = "inter-op-threads", doc = "Number of inter-op parallelism threads to use for Tensorflow", minValue = 0, maxValue = 4096, optional = true) - private int interOpThreads = 0; - - @Advanced - @Argument(fullName = "intra-op-threads", shortName = "intra-op-threads", doc = "Number of intra-op parallelism threads to use for Tensorflow", minValue = 0, maxValue = 4096, optional = true) - private int intraOpThreads = 0; - - @Advanced - @Argument(fullName = "output-tensor-dir", shortName = "output-tensor-dir", doc = "Optional directory where tensors can be saved for debugging or visualization.", optional = true) - private String outputTensorsDir = ""; - - @Advanced - @Argument(fullName = DISABLE_AVX_CHECK_NAME, shortName = DISABLE_AVX_CHECK_NAME, doc = "If set, no check will be made for AVX support. " + - "Use only if you have installed a pre-1.6 TensorFlow build. ", optional = true) - private boolean disableAVXCheck = false; - - @Hidden - @Argument(fullName = "enable-journal", shortName = "enable-journal", doc = "Enable streaming process journal.", optional = true) - private boolean enableJournal = false; - - @Hidden - @Argument(fullName = "keep-temp-file", shortName = "keep-temp-file", doc = "Keep the temporary file that python writes scores to.", optional = true) - private boolean keepTempFile = false; - - @Hidden - @Argument(fullName = "python-profile", shortName = "python-profile", doc = "Run the tool with the Python CProfiler on and write results to this file.", optional = true) - private File pythonProfileResults; - - private StreamingPythonScriptExecutor pythonExecutor; - - private List batchList = new ArrayList<>(inferenceBatchSize); - - private int curBatchSize = 0; - private int windowEnd = windowSize / 2; - private int windowStart = windowSize / 2; - private boolean waitforBatchCompletion = false; - - private File scoreFile; // use java.io.File here because python code needs to write to this - private String scoreKey; - private Scanner scoreScan; - private VariantContextWriter vcfWriter; - private String annotationSetString; - - private static String resourcePathReadTensor = Resource.LARGE_RUNTIME_RESOURCES_PATH + "/cnn_score_variants/small_2d.json"; - private static String resourcePathReferenceTensor = Resource.LARGE_RUNTIME_RESOURCES_PATH + "/cnn_score_variants/1d_cnn_mix_train_full_bn.json"; - - @Override - protected String[] customCommandLineValidation() { - if (tensorType.equals(TensorType.read_tensor)){ - transferBatchSize = Math.max(transferBatchSize, MAX_BATCH_SIZE_2D); - inferenceBatchSize = Math.max(inferenceBatchSize, MAX_BATCH_SIZE_2D); - } else if (tensorType.equals(TensorType.reference)){ - transferBatchSize = Math.max(transferBatchSize, MAX_BATCH_SIZE_1D); - inferenceBatchSize = Math.max(inferenceBatchSize, MAX_BATCH_SIZE_1D); - } - - if (inferenceBatchSize > 
transferBatchSize) { - return new String[]{"Inference batch size must be less than or equal to transfer batch size."}; - } - - if (architecture == null || weights == null){ - if (!tensorType.equals(TensorType.read_tensor) && !tensorType.equals(TensorType.reference)){ - return new String[]{"No default architecture for tensor type:" + tensorType.name()}; - } - } - return null; - } - - @Override - public boolean requiresReference() { - return true; - } - - @Override - protected CountingVariantFilter makeVariantFilter() { - return new CountingVariantFilter( - filterSymbolicAndSV ? - VariantFilterLibrary.NOT_SV_OR_SYMBOLIC: - VariantFilterLibrary.ALLOW_ALL_VARIANTS - ); - } - - @Override - public List getDefaultReadFilters() { - List readFilters = new ArrayList<>(); - readFilters.addAll(super.getDefaultReadFilters()); - List filterList = new ArrayList<>(); - filterList.add("ID:" + HaplotypeBAMWriter.DEFAULT_HAPLOTYPE_READ_GROUP_ID); - filterList.add("ID:" + HaplotypeBAMWriter.DEFAULT_GATK3_HAPLOTYPE_READ_GROUP_ID); - readFilters.add(new ReadGroupBlackListReadFilter(filterList, null)); - return readFilters; - } - - @Override - public void onTraversalStart() { - // Users can disable the AVX check to allow an older version of TF that doesn't require AVX to be used. - if(this.disableAVXCheck == false) { - IntelGKLUtils utils = new IntelGKLUtils(); - utils.load(null); - if (utils.isAvxSupported() == false) { - // Give user the bad news, suggest remedies. - throw new UserException.HardwareFeatureException(String.format(CNNScoreVariants.AVXREQUIRED_ERROR, DISABLE_AVX_CHECK_NAME)); - } - } - - // Create the Python executor. This doesn't actually start the Python process, but verifies that - // the requestedPython executable exists and can be located. - pythonExecutor = new StreamingPythonScriptExecutor<>(true); - - final VCFHeader inputHeader = getHeaderForVariants(); - if (inputHeader.getGenotypeSamples().size() > 1) { - logger.warn("CNNScoreVariants is a single sample tool but the input VCF has more than 1 sample."); - } - - if (!annotationKeys.equals(defaultAnnotationKeys)){ - logger.warn("Annotation keys are not the default you must also provide a trained model that expects these annotations."); - } - - // Start the Python process and initialize a stream writer for streaming data to the Python code - pythonExecutor.start(Collections.emptyList(), enableJournal, pythonProfileResults); - pythonExecutor.initStreamWriter(AsynchronousStreamWriter.stringSerializer); - - batchList = new ArrayList<>(transferBatchSize); - - // Execute Python code to open our output file, where it will write the contents of everything it reads - // from the stream. 
- try { - // create a local temp that python code can write to - scoreFile = File.createTempFile(outputFile.getBaseName().get(), ".temp"); - if (!keepTempFile) { - scoreFile.deleteOnExit(); - } else { - logger.info("Saving temp file from python:" + scoreFile.getAbsolutePath()); - } - pythonExecutor.sendSynchronousCommand(String.format("tempFile = open('%s', 'w+')" + NL, scoreFile.getAbsolutePath())); - pythonExecutor.sendSynchronousCommand("import vqsr_cnn" + NL); - - scoreKey = getScoreKeyAndCheckModelAndReadsHarmony(); - annotationSetString = annotationKeys.stream().collect(Collectors.joining(DATA_VALUE_SEPARATOR)); - initializePythonArgsAndModel(); - } catch (IOException e) { - throw new GATKException("Error when creating temp file and initializing python executor.", e); - } - - } - - @Override - public void firstPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { - referenceContext.setWindow(windowStart, windowEnd); - if (tensorType.isReadsRequired()) { - transferReadsToPythonViaFifo(variant, readsContext, referenceContext); - } else { - transferToPythonViaFifo(variant, referenceContext); - } - sendBatchIfReady(); - } - - @Override - public void afterFirstPass() { - if (waitforBatchCompletion) { - pythonExecutor.waitForPreviousBatchCompletion(); - } - if (curBatchSize > 0) { - executePythonCommand(); - pythonExecutor.waitForPreviousBatchCompletion(); - } - - pythonExecutor.sendSynchronousCommand("tempFile.close()" + NL); - pythonExecutor.terminate(); - - try { - scoreScan = new Scanner(scoreFile); - vcfWriter = createVCFWriter(outputFile); - scoreScan.useDelimiter("\\n"); - writeVCFHeader(vcfWriter); - } catch (IOException e) { - throw new GATKException("Error when trying to temporary score file scanner.", e); - } - - } - - @Override - protected void secondPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { - String sv = scoreScan.nextLine(); - String[] scoredVariant = sv.split("\\t"); - - if (variant.getContig().equals(scoredVariant[CONTIG_INDEX]) - && Integer.toString(variant.getStart()).equals(scoredVariant[POS_INDEX]) - && variant.getReference().getBaseString().equals(scoredVariant[REF_INDEX]) - && variant.getAlternateAlleles().toString().equals(scoredVariant[ALT_INDEX])) { - - final VariantContextBuilder builder = new VariantContextBuilder(variant); - if (scoredVariant.length > KEY_INDEX) { - builder.attribute(scoreKey, scoredVariant[KEY_INDEX]); - } - vcfWriter.add(builder.make()); - - } else { - String errorMsg = "Score file out of sync with original VCF. Score file has:" + sv; - errorMsg += "\n But VCF has:" + variant.toStringWithoutGenotypes(); - throw new GATKException(errorMsg); - } - } - - @Override - public void closeTool() { - logger.info("Done scoring variants with CNN."); - if (vcfWriter != null) { - vcfWriter.close(); - } - if (scoreScan != null){ - scoreScan.close(); - } - } - - private void transferToPythonViaFifo(final VariantContext variant, final ReferenceContext referenceContext) { - try { - final String outDat = String.format("%s%s%s%s%s%s%s\n", - getVariantDataString(variant), DATA_TYPE_SEPARATOR, - new String(Arrays.copyOfRange(referenceContext.getBases(), 0, windowSize), "UTF-8"), DATA_TYPE_SEPARATOR, - getVariantInfoString(variant), DATA_TYPE_SEPARATOR, - variant.isSNP() ? "SNP" : variant.isIndel() ? 
"INDEL" : "OTHER"); - batchList.add(outDat); - curBatchSize++; - } catch (UnsupportedEncodingException e) { - throw new GATKException("Trying to make string from reference, but unsupported encoding UTF-8.", e); - } - - } - - private void sendBatchIfReady() { - if (curBatchSize == transferBatchSize) { - if (waitforBatchCompletion == true) { - // wait for the last batch to complete before we start a new one - pythonExecutor.waitForPreviousBatchCompletion(); - waitforBatchCompletion = false; - } - executePythonCommand(); - waitforBatchCompletion = true; - curBatchSize = 0; - batchList = new ArrayList<>(transferBatchSize); - } - } - - private void transferReadsToPythonViaFifo(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext) { - StringBuilder sb = new StringBuilder(FIFO_STRING_INITIAL_CAPACITY); - try { - sb.append(String.format("%s%s%s%s%s%s%s%s", - getVariantDataString(variant), DATA_TYPE_SEPARATOR, - new String(Arrays.copyOfRange(referenceContext.getBases(), 0, windowSize), "UTF-8"), DATA_TYPE_SEPARATOR, - getVariantInfoString(variant), DATA_TYPE_SEPARATOR, - variant.isSNP() ? "SNP" : variant.isIndel() ? "INDEL" : "OTHER", DATA_TYPE_SEPARATOR)); - } catch (UnsupportedEncodingException e) { - throw new GATKException("Trying to make string from reference, but unsupported encoding UTF-8.", e); - } - Iterator readIt = new ReadsDownsamplingIterator(readsContext.iterator(), new ReservoirDownsampler(readLimit)); - if (!readIt.hasNext()) { - logger.warn("No reads at contig:" + variant.getContig() + " site:" + String.valueOf(variant.getStart())); - } - - while (readIt.hasNext()) { - sb.append(GATKReadToString(readIt.next())); - } - sb.append(NL); - batchList.add(sb.toString()); - curBatchSize++; - } - - private String GATKReadToString(GATKRead read) { - StringBuilder sb = new StringBuilder(FIFO_STRING_INITIAL_CAPACITY); - sb.append(read.getBasesString() + DATA_TYPE_SEPARATOR); - - appendQualityBytes(sb, read.getBaseQualities()); - sb.append(read.getCigar().toString() + DATA_TYPE_SEPARATOR); - sb.append(read.isReverseStrand() + DATA_TYPE_SEPARATOR); - sb.append((read.isPaired() ? 
read.mateIsReverseStrand() : "false") + DATA_TYPE_SEPARATOR); - sb.append(read.isFirstOfPair() + DATA_TYPE_SEPARATOR); - sb.append(read.getMappingQuality() + DATA_TYPE_SEPARATOR); - sb.append(Integer.toString(read.getUnclippedStart()) + DATA_TYPE_SEPARATOR); - return sb.toString(); - } - - private void appendQualityBytes(StringBuilder sb, byte[] qualities) { - if(qualities.length == 0) { - sb.append(DATA_TYPE_SEPARATOR); - return; - } - - for (int i = 0; i < qualities.length - 1; i++) { - sb.append(Integer.toString(qualities[i]) + DATA_VALUE_SEPARATOR); - } - sb.append(Integer.toString(qualities[qualities.length - 1]) + DATA_TYPE_SEPARATOR); - } - - private String getVariantDataString(final VariantContext variant) { - return String.format("%s%s%d%s%s%s%s", - variant.getContig(), DATA_TYPE_SEPARATOR, - variant.getStart(), DATA_TYPE_SEPARATOR, - variant.getReference().getBaseString(), DATA_TYPE_SEPARATOR, - variant.getAlternateAlleles().toString() - ); - } - - private String getVariantInfoString(final VariantContext variant) { - // Create a string that will easily be parsed as a python dictionary - StringBuilder sb = new StringBuilder(FIFO_STRING_INITIAL_CAPACITY); - for (final String attributeKey : annotationKeys) { - if (variant.hasAttribute(attributeKey)) { - sb.append(attributeKey); - sb.append(ANNOTATION_SET_STRING); - sb.append(variant.getAttributeAsString(attributeKey, "0")); - sb.append(ANNOTATION_SEPARATOR); - } - } - return sb.toString(); - } - - private void executePythonCommand() { - final String pythonCommand = String.format( - "vqsr_cnn.score_and_write_batch(model, tempFile, %d, %d, '%s', '%s', %d, %d, '%s')", - curBatchSize, - inferenceBatchSize, - tensorType, - annotationSetString, - windowSize, - readLimit, - outputTensorsDir) + NL; - pythonExecutor.startBatchWrite(pythonCommand, batchList); - } - - private void writeVCFHeader(VariantContextWriter vcfWriter) { - // setup the header fields - final VCFHeader inputHeader = getHeaderForVariants(); - final Set inputHeaders = inputHeader.getMetaDataInSortedOrder(); - final Set hInfo = new HashSet<>(inputHeaders); - hInfo.add(GATKVCFHeaderLines.getInfoLine(scoreKey)); - final TreeSet samples = new TreeSet<>(); - samples.addAll(inputHeader.getGenotypeSamples()); - hInfo.addAll(getDefaultToolVCFHeaderLines()); - final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); - vcfWriter.writeHeader(vcfHeader); - } - - private String getScoreKeyAndCheckModelAndReadsHarmony() { - if (tensorType.isReadsRequired() && this.hasReads()) { - return GATKVCFConstants.CNN_2D_KEY; - } else if (!tensorType.isReadsRequired() && this.hasReads()) { - logger.warn(String.format("Reads are available, but tensor type %s does not use them.", tensorType.name())); - return GATKVCFConstants.CNN_1D_KEY; - } else if (!tensorType.isReadsRequired()) { - return GATKVCFConstants.CNN_1D_KEY; - } else { - throw new GATKException("2D Models require a SAM/BAM file specified via -I (-input) argument."); - } - } - - private void initializePythonArgsAndModel() { - if (architecture == null && weights == null) { - if (tensorType.equals(TensorType.read_tensor)) { - architecture = IOUtils.writeTempResourceFromPath(resourcePathReadTensor, null).getAbsolutePath(); - weights = IOUtils.writeTempResourceFromPath( - resourcePathReadTensor.replace(".json", ".hd5"), - null).getAbsolutePath(); - } else if (tensorType.equals(TensorType.reference)) { - architecture = IOUtils.writeTempResourceFromPath(resourcePathReferenceTensor, null).getAbsolutePath(); - weights = 
IOUtils.writeTempResourceFromPath( - resourcePathReferenceTensor.replace(".json", ".hd5"), null).getAbsolutePath(); - } else { - throw new GATKException("No default architecture for tensor type:" + tensorType.name()); - } - } else if (weights == null) { - weights = architecture.replace(".json", ".hd5"); - } else if (architecture == null) { - architecture = weights.replace(".hd5", ".json"); - } - - String getArgsAndModel = String.format("args, model = vqsr_cnn.start_session_get_args_and_model(%d, %d, '%s', weights_hd5='%s')", - intraOpThreads, interOpThreads, architecture, weights) + NL; - logger.info("Using key:" + scoreKey + " for CNN architecture:" + architecture + " and weights:" + weights); - pythonExecutor.sendSynchronousCommand(getArgsAndModel); - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java deleted file mode 100644 index 8fe917a2cff..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantTrain.java +++ /dev/null @@ -1,247 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.vqsr; - -import org.broadinstitute.barclay.argparser.*; -import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.cmdline.CommandLineProgram; -import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.utils.io.Resource; -import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; -import picard.cmdline.programgroups.VariantFilteringProgramGroup; - - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - - -/** - * Train a Convolutional Neural Network (CNN) for filtering variants. - * This tool expects requires training data generated by {@link CNNVariantWriteTensors}. - * - * - *

    - * <h3>Inputs</h3>
    - * <ul>
    - *      <li>data-dir The training data created by {@link CNNVariantWriteTensors}.</li>
    - *      <li>The --tensor-type argument determines what types of tensors the model will expect.
    - *      Set it to "reference" for 1D tensors or "read_tensor" for 2D tensors.</li>
    - * </ul>

    - * <h3>Outputs</h3>
    - * <ul>
    - *      <li>output-dir The model weights file and semantic configuration json are saved here.
    - *      This default to the current working directory.</li>
    - *      <li>model-name The name for your model.</li>
    - * </ul>

    - * <h3>Usage example</h3>
    - *
    - * <h4>Train a 1D CNN on Reference Tensors</h4>
    - * <pre>
    - * gatk CNNVariantTrain \
    - *   -tensor-type reference \
    - *   -input-tensor-dir my_tensor_folder \
    - *   -model-name my_1d_model
    - * </pre>
    - *
    - * <h4>Train a 2D CNN on Read Tensors</h4>
    - * <pre>
    - * gatk CNNVariantTrain \
    - *   -input-tensor-dir my_tensor_folder \
    - *   -tensor-type read-tensor \
    - *   -model-name my_2d_model
    - * </pre>
    - * - */ -@CommandLineProgramProperties( - summary = "Train a CNN model for filtering variants", - oneLineSummary = "Train a CNN model for filtering variants", - programGroup = VariantFilteringProgramGroup.class -) -@DeprecatedFeature -@DocumentedFeature -public class CNNVariantTrain extends CommandLineProgram { - - @Argument(fullName = "input-tensor-dir", shortName = "input-tensor-dir", doc = "Directory of training tensors to create.") - private String inputTensorDir; - - @Argument(fullName = "output-dir", shortName = "output-dir", doc = "Directory where models will be saved, defaults to current working directory.", optional = true) - private String outputDir = "./"; - - @Argument(fullName = "tensor-type", shortName = "tensor-type", doc = "Type of tensors to use as input reference for 1D reference tensors and read_tensor for 2D tensors.", optional = true) - private TensorType tensorType = TensorType.reference; - - @Argument(fullName = "model-name", shortName = "model-name", doc = "Name of the model to be trained.", optional = true) - private String modelName = "variant_filter_model"; - - @Argument(fullName = "epochs", shortName = "epochs", doc = "Maximum number of training epochs.", optional = true, minValue = 0) - private int epochs = 10; - - @Argument(fullName = "training-steps", shortName = "training-steps", doc = "Number of training steps per epoch.", optional = true, minValue = 0) - private int trainingSteps = 10; - - @Argument(fullName = "validation-steps", shortName = "validation-steps", doc = "Number of validation steps per epoch.", optional = true, minValue = 0) - private int validationSteps = 2; - - @Argument(fullName = "image-dir", shortName = "image-dir", doc = "Path where plots and figures are saved.", optional = true) - private String imageDir; - - @Argument(fullName = "conv-width", shortName = "conv-width", doc = "Width of convolution kernels", optional = true) - private int convWidth = 5; - - @Argument(fullName = "conv-height", shortName = "conv-height", doc = "Height of convolution kernels", optional = true) - private int convHeight = 5; - - @Argument(fullName = "conv-dropout", shortName = "conv-dropout", doc = "Dropout rate in convolution layers", optional = true) - private float convDropout = 0.0f; - - @Argument(fullName = "conv-batch-normalize", shortName = "conv-batch-normalize", doc = "Batch normalize convolution layers", optional = true) - private boolean convBatchNormalize = false; - - @Argument(fullName = "conv-layers", shortName = "conv-layers", doc = "List of number of filters to use in each convolutional layer", optional = true) - private List convLayers = new ArrayList(); - - @Argument(fullName = "padding", shortName = "padding", doc = "Padding for convolution layers, valid or same", optional = true) - private String padding = "valid"; - - @Argument(fullName = "spatial-dropout", shortName = "spatial-dropout", doc = "Spatial dropout on convolution layers", optional = true) - private boolean spatialDropout = false; - - @Argument(fullName = "fc-layers", shortName = "fc-layers", doc = "List of number of filters to use in each fully-connected layer", optional = true) - private List fcLayers = new ArrayList(); - - @Argument(fullName = "fc-dropout", shortName = "fc-dropout", doc = "Dropout rate in fully-connected layers", optional = true) - private float fcDropout = 0.0f; - - @Argument(fullName = "fc-batch-normalize", shortName = "fc-batch-normalize", doc = "Batch normalize fully-connected layers", optional = true) - private boolean fcBatchNormalize = false; - - 
@Argument(fullName = "annotation-units", shortName = "annotation-units", doc = "Number of units connected to the annotation input layer", optional = true) - private int annotationUnits = 16; - - @Argument(fullName = "annotation-shortcut", shortName = "annotation-shortcut", doc = "Shortcut connections on the annotation layers.", optional = true) - private boolean annotationShortcut = false; - - // Optimizer parameters: - @Argument(fullName = "optimizer-learning-rate", shortName = "optimizer-learning-rate", doc = "Learning rate for the Adam optimizer.", optional = true) - private double optimizerLearningRate = 0.0001; - - @Argument(fullName = "optimizer-beta-1", shortName = "optimizer-beta-1", doc = "Beta 1 parameter for the Adam optimizer.", optional = true) - private double optimizerBeta1 = 0.9; - - @Argument(fullName = "optimizer-beta-2", shortName = "optimizer-beta-2", doc = "Beta 2 parameter for the Adam optimizer.", optional = true) - private double optimizerBeta2 = 0.999; - - @Argument(fullName = "optimizer-epsilon", shortName = "optimizer-epsilon", doc = "Epsilon parameter for the Adam optimizer.", optional = true) - private double optimizerEpsilon = 1e-08; - - @Argument(fullName = "optimizer-clipnorm", shortName = "optimizer-clipnorm", doc = "Clipnorm parameter for the Adam optimizer.", optional = true) - private double optimizerClipnorm = 1.0; - - @Advanced - @Argument(fullName = "channels-last", shortName = "channels-last", doc = "Store the channels in the last axis of tensors, tensorflow->true, theano->false", optional = true) - private boolean channelsLast = true; - - @Advanced - @Argument(fullName = "annotation-set", shortName = "annotation-set", doc = "Which set of annotations to use.", optional = true) - private String annotationSet = "best_practices"; - - private PythonScriptExecutor pythonExecutor; - - - @Override - protected void onStartup() { - PythonScriptExecutor.checkPythonEnvironmentForPackage("vqsr_cnn"); - // Start the Python executor. 
This does not actually start the Python process, but fails if python can't be located - pythonExecutor = new PythonScriptExecutor(true); - } - - @Override - protected Object doWork() { - - final Resource pythonScriptResource = new Resource("training.py", CNNVariantTrain.class); - List arguments = new ArrayList<>(Arrays.asList( - "--data_dir", inputTensorDir, - "--output_dir", outputDir, - "--tensor_name", tensorType.name(), - "--annotation_set", annotationSet, - "--conv_width", Integer.toString(convWidth), - "--conv_height", Integer.toString(convHeight), - "--conv_dropout", Float.toString(convDropout), - "--padding", padding, - "--fc_dropout", Float.toString(fcDropout), - "--annotation_units", Integer.toString(annotationUnits), - "--epochs", Integer.toString(epochs), - "--training_steps", Integer.toString(trainingSteps), - "--validation_steps", Integer.toString(validationSteps), - - "--optimizer_learning_rate", Double.toString(optimizerLearningRate), - "--optimizer_beta_1", Double.toString(optimizerBeta1), - "--optimizer_beta_2", Double.toString(optimizerBeta2), - "--optimizer_epsilon", Double.toString(optimizerEpsilon), - "--optimizer_clipnorm", Double.toString(optimizerClipnorm), - - "--gatk_version", this.getVersion(), - "--id", modelName)); - - // Add boolean arguments - if(channelsLast){ - arguments.add("--channels_last"); - } else { - arguments.add("--channels_first"); - } - - if(imageDir != null){ - arguments.addAll(Arrays.asList("--image_dir", imageDir)); - } - - if (convLayers.size() == 0 && fcLayers.size() == 0){ - if (tensorType == TensorType.reference) { - arguments.addAll(Arrays.asList("--mode", "train_default_1d_model")); - } else if (tensorType == TensorType.read_tensor) { - arguments.addAll(Arrays.asList("--mode", "train_default_2d_model")); - } else { - throw new GATKException("Unknown tensor mapping mode:"+ tensorType.name()); - } - } else { // Command line specified custom architecture - if(convBatchNormalize){ - arguments.add("--conv_batch_normalize"); - } - if(fcBatchNormalize){ - arguments.add("--fc_batch_normalize"); - } - if(spatialDropout){ - arguments.add("--spatial_dropout"); - } - if(annotationShortcut){ - arguments.add("--annotation_shortcut"); - } - - // Add list arguments - arguments.add("--conv_layers"); - for(Integer cl : convLayers){ - arguments.add(Integer.toString(cl)); - } - arguments.add("--fc_layers"); - for(Integer fl : fcLayers){ - arguments.add(Integer.toString(fl)); - } - - if (tensorType == TensorType.reference) { - arguments.addAll(Arrays.asList("--mode", "train_args_model_on_reference_and_annotations")); - } else if (tensorType == TensorType.read_tensor) { - arguments.addAll(Arrays.asList("--mode", "train_args_model_on_read_tensors_and_annotations")); - } else { - throw new GATKException("Unknown tensor mapping mode:"+ tensorType.name()); - } - } - - logger.info("Args are:"+ Arrays.toString(arguments.toArray())); - final boolean pythonReturnCode = pythonExecutor.executeScript( - pythonScriptResource, - null, - arguments - ); - return pythonReturnCode; - } - -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java deleted file mode 100644 index 49ce59bdd0c..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantWriteTensors.java +++ /dev/null @@ -1,169 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.vqsr; - -import org.broadinstitute.barclay.argparser.*; 
-import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.cmdline.CommandLineProgram; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.utils.io.Resource; -import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; -import picard.cmdline.programgroups.VariantFilteringProgramGroup; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Write variant tensors for training a Convolutional Neural Network (CNN) for filtering variants. - * After running this tool, a model can be trained with the {@link CNNVariantTrain} tool. - * - * - *

    - * <h3>Inputs</h3>
    - * <ul>
    - *      <li>The input variants to make into tensors.
    - *      These variant calls must be annotated with the standard best practices annotations.</li>
    - *      <li>The truth VCF has validated variant calls, like those in the genomes in a bottle,
    - *      platinum genomes, or CHM VCFs. Variants in both the input VCF and the truth VCF
    - *      will be used as positive training data.</li>
    - *      <li>The truth BED is a bed file define the confident region for the validated calls.
    - *      Variants from the input VCF inside this region, but not included in the truth VCF
    - *      will be used as negative training data.</li>
    - *      <li>The --tensor-type argument determines what types of tensors will be written.
    - *      Set it to "reference" to write 1D tensors or "read_tensor" to write 2D tensors.</li>
    - *      <li>The bam-file argument is necessary to write 2D tensors which incorporate read data.</li>
    - * </ul>

    - * <h3>Outputs</h3>
    - * <ul>
    - *      <li>data-dir This directory is created and populated with variant tensors.
    - *      it will be divided into training, validation and test sets and each set will be further divided into
    - *      positive and negative SNPs and INDELs.</li>
    - * </ul>

    - * <h3>Usage example</h3>
    - *
    - * <h4>Write Reference Tensors</h4>
    - * <pre>
    - * gatk CNNVariantWriteTensors \
    - *   -R reference.fasta \
    - *   -V input.vcf.gz \
    - *   -truth-vcf platinum-genomes.vcf \
    - *   -truth-bed platinum-confident-region.bed \
    - *   -tensor-type reference \
    - *   -output-tensor-dir my-tensor-folder
    - * </pre>
    - *
    - * <h4>Write Read Tensors</h4>
    - * <pre>
    - * gatk CNNVariantWriteTensors \
    - *   -R reference.fasta \
    - *   -V input.vcf.gz \
    - *   -truth-vcf platinum-genomes.vcf \
    - *   -truth-bed platinum-confident-region.bed \
    - *   -tensor-type read_tensor \
    - *   -bam-file input.bam \
    - *   -output-tensor-dir my-tensor-folder
    - * </pre>
    - * - */ -@CommandLineProgramProperties( - summary = "Write variant tensors for training a CNN to filter variants", - oneLineSummary = "Write variant tensors for training a CNN to filter variants", - programGroup = VariantFilteringProgramGroup.class -) -@DeprecatedFeature -@DocumentedFeature -public class CNNVariantWriteTensors extends CommandLineProgram { - - @Argument(fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME, - shortName = StandardArgumentDefinitions.REFERENCE_SHORT_NAME, - doc = "Reference fasta file.") - private String reference; - - @Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME, - shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, - doc = "Input VCF file") - private String inputVcf; - - @Argument(fullName = "output-tensor-dir", shortName = "output-tensor-dir", doc = "Directory of training tensors. Subdivided into train, valid and test sets.") - private String outputTensorsDir; - - @Argument(fullName = "truth-vcf", shortName = "truth-vcf", doc = "Validated VCF file.") - private String truthVcf; - - @Argument(fullName = "truth-bed", shortName = "truth-bed", doc = "Confident region of the validated VCF file.") - private String truthBed; - - @Argument(fullName = "bam-file", shortName = "bam-file", doc = "BAM or BAMout file to use for read data when generating 2D tensors.", optional = true) - private String bamFile = ""; - - @Argument(fullName = "tensor-type", shortName = "tensor-type", doc = "Name of the tensors to generate.") - private TensorType tensorType = TensorType.reference; - - @Argument(fullName = "downsample-snps", shortName = "downsample-snps", doc = "Fraction of SNPs to write tensors for.", optional = true) - private float downsampleSnps = 0.05f; - - @Argument(fullName = "downsample-indels", shortName = "downsample-indels", doc = "Fraction of INDELs to write tensors for.", optional = true) - private float downsampleIndels = 0.5f; - - @Advanced - @Argument(fullName = "channels-last", shortName = "channels-last", doc = "Store the channels in the last axis of tensors, tensorflow->true, theano->false", optional = true) - private boolean channelsLast = true; - - @Advanced - @Argument(fullName = "annotation-set", shortName = "annotation-set", doc = "Which set of annotations to use.", optional = true) - private String annotationSet = "best_practices"; - - @Argument(fullName = "max-tensors", shortName = "max-tensors", doc = "Maximum number of tensors to write.", optional = true, minValue = 0) - private int maxTensors = 1000000; - - private PythonScriptExecutor pythonExecutor; - - @Override - protected void onStartup() { - PythonScriptExecutor.checkPythonEnvironmentForPackage("vqsr_cnn"); - - // Start the Python executor. 
This does not actually start the Python process, but fails if python can't be located - pythonExecutor = new PythonScriptExecutor(true); - } - - @Override - protected Object doWork() { - - final Resource pythonScriptResource = new Resource("training.py", CNNVariantWriteTensors.class); - List arguments = new ArrayList<>(Arrays.asList( - "--reference_fasta", reference, - "--input_vcf", inputVcf, - "--bam_file", bamFile, - "--train_vcf", truthVcf, - "--bed_file", truthBed, - "--tensor_name", tensorType.name(), - "--annotation_set", annotationSet, - "--samples", Integer.toString(maxTensors), - "--downsample_snps", Float.toString(downsampleSnps), - "--downsample_indels", Float.toString(downsampleIndels), - "--data_dir", outputTensorsDir)); - - if(channelsLast){ - arguments.add("--channels_last"); - } else{ - arguments.add("--channels_first"); - } - - if (tensorType == TensorType.reference) { - arguments.addAll(Arrays.asList("--mode", "write_reference_and_annotation_tensors")); - } else if (tensorType == TensorType.read_tensor) { - arguments.addAll(Arrays.asList("--mode", "write_read_and_annotation_tensors")); - } else { - throw new GATKException("Unknown tensor mapping mode:"+ tensorType.name()); - } - - logger.info("Args are:"+ Arrays.toString(arguments.toArray())); - final boolean pythonReturnCode = pythonExecutor.executeScript( - pythonScriptResource, - null, - arguments - ); - return pythonReturnCode; - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java index f9ce6878427..b3c2cd9fa9d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/FilterVariantTranches.java @@ -28,7 +28,7 @@ /** * Apply tranche filtering to VCF based on scores from an annotation in the INFO field. - * The annotation can come from the {@link CNNScoreVariants} tool (CNNLOD), VQSR (VQSLOD), + * The annotation can come from the {@link NVScoreVariants} tool (CNN_1D or CNN_2D), VQSR (VQSLOD), * or any other variant scoring tool which adds numeric annotations in a VCF's INFO field. * * Tranches are specified in percent sensitivity to the variants in the resource files. 
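With the CNN Java tools removed, the CNN_1D and CNN_2D scores consumed by FilterVariantTranches come from NVScoreVariants. A minimal sketch of that two-step workflow, assuming NVScoreVariants takes the standard -V/-R/-O arguments and using placeholder resource file names:

    # Score variants with the PyTorch CNN; arguments are assumed to mirror the 1D
    # CNNScoreVariants invocation shown above, and the file names are placeholders.
    gatk NVScoreVariants \
      -V vcf_to_annotate.vcf.gz \
      -R reference.fasta \
      -O scored.vcf.gz

    # Apply tranche filtering keyed on the CNN_1D INFO annotation, using resource
    # VCFs of validated sites (placeholder names) to define the tranche thresholds.
    gatk FilterVariantTranches \
      -V scored.vcf.gz \
      --resource hapmap.vcf.gz \
      --resource mills.vcf.gz \
      --info-key CNN_1D \
      --snp-tranche 99.95 \
      --indel-tranche 99.4 \
      -O filtered.vcf.gz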
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/NVScoreVariants.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/NVScoreVariants.java index 3e91d83e4df..a2210fb4afd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/NVScoreVariants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/NVScoreVariants.java @@ -1,9 +1,6 @@ package org.broadinstitute.hellbender.tools.walkers.vqsr; -import org.broadinstitute.barclay.argparser.Advanced; -import org.broadinstitute.barclay.argparser.Argument; -import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; -import org.broadinstitute.barclay.argparser.ExperimentalFeature; +import org.broadinstitute.barclay.argparser.*; import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.exceptions.UserException; @@ -65,7 +62,7 @@ oneLineSummary = "Annotate a VCF with scores from a PyTorch-based Convolutional Neural Network (CNN)", programGroup = VariantFilteringProgramGroup.class ) -@ExperimentalFeature +@BetaFeature public class NVScoreVariants extends CommandLineProgram { public static final String NV_SCORE_VARIANTS_PACKAGE = "scorevariants"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java index 41e2bcf9526..5b2b44a3f50 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/VariantRecalibrator.java @@ -140,7 +140,8 @@ *

    * <h3>Additional notes</h3>
    * <ul>
    *     <li>This tool only accepts a single input variant file unlike earlier version of GATK, which accepted multiple
    - *     input variant files.</li>
    + *     input variant files.
    + *     </li>
    *     <li>The input VCF must be genotyped, raw GVCF files will not work correctly.</li>
    *     <li>SNPs and indels must be recalibrated in separate runs, but it is not necessary to separate them into different
    *     files. See the tutorial linked above for an example workflow. Note that mixed records are treated as indels.</li>
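The genotyping note above can be satisfied by running GenotypeGVCFs on a GVCF before recalibration; a minimal sketch with placeholder file names:

    # Produce a genotyped VCF from a HaplotypeCaller GVCF so that VariantRecalibrator
    # receives genotyped records rather than raw GVCF blocks.
    gatk GenotypeGVCFs \
      -R reference.fasta \
      -V input.g.vcf.gz \
      -O genotyped.vcf.gz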
    • diff --git a/src/main/java/org/broadinstitute/hellbender/utils/runtime/CapturedStreamOutput.java b/src/main/java/org/broadinstitute/hellbender/utils/runtime/CapturedStreamOutput.java index c2c565b8601..695cd487e99 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/runtime/CapturedStreamOutput.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/runtime/CapturedStreamOutput.java @@ -48,7 +48,7 @@ public CapturedStreamOutput(OutputStreamSettings settings, InputStream processSt } else { outputStream = new HardThresholdingOutputStream(bufferSize) { @Override - protected OutputStream getStream() { + protected OutputStream getOutputStream() { return bufferTruncated ? NullOutputStream.INSTANCE : bufferStream; } diff --git a/src/main/python/org/broadinstitute/hellbender/setup_vqsr_cnn.py b/src/main/python/org/broadinstitute/hellbender/setup_vqsr_cnn.py deleted file mode 100644 index b86d26e07df..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/setup_vqsr_cnn.py +++ /dev/null @@ -1,31 +0,0 @@ -import re -from distutils.core import setup - -def get_version_string(): - version_file = "vqsr_cnn/_version.py" - version_str_line = open(version_file, "rt").read() - version_regexp = r"^__version__ = ['\"]([^'\"]*)['\"]" - re_out = re.search(version_regexp, version_str_line, re.M) - if re_out is not None: - return re_out.group(1) - else: - raise RuntimeError("Unable to find version string in %s." % (version_file,)) - -setup(name='vqsr_cnn', - version=get_version_string(), - description='Variant quality score recalibration with Convolutional Neural Networks', - author='Sam Friedman', - author_email='sam@broadinstitute.org', - license='LICENSE.txt', - packages=['vqsr_cnn'], - install_requires=[ - "keras >= 2.0", - "numpy >= 1.13.1", - "scipy >= 0.19.1", - "pysam >= 0.13", - "scikit-learn >= 0.19.1", - "matplotlib >= 2.1.2", - "pyvcf >= 0.6.8", - "biopython >= 1.70" - ] -) diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/__init__.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/__init__.py deleted file mode 100644 index ece27f8dc1e..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from .vqsr_cnn.models import build_2d_annotation_model_from_args, build_1d_annotation_model_from_args -from .vqsr_cnn.models import build_default_1d_annotation_model, build_default_2d_annotation_model -from .vqsr_cnn.models import start_session_get_args_and_model, train_model_from_generators -from .vqsr_cnn.tensor_maps import get_tensor_channel_map_from_args, tensor_shape_from_args -from .vqsr_cnn.arguments import parse_args, weight_path_from_args, annotations_from_args -from .vqsr_cnn.inference import score_and_write_batch -from .vqsr_cnn.plots import plot_roc_per_class -from ._version import __version__ -from .vqsr_cnn.defines import * diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/_version.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/_version.py deleted file mode 100644 index 57d4bebe5ad..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/_version.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '0.0.29' diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/__init__.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/__init__.py deleted file mode 100644 index c89424a56ad..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from 
.models import build_2d_annotation_model_from_args, build_1d_annotation_model_from_args -from .models import build_default_1d_annotation_model, build_default_2d_annotation_model -from .models import start_session_get_args_and_model, train_model_from_generators -from .tensor_maps import get_tensor_channel_map_from_args, tensor_shape_from_args -from .arguments import parse_args, weight_path_from_args, annotations_from_args -from .inference import score_and_write_batch -from .plots import plot_roc_per_class -from .defines import * diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/arguments.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/arguments.py deleted file mode 100644 index ca537fd82ea..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/arguments.py +++ /dev/null @@ -1,224 +0,0 @@ -import argparse -import numpy as np - -import keras.backend as K - -from . import defines - - -def parse_args(): - """Parse command line arguments. - - The args namespace is used promiscuously in this module. - Its fields control the tensor definition, dataset generation, training, file I/O and evaluation. - Some of the fields are typically dicts or lists that are not actually set on the command line, - but via a companion argument also in the namespace. - For example, input_symbols is set via the input_symbol_set string - and, annotations is set via the annotation_set string. - Here we also seed the random number generator. - The keras image data format is set here as well via the channels_last or channels_first arguments. - - Returns: - namespace: The args namespace that is used throughout this module. - """ - parser = argparse.ArgumentParser() - - # Tensor defining arguments - parser.add_argument('--tensor_name', default='read_tensor', choices=defines.TENSOR_MAPS_1D+defines.TENSOR_MAPS_2D, - help='String key which identifies the map from tensor channels to their meaning.') - parser.add_argument('--labels', default=defines.SNP_INDEL_LABELS, - help='Dict mapping label names to their index within label tensors.') - parser.add_argument('--input_symbol_set', default='dna_indel', choices=defines.INPUT_SYMBOLS.keys(), - help='Key which maps to an input symbol to index mapping.') - parser.add_argument('--input_symbols', help='Dict mapping input symbols to their index within input tensors, ' - + 'initialised via input_symbols_set argument') - parser.add_argument('--batch_size', default=32, type=int, - help='Mini batch size for stochastic gradient descent algorithms.') - parser.add_argument('--read_limit', default=128, type=int, - help='Maximum number of reads to load.') - parser.add_argument('--window_size', default=128, type=int, - help='Size of sequence window to use as input, typically centered at a variant.') - parser.add_argument('--base_quality_mode', default='phot', choices=['phot', 'phred', '1hot'], - help='How to treat base qualities, must be in [phot, phred, 1hot]') - parser.add_argument('--channels_last', default=True, dest='channels_last', action='store_true', - help='Store the channels in the last axis of tensors, tensorflow->true, theano->false') - parser.add_argument('--channels_first', dest='channels_last', action='store_false', - help='Store the channels in the first axis of tensors, tensorflow->false, theano->true') - - # Annotation arguments - parser.add_argument('--annotations', help='Array of annotation names, initialised via annotation_set argument') - parser.add_argument('--annotation_set', 
default='best_practices', choices=defines.ANNOTATIONS_SETS.keys(), - help='Key which maps to an annotations list (or _ to ignore annotations).') - - # Dataset generation related arguments - parser.add_argument('--samples', default=500, type=int, - help='Maximum number of data samples to write or load.') - parser.add_argument('--downsample_snps', default=1.0, type=float, - help='Rate of SNP examples that are kept must be in [0.0, 1.0].') - parser.add_argument('--downsample_indels', default=1.0, type=float, - help='Rate of INDEL examples that are kept must be in [0.0, 1.0].') - parser.add_argument('--downsample_not_snps', default=1.0, type=float, - help='Rate of NOT_SNP examples that are kept must be in [0.0, 1.0].') - parser.add_argument('--downsample_not_indels', default=1.0, type=float, - help='Rate of NOT_INDEL examples that are kept must be in [0.0, 1.0].') - parser.add_argument('--downsample_reference', default=0.001, type=float, - help='Rate of reference genotype examples that are kept must be in [0.0, 1.0].') - parser.add_argument('--downsample_homozygous', default=0.001, type=float, - help='Rate of homozygous genotypes that are kept must be in [0.0, 1.0].') - parser.add_argument('--start_pos', default=0, type=int, - help='Genomic position start for parallel tensor writing.') - parser.add_argument('--end_pos', default=0, type=int, - help='Genomic position end for parallel tensor writing.') - parser.add_argument('--skip_positive_class', default=False, action='store_true', - help='Whether to skip positive examples when writing tensors.') - parser.add_argument('--chrom', help='Chromosome to load for parallel tensor writing.') - - - # I/O files and directories: vcfs, bams, beds, hd5, fasta - parser.add_argument('--output_dir', default='./', help='Directory to write models or other data out.') - parser.add_argument('--image_dir', default=None, help='Directory to write images and plots to.') - parser.add_argument('--reference_fasta', help='The reference FASTA file (e.g. 
HG19 or HG38).') - parser.add_argument('--weights_hd5', default='', - help='A hd5 file of weights to initialize a model, will use all layers with names that match.') - parser.add_argument('--architecture', default='', - help='A json file specifying semantics and architecture of a neural net.') - parser.add_argument('--bam_file', - help='Path to a BAM file to train from or generate tensors with.') - parser.add_argument('--train_vcf', - help='Path to a VCF that has verified true calls from NIST, platinum genomes, etc.') - parser.add_argument('--input_vcf', - help='Haplotype Caller or VQSR generated VCF with raw annotation values [and quality scores].') - parser.add_argument('--output_vcf', default=None, - help='Optional VCF to write to.') - parser.add_argument('--bed_file', - help='Bed file specifying high confidence intervals associated with args.train_vcf.') - parser.add_argument('--data_dir', - help='Directory of tensors, must be split into test/valid/train directories' - +'with subdirectories for each label.') - - # Training and optimization related arguments - parser.add_argument('--epochs', default=25, type=int, - help='Number of epochs, typically passes through the entire dataset, not always well-defined.') - parser.add_argument('--batch_normalization', default=False, action='store_true', - help='Mini batch normalization layers after convolutions.') - parser.add_argument('--patience', default=4, type=int, - help='Maximum number of epochs to run without validation loss improvements (Early Stopping).') - parser.add_argument('--training_steps', default=80, type=int, - help='Number of training batches to examine in an epoch.') - parser.add_argument('--validation_steps', default=40, type=int, - help='Number of validation batches to examine in an epoch validation.') - parser.add_argument('--iterations', default=5, type=int, - help='Generic iteration limit for hyperparameter optimization, animation, and other counts.') - parser.add_argument('--tensor_board', default=False, action='store_true', - help='Add the tensor board callback.') - - parser.add_argument("--optimizer_learning_rate", default=0.0001, type=float, - help="Learning rate for the Adam optimizer.") - parser.add_argument("--optimizer_beta_1", default=0.9, type=float, - help="Beta 1 parameter for the Adam optimizer.") - parser.add_argument("--optimizer_beta_2", default=0.999, type=float, - help="Beta 2 parameter for the Adam optimizer.") - parser.add_argument("--optimizer_epsilon", default=1e-08, type=float, - help="Epsilon parameter for the Adam optimizer.") - parser.add_argument("--optimizer_clipnorm", default=1.0, type=float, - help="Clipnorm parameter for the Adam optimizer.") - - # Architecture defining arguments - parser.add_argument('--conv_width', default=5, type=int, help='Width of convolutional kernels.') - parser.add_argument('--conv_height', default=5, type=int, help='Height of convolutional kernels.') - parser.add_argument('--conv_dropout', default=0.0, type=float, - help='Dropout rate in convolutional layers.') - parser.add_argument('--conv_batch_normalize', default=False, action='store_true', - help='Batch normalize convolutional layers.') - parser.add_argument('--conv_layers', nargs='+', default=[128, 96, 64, 48], type=int, - help='List of sizes for each convolutional filter layer') - parser.add_argument('--padding', default='valid', choices=['valid', 'same'], - help='Valid or same border padding for convolutional layers.') - parser.add_argument('--spatial_dropout', default=False, action='store_true', - 
help='Spatial dropout on the convolutional layers.') - parser.add_argument('--max_pools', nargs='+', default=[], type=int, - help='List of max-pooling layers.') - parser.add_argument('--fc_layers', nargs='+', default=[32], type=int, - help='List of sizes for each fully connected layer') - parser.add_argument('--fc_dropout', default=0.0, type=float, - help='Dropout rate in fully connected layers.') - parser.add_argument('--fc_batch_normalize', default=False, action='store_true', - help='Batch normalize fully connected layers.') - parser.add_argument('--annotation_units', default=16, type=int, - help='Number of units connected to the annotation input layer.') - parser.add_argument('--annotation_shortcut', default=False, action='store_true', - help='Shortcut connections on the annotations.') - - # Evaluation related arguments - parser.add_argument('--score_keys', nargs='+', default=['VQSLOD'], - help='List of variant score keys for performance comparisons.') - parser.add_argument('--tranches', nargs='+', default=[100, 99.9, 99, 95, 90], type=float, - help='List of variant score keys for performance comparisons.') - - # Run specific arguments - parser.add_argument('--mode', help='High level recipe: write tensors, train, test or evaluate models.') - parser.add_argument('--id', default='no_id', - help='Identifier for this run, user-defined string to keep experiments organized.') - parser.add_argument('--gatk_version', default='4.1.0.0', - help='GATK version used to run this code.') - parser.add_argument('--model_version', default='1.0', - help='Model version for this run.') - parser.add_argument('--random_seed', default=12878, type=int, - help='Random seed to use throughout run. Always use np.random.') - - # Parse, print, set annotations and seed - args = parser.parse_args() - args.annotations = annotations_from_args(args) - args.input_symbols = input_symbols_from_args(args) - np.random.seed(args.random_seed) - - if args.channels_last: - K.set_image_data_format('channels_last') - else: - K.set_image_data_format('channels_first') - - print('Arguments are', args) - return args - - -def annotations_from_args(args): - """Get list of annotations corresponding to the args.annotation_set. - - The annotation_set argument allows us to name commonly used groups of annotations - without having to specify each annotation individually. - - Arguments: - args.annotation_set: The key for which annotation set to use. - - Returns: - list: Annotation strings as they appear in a VCF info/format field or None. - """ - if args.annotation_set and args.annotation_set in defines.ANNOTATIONS_SETS: - return defines.ANNOTATIONS_SETS[args.annotation_set] - return None - - -def input_symbols_from_args(args): - """Get dictionary mapping input data symbols to indices in the input tensor. - - Arguments: - args.input_symbol_set: The key for the symbol set to use. - - Returns: - dict: if there is a input symbol dict otherwise None - """ - if args.input_symbol_set and args.input_symbol_set in defines.INPUT_SYMBOLS: - return defines.INPUT_SYMBOLS[args.input_symbol_set] - return None - - -def weight_path_from_args(args): - """Create a weight file name from the command line arguments. 
-
-    Arguments:
-        args.output_dir: The directory where the file will be saved
-        args.id: This run's id; the file name is the id with the tensor suffix as the file extension
-    """
-    save_weight_hd5 = args.output_dir + args.id + defines.TENSOR_SUFFIX
-    return save_weight_hd5
-
diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/defines.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/defines.py
deleted file mode 100644
index a410414f611..00000000000
--- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/defines.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# ~~~~~~~ Definitions ~~~~~~~~~~~~~
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-import re
-
-TENSOR_MAPS_2D = ['read_tensor']
-TENSOR_MAPS_1D = ['reference']
-TENSOR_SUFFIX = '.hd5'
-
-DNA_SYMBOLS = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
-INPUTS_INDEL = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '*': 4}
-INPUT_SYMBOLS = {
-    'dna': DNA_SYMBOLS,
-    'dna_indel': INPUTS_INDEL,
-}
-
-# When there are base-calling ambiguities these codes are used; the values indicate the probability of each base.
-# For example, the code K means there is a 50% chance the real base is G, a 50% chance it is T, and a 0% chance it is A or C.
-# See https://www.bioinformatics.org/sms/iupac.html
-AMBIGUITY_CODES = {
-    'K': [0, 0, 0.5, 0.5], 'M': [0.5, 0.5, 0, 0], 'R': [0.5, 0, 0, 0.5], 'Y': [0, 0.5, 0.5, 0], 'S': [0, 0.5, 0, 0.5],
-    'W': [0.5, 0, 0.5, 0], 'B': [0, 0.333, 0.333, 0.334], 'V': [0.333, 0.333, 0, 0.334], 'H': [0.333, 0.333, 0.334, 0],
-    'D': [0.333, 0, 0.333, 0.334], 'X': [0.25, 0.25, 0.25, 0.25], 'N': [0.25, 0.25, 0.25, 0.25]
-}
-
-
-# Named sets of annotations
-ANNOTATIONS_SETS = {
-    '_': [],  # Allow command line to unset annotations
-    'best_practices_w_qual': ['MQ', 'DP', 'SOR', 'FS', 'QD', 'MQRankSum', 'QUAL', 'ReadPosRankSum'],
-    'best_practices': ['MQ', 'DP', 'SOR', 'FS', 'QD', 'MQRankSum', 'ReadPosRankSum'],
-    'gatk': ['MQ', 'DP', 'SOR', 'FS', 'QD', 'MQRankSum', 'ReadPosRankSum'],
-    'annotations': ['MQ', 'DP', 'SOR', 'FS', 'QD', 'MQRankSum', 'ReadPosRankSum'],
-    'm2': ['AF', 'AD_0', 'AD_1', 'MBQ', 'MFRL_0', 'MFRL_1', 'MMQ', 'MPOS'],
-    'combine': ['MQ', 'DP', 'SOR', 'FS', 'QD', 'MQRankSum', 'ReadPosRankSum', 'AF', 'AD_0', 'AD_1', 'MBQ', 'MFRL_0', 'MFRL_1', 'MMQ', 'MPOS'],
-    'gnomad': ['MQ', 'DP', 'SOR', 'FS', 'QD', 'MQRankSum', 'ReadPosRankSum', 'DP_MEDIAN', 'DREF_MEDIAN', 'GQ_MEDIAN', 'AB_MEDIAN'],
-}
-
-SNP_INDEL_LABELS = {'NOT_SNP': 0, 'NOT_INDEL': 1, 'SNP': 2, 'INDEL': 3}
-
-CODE2CIGAR = 'MIDNSHP=XB'
-CIGAR2CODE = dict([y, x] for x, y in enumerate(CODE2CIGAR))
-CIGAR_CODE = {'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4}
-CIGAR_REGEX = re.compile("(\d+)([MIDNSHP=XB])")
-
-SKIP_CHAR = '~'
-INDEL_CHAR = '*'
-
-MAPPING_QUALITY_MAX = 60.0  # Mapping qualities from BWA are typically capped at 60
-READ_FLAGS = 12  # Total number of read flags; the actual flags used are determined by the tensor map
-
-# The following constants correspond to constants set in CNNScoreVariants.java and must be kept in sync.
-DATA_VALUE_SEPARATOR = ',' # If changed make change in CNNScoreVariants.java -DATA_TYPE_SEPARATOR = '\t' # If changed make change in CNNScoreVariants.java -ANNOTATION_SEPARATOR = ';' # If changed make change in CNNScoreVariants.java -ANNOTATION_SET_STRING = '=' # If changed make change in CNNScoreVariants.java diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/inference.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/inference.py deleted file mode 100644 index 0adf9af9e53..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/inference.py +++ /dev/null @@ -1,495 +0,0 @@ -# Imports -import os -import math -import h5py -import numpy as np -from collections import namedtuple -from typing import List, Tuple, Dict, TextIO - -from gatktool import tool - -# Keras Imports -import keras -import keras.backend as K - -# Package Imports -from . import defines -from . import tensor_maps - -Variant = namedtuple("Variant", "contig pos ref alt type") -Read = namedtuple("Read", "seq qual cigar reverse mate_reverse first mapping_quality reference_start") - -READ_BASES_FIFO_INDEX = 0 -READ_QUAL_FIFO_INDEX = 1 -READ_CIGAR_FIFO_INDEX = 2 -READ_REVERSE_FIFO_INDEX = 3 -READ_MATE_REVERSE_FIFO_INDEX = 4 -READ_FIRST_IN_PAIR_FIFO_INDEX = 5 -READ_MQ_FIFO_INDEX = 6 -READ_REF_START_FIFO_INDEX = 7 -READ_ELEMENTS = 8 # The number of fields of the namedtuple defined above - -CONTIG_FIFO_INDEX = 0 -POS_FIFO_INDEX = 1 -REF_FIFO_INDEX = 2 -ALT_FIFO_INDEX = 3 -REF_STRING_FIFO_INDEX = 4 -ANNOTATION_FIFO_INDEX = 5 -VARIANT_TYPE_FIFO_INDEX = 6 -VARIANT_FIFO_FIELDS = 7 - - - -CIGAR_CODES_TO_COUNT = [ - defines.CIGAR_CODE['M'], defines.CIGAR_CODE['I'], defines.CIGAR_CODE['S'], defines.CIGAR_CODE['D'] -] - -p_lut = np.zeros((256,)) -not_p_lut = np.zeros((256,)) - -for i in range(256): - exponent = float(-i) / 10.0 - p_lut[i] = 1.0 - (10.0**exponent) - not_p_lut[i] = (1.0 - p_lut[i]) / 3.0 - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ~~~~~~~ Inference ~~~~~~~~~~~~~~~ -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -def score_and_write_batch(model: keras.Model, - file_out: TextIO, - batch_size: int, - python_batch_size: int, - tensor_type: str, - annotation_set: str, - window_size: int, - read_limit: int, - tensor_dir: str = '') -> None: - """Score a batch of variants with a CNN model. Write tab delimited temp file with scores. - - This function is tightly coupled with the CNNScoreVariants.java - It requires data written to the fifo in the order given by transferToPythonViaFifo - - Arguments - model: a keras model - file_out: The temporary VCF-like file where variants scores will be written - batch_size: The total number of variants available in the fifo - python_batch_size: the number of variants to process in each inference - tensor_type: The name for the type of tensor to make - annotation_set: The name for the set of annotations to use - window_size: The size of the context window of genomic bases, i.e the width of the tensor - read_limit: The maximum number of reads to encode in a tensor, i.e. 
the height of the tensor - tensor_dir : If this path exists write hd5 files for each tensor (optional for debugging) - """ - annotation_batch = [] - reference_batch = [] - variant_types = [] - variant_data = [] - read_batch = [] - for _ in range(batch_size): - fifo_line = tool.readDataFIFO() - fifo_data = fifo_line.split(defines.DATA_TYPE_SEPARATOR) - - variant_data.append(fifo_data[CONTIG_FIFO_INDEX] + defines.DATA_TYPE_SEPARATOR - + fifo_data[POS_FIFO_INDEX] + defines.DATA_TYPE_SEPARATOR - + fifo_data[REF_FIFO_INDEX] + defines.DATA_TYPE_SEPARATOR + fifo_data[ALT_FIFO_INDEX]) - reference_batch.append(reference_string_to_tensor(fifo_data[REF_STRING_FIFO_INDEX])) - annotation_batch.append(annotation_string_to_tensor(annotation_set, fifo_data[ANNOTATION_FIFO_INDEX])) - variant_types.append(fifo_data[VARIANT_TYPE_FIFO_INDEX].strip()) - - fifo_idx = VARIANT_FIFO_FIELDS - if tensor_type in defines.TENSOR_MAPS_2D and len(fifo_data) > fifo_idx: - read_tuples = [] - var = Variant(fifo_data[CONTIG_FIFO_INDEX], int(fifo_data[POS_FIFO_INDEX]), fifo_data[POS_FIFO_INDEX], - fifo_data[ALT_FIFO_INDEX], fifo_data[VARIANT_TYPE_FIFO_INDEX]) - while fifo_idx+READ_ELEMENTS <= len(fifo_data): - read_tuples.append( - Read(fifo_data[fifo_idx + READ_BASES_FIFO_INDEX], - list(map(int, fifo_data[fifo_idx+READ_QUAL_FIFO_INDEX].split(defines.DATA_VALUE_SEPARATOR))), - fifo_data[fifo_idx+READ_CIGAR_FIFO_INDEX], - bool_from_java(fifo_data[fifo_idx+READ_REVERSE_FIFO_INDEX]), - bool_from_java(fifo_data[fifo_idx+READ_MATE_REVERSE_FIFO_INDEX]), - bool_from_java(fifo_data[fifo_idx+READ_FIRST_IN_PAIR_FIFO_INDEX]), - int(fifo_data[fifo_idx+READ_MQ_FIFO_INDEX]), - int(fifo_data[fifo_idx+READ_REF_START_FIFO_INDEX]))) - fifo_idx += READ_ELEMENTS - _, ref_start, _ = get_variant_window(window_size, var) - insert_dict = get_inserts(read_tuples, var, window_size) - tensor = read_tuples_to_tensor(read_tuples, ref_start, insert_dict, tensor_type, window_size, read_limit) - reference_sequence_into_tensor(fifo_data[4], tensor, insert_dict, window_size, read_limit) - if os.path.exists(tensor_dir): - _write_tensor_to_hd5(tensor, annotation_batch[-1], fifo_data[0], fifo_data[1], fifo_data[6], - tensor_type, annotation_set, tensor_dir) - read_batch.append(tensor) - - if tensor_type in defines.TENSOR_MAPS_1D: - predictions = model.predict([np.array(reference_batch), np.array(annotation_batch)], - batch_size=python_batch_size) - elif tensor_type in defines.TENSOR_MAPS_2D: - predictions = model.predict( - [np.array(read_batch), np.array(annotation_batch)], batch_size=python_batch_size) - else: - raise ValueError('Unknown tensor mapping. 
Check architecture file.', tensor_type) - - indel_scores = predictions_to_indel_scores(predictions) - snp_scores = predictions_to_snp_scores(predictions) - - for i in range(batch_size): - if 'SNP' == variant_types[i]: - file_out.write(variant_data[i] + defines.DATA_TYPE_SEPARATOR + '{0:.3f}'.format(snp_scores[i]) + '\n') - elif 'INDEL' == variant_types[i]: - file_out.write(variant_data[i] + defines.DATA_TYPE_SEPARATOR + '{0:.3f}'.format(indel_scores[i]) + '\n') - else: - file_out.write(variant_data[i] + defines.DATA_TYPE_SEPARATOR - + '{0:.3f}'.format(max(snp_scores[i], indel_scores[i])) + '\n') - - -def reference_string_to_tensor(reference: str) -> np.ndarray: - dna_data = np.zeros((len(reference), len(defines.DNA_SYMBOLS))) - for i,b in enumerate(reference): - if b in defines.DNA_SYMBOLS: - dna_data[i, defines.DNA_SYMBOLS[b]] = 1.0 - elif b in defines.AMBIGUITY_CODES: - dna_data[i] = defines.AMBIGUITY_CODES[b] - elif b == '\x00': - break - else: - raise ValueError('Error! Unknown code:', b) - return dna_data - - -def annotation_string_to_tensor(annotation_set: str, annotation_string: str) -> np.ndarray: - name_val_pairs = annotation_string.split(defines.ANNOTATION_SEPARATOR) - annotation_names = annotation_set.split(defines.DATA_VALUE_SEPARATOR) - name_val_arrays = [p.split(defines.ANNOTATION_SET_STRING) for p in name_val_pairs] - annotation_map = {str(p[0]).strip(): p[1] for p in name_val_arrays if len(p) > 1} - annotation_data = np.zeros((len(annotation_names),)) - for ii, a in enumerate(annotation_names): - if a in annotation_map and not math.isnan(float(annotation_map[a])): - annotation_data[ii] = annotation_map[a] - - return annotation_data - - -def get_inserts(read_tuples: List[Read], variant: Variant, window_size: int, sort_by: str='base') -> Dict: - """A dictionary mapping insertions to reference positions. - - Ignores artificial haplotype read group. 
-    Relies on pysam's cigartuples structure; see: http://pysam.readthedocs.io/en/latest/api.html
-        Match, M -> 0
-        Insert, I -> 1
-        Deletion, D -> 2
-        Ref Skip, N -> 3
-        Soft Clip, S -> 4
-
-    Arguments:
-        read_tuples: list of aligned read tuples to find insertions within
-        variant: the variant around which reads are loaded
-        window_size: The size of the context window of genomic bases, i.e. the width of the tensor
-        sort_by: sort reads at the variant by base or reference start
-
-    Returns:
-        insert_dict: a dict mapping read indices to max insertions at that point
-    """
-    insert_dict = {}
-
-    idx_offset, ref_start, ref_end = get_variant_window(window_size, variant)
-
-    for read in read_tuples:
-        index_dif = ref_start - read.reference_start
-        if abs(index_dif) >= window_size:
-            continue
-
-        if 'I' in read.cigar:
-            cur_idx = 0
-            for t in cigar_string_to_tuples(read.cigar):
-                if t[0] == defines.CIGAR_CODE['I']:
-                    insert_idx = cur_idx - index_dif
-                    if insert_idx not in insert_dict:
-                        insert_dict[insert_idx] = t[1]
-                    elif insert_dict[insert_idx] < t[1]:
-                        insert_dict[insert_idx] = t[1]
-
-                if t[0] in CIGAR_CODES_TO_COUNT:
-                    cur_idx += t[1]
-
-    read_tuples.sort(key=lambda r: r.reference_start)
-    if sort_by == 'base':
-        read_tuples.sort(key=lambda r: get_base_to_sort_by(r, variant))
-
-    return insert_dict
-
-
-def get_base_to_sort_by(read: Read, variant: Variant) -> str:
-    if len(read.seq) > 0:
-        max_idx = len(read.seq)-1
-    else:
-        return 'Z'
-
-    if variant.type == 'SNP':
-        return read.seq[clamp((variant.pos-read.reference_start), 0, max_idx)]
-    else:
-        var_idx = (variant.pos-read.reference_start)+1
-        cur_idx = 0
-        for cur_op, length in cigar_string_to_tuples(read.cigar):
-            cur_idx += length
-            if cur_idx > var_idx:
-                if cur_op == defines.CIGAR_CODE['M']:
-                    return read.seq[clamp(var_idx, 0, max_idx)]
-                else:
-                    return defines.CODE2CIGAR[cur_op]
-        return 'Y'
-
-
-def cigar_string_to_tuples(cigar: str) -> List[Tuple]:
-    if not cigar or len(cigar) == 0:
-        return []
-    parts = defines.CIGAR_REGEX.findall(cigar)
-    # swap the (length, op) regex groups into (op_code, length) tuples, matching pysam's cigartuples order
-    return [(defines.CIGAR2CODE[y], int(x)) for x,y in parts]
-
-
-def get_variant_window(window_size: int, variant: Variant) -> Tuple:
-    index_offset = (window_size//2)
-    reference_start = variant.pos-index_offset
-    reference_end = variant.pos + index_offset + (window_size % 2)
-    return index_offset, reference_start, reference_end
-
-
-def bool_from_java(val: str) -> bool:
-    return val == 'true'
-
-
-def clamp(n: int, minn: int, maxn: int) -> int:
-    return max(min(maxn, n), minn)
-
-
-def read_tuples_to_tensor(read_tuples: List[Read],
-                          ref_start: int,
-                          insert_dict: Dict,
-                          tensor_type: str,
-                          window_size: int,
-                          read_limit: int,
-                          base_quality_mode: str='phot') -> np.ndarray:
-    """Create a read tensor based on a tensor channel map.
-
-    Assumes read pairs have the same name.
-    Only loads reads that might align inside the tensor.
-
-    Arguments:
-        read_tuples: list of reads to make into a tensor
-        ref_start: the beginning of the window in reference coordinates
-        insert_dict: a dict mapping read indices to max insertions at that point.
-        tensor_type: The name for the type of tensor to make
-        window_size: The size of the context window of genomic bases, i.e. the width of the tensor
-        read_limit: The maximum number of reads to encode in a tensor, i.e. the height of the tensor
-        base_quality_mode: How to encode qualities in the tensor (phot, 1hot or phred)
-
-    Returns:
-        tensor: 3D read tensor.
- """ - channel_map = tensor_maps.get_tensor_channel_map_from_tensor_type(tensor_type) - tensor = np.zeros(tensor_maps.tensor_shape_from_tensor_type(tensor_type, window_size, read_limit)) - - if len(read_tuples) > read_limit: - read_tuples_idx = np.random.choice(range(len(read_tuples)), size=read_limit, replace=False) - read_tuples = [read_tuples[ii] for ii in read_tuples_idx] - - for j, read in enumerate(read_tuples): - rseq, rqual = sequence_and_qualities_from_read(read, ref_start, insert_dict, window_size) - flag_start = -1 - flag_end = 0 - - for ii, b in enumerate(rseq): - - if ii == window_size: - break - - if b == defines.SKIP_CHAR: - continue - elif flag_start == -1: - flag_start = ii - else: - flag_end = ii - - if b in defines.INPUTS_INDEL: - if b == defines.INDEL_CHAR: - if K.image_data_format() == 'channels_last': - tensor[j, ii, defines.INPUTS_INDEL[b]] = 1.0 - else: - tensor[defines.INPUTS_INDEL[b], j, ii] = 1.0 - else: - hot_array = quality_from_mode(rqual[ii], b, defines.INPUTS_INDEL, base_quality_mode) - if K.image_data_format() == 'channels_last': - tensor[j, ii, :4] = hot_array - else: - tensor[:4, j, ii] = hot_array - elif b in defines.AMBIGUITY_CODES: - if K.image_data_format() == 'channels_last': - tensor[j, ii, :4] = defines.AMBIGUITY_CODES[b] - else: - tensor[:4, j, ii] = defines.AMBIGUITY_CODES[b] - else: - raise ValueError('Unknown symbol in seq block:', b) - - if K.image_data_format() == 'channels_last': - tensor[j, flag_start:flag_end, channel_map['flag_bit_4']] = 1.0 if read.reverse else 0.0 - tensor[j, flag_start:flag_end, channel_map['flag_bit_5']] = 1.0 if read.mate_reverse else 0.0 - tensor[j, flag_start:flag_end, channel_map['flag_bit_6']] = 1.0 if read.first else 0.0 - tensor[j, flag_start:flag_end, channel_map['flag_bit_7']] = 0.0 if read.first else 1.0 - else: - tensor[channel_map['flag_bit_4'], j, flag_start:flag_end] = 1.0 if read.reverse else 0.0 - tensor[channel_map['flag_bit_5'], j, flag_start:flag_end] = 1.0 if read.mate_reverse else 0.0 - tensor[channel_map['flag_bit_6'], j, flag_start:flag_end] = 1.0 if read.first else 0.0 - tensor[channel_map['flag_bit_7'], j, flag_start:flag_end] = 0.0 if read.first else 1.0 - - if 'mapping_quality' in channel_map: - mq = float(read.mapping_quality) / defines.MAPPING_QUALITY_MAX - if K.image_data_format() == 'channels_last': - tensor[j, flag_start:flag_end, channel_map['mapping_quality']] = mq - else: - tensor[channel_map['mapping_quality'], j, flag_start:flag_end] = mq - - return tensor - - -def sequence_and_qualities_from_read(read: Read, ref_start: int, insert_dict: Dict, window_size: int) -> Tuple: - cur_idx = 0 - my_indel_dict = {} - no_qual_filler = 0 - - index_dif = ref_start - read.reference_start - for t in cigar_string_to_tuples(read.cigar): - my_ref_idx = cur_idx - index_dif - if t[0] == defines.CIGAR_CODE['I'] and my_ref_idx in insert_dict: - my_indel_dict[my_ref_idx] = insert_dict[my_ref_idx] - t[1] - elif t[0] == defines.CIGAR_CODE['D']: - my_indel_dict[my_ref_idx] = t[1] - if t[0] in CIGAR_CODES_TO_COUNT: - cur_idx += t[1] - - for k in insert_dict.keys(): - if k not in my_indel_dict: - my_indel_dict[k] = insert_dict[k] - - rseq = read.seq[:window_size] - rqual = read.qual[:window_size] - - if index_dif > 0: - rseq = rseq[index_dif:] - rqual = rqual[index_dif:] - elif index_dif < 0: - rseq = defines.SKIP_CHAR * (-index_dif) + rseq - rqual = [no_qual_filler]*(-index_dif) + rqual - - for j in sorted(my_indel_dict.keys(), key=int, reverse=True): - if j < 1: - rseq = (defines.INDEL_CHAR * 
my_indel_dict[j]) + rseq - rqual = ([no_qual_filler]*my_indel_dict[j]) + rqual - else: - rseq = rseq[:j] + (defines.INDEL_CHAR * my_indel_dict[j]) + rseq[j:] - rqual = rqual[:j] + ([no_qual_filler]*my_indel_dict[j]) + rqual[j:] - - return rseq, rqual - - -def reference_sequence_into_tensor(reference_seq: str, - tensor: np.ndarray, - insert_dict: Dict, - window_size: int, - read_limit: int): - ref_offset = len(defines.INPUTS_INDEL) - - for ii in sorted(insert_dict.keys(), key=int, reverse=True): - if ii < 0: - reference_seq = defines.INDEL_CHAR*insert_dict[ii] + reference_seq - else: - reference_seq = reference_seq[:ii] + defines.INDEL_CHAR*insert_dict[ii] + reference_seq[ii:] - - for ii,b in enumerate(reference_seq): - if ii == window_size: - break - - if b in defines.INPUTS_INDEL: - if K.image_data_format() == 'channels_last': - tensor[:, ii, ref_offset+defines.INPUTS_INDEL[b]] = 1.0 - else: - tensor[ref_offset+defines.INPUTS_INDEL[b], :, ii] = 1.0 - elif b in defines.AMBIGUITY_CODES: - if K.image_data_format() == 'channels_last': - tensor[:, ii, ref_offset:ref_offset+4] = np.tile(defines.AMBIGUITY_CODES[b], (read_limit, 1)) - else: - tensor[ref_offset:ref_offset+4, :, ii] = np.transpose( - np.tile(defines.AMBIGUITY_CODES[b], (read_limit, 1))) - - -def base_quality_to_phred_array(base_quality: int, base: str, base_dict: Dict) -> np.ndarray: - phred = np.zeros((4,)) - exponent = float(-base_quality) / 10.0 - p = 1.0-(10.0**exponent) # Convert to probability - not_p = (1.0-p) / 3.0 # Error could be any of the other 3 bases - not_base_quality = -10 * np.log10(not_p) # Back to Phred - - for b in base_dict.keys(): - if b == defines.INDEL_CHAR: - continue - elif b == base: - phred[base_dict[b]] = base_quality - else: - phred[base_dict[b]] = not_base_quality - return phred - - -def base_quality_to_p_hot_array(base_quality: int, base: str, base_dict: Dict) -> np.ndarray: - not_p = not_p_lut[base_quality] - phot = [not_p, not_p, not_p, not_p] - phot[base_dict[base]] = p_lut[base_quality] - - return phot - - -def quality_from_mode(base_quality: int, base: str, base_dict: Dict, base_quality_mode: str) -> np.ndarray: - if base_quality_mode == 'phot': - return base_quality_to_p_hot_array(base_quality, base, base_dict) - elif base_quality_mode == 'phred': - return base_quality_to_phred_array(base_quality, base, base_dict) - elif base_quality_mode == '1hot': - one_hot = np.zeros((4,)) - one_hot[base_dict[base]] = 1.0 - return one_hot - else: - raise ValueError('Unknown base quality mode:', base_quality_mode) - - -def predictions_to_snp_scores(predictions: np.ndarray, eps: float=1e-7) -> np.ndarray: - snp = predictions[:, defines.SNP_INDEL_LABELS['SNP']] - not_snp = predictions[:, defines.SNP_INDEL_LABELS['NOT_SNP']] - return np.log(eps + snp / (not_snp + eps)) - - -def predictions_to_indel_scores(predictions: np.ndarray, eps: float=1e-7) -> np.ndarray: - indel = predictions[:, defines.SNP_INDEL_LABELS['INDEL']] - not_indel = predictions[:, defines.SNP_INDEL_LABELS['NOT_INDEL']] - return np.log(eps + indel / (not_indel + eps)) - - -def predictions_to_snp_indel_scores(predictions: np.ndarray) -> Tuple: - snp_dict = predictions_to_snp_scores(predictions) - indel_dict = predictions_to_indel_scores(predictions) - return snp_dict, indel_dict - - -def _write_tensor_to_hd5(tensor: np.ndarray, - annotations: np.ndarray, - contig: str, - pos: str, - variant_type: str, - tensor_type: str, - annotation_set: str, - output_dir: str,) -> None: - tensor_path = os.path.join(output_dir, 
'inference_tensor_'+contig+pos+variant_type+defines.TENSOR_SUFFIX) - if not os.path.exists(os.path.dirname(tensor_path)): - os.makedirs(os.path.dirname(tensor_path)) - with h5py.File(tensor_path, 'w') as hf: - hf.create_dataset(tensor_type, data=tensor, compression='gzip') - hf.create_dataset(annotation_set, data=annotations, compression='gzip') diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/models.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/models.py deleted file mode 100644 index 379835f87d6..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/models.py +++ /dev/null @@ -1,616 +0,0 @@ -import os -import json - -# Keras Imports -from keras import layers -from keras import metrics -import keras.backend as K -from keras.optimizers import Adam -from keras.models import Model, load_model -from keras.layers.convolutional import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D -from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard, ReduceLROnPlateau -from keras.layers import Input, Dense, Dropout, BatchNormalization, SpatialDropout1D, SpatialDropout2D, Activation, Flatten, AlphaDropout - -from . import plots -from . import defines -from . import arguments -from . import tensor_maps - - -def start_session_get_args_and_model(intra_ops, inter_ops, semantics_json, weights_hd5=None, tensor_type=None): - K.clear_session() - K.get_session().close() - cfg = K.tf.ConfigProto(intra_op_parallelism_threads=intra_ops, inter_op_parallelism_threads=inter_ops) - cfg.gpu_options.allow_growth = True - K.set_session(K.tf.Session(config=cfg)) - return args_and_model_from_semantics(semantics_json, weights_hd5, tensor_type) - - -def args_and_model_from_semantics(semantics_json, weights_hd5=None, tensor_type=None): - args = arguments.parse_args() - - if semantics_json is not None and os.path.exists(semantics_json): - model = set_args_and_get_model_from_semantics(args, semantics_json, weights_hd5) - else: - model = load_model(weights_hd5, custom_objects=get_metric_dict(args.labels)) - args.tensor_name = tensor_type - - return args, model - - -def set_args_and_get_model_from_semantics(args, semantics_json, weights_hd5=None): - """Recreate a model from a json file specifying model semantics. - - Update the args namespace from the semantics file values. - Assert that the serialized tensor map and the recreated one are the same. 
- - Arguments: - args.tensor_name: String which indicates tensor map to use or None - args.window_size: sites included in the tensor map - args.read_limit: Maximum reads included in the tensor map - args.annotations: List of annotations or None - semantics_json: Semantics json file (created with serialize_model_semantics()) - - Returns: - The Keras model - """ - with open(semantics_json, 'r') as infile: - semantics = json.load(infile) - - if 'model_version' in semantics: - assert(args.model_version == semantics['model_version']) - - if 'input_tensor_map' in semantics: - args.tensor_name = semantics['input_tensor_map_name'] - args.window_size = semantics['window_size'] - args.read_limit = semantics['read_limit'] - tm = tensor_maps.get_tensor_channel_map_from_args(args) - assert(len(tm) == len(semantics['input_tensor_map'])) - for key in tm: - assert(tm[key] == semantics['input_tensor_map'][key]) - - if 'input_annotations' in semantics: - args.annotations = semantics['input_annotations'] - args.annotation_set = semantics['input_annotation_set'] - - args.input_symbols = semantics['input_symbols'] - args.labels = semantics['output_labels'] - - if 'channels_last' in semantics: - args.channels_last = semantics['channels_last'] - if args.channels_last: - K.set_image_data_format('channels_last') - else: - K.set_image_data_format('channels_first') - - if weights_hd5 is None: - weights_hd5 = os.path.join(os.path.dirname(semantics_json), semantics['architecture']) - - print('Updated arguments:', args, '\nWeight file from:', weights_hd5) - model = load_model(weights_hd5, custom_objects=get_metric_dict(args.labels)) - model.summary() - return model - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ~~~~~~~ Models ~~~~~~~~~~~~~~~~~~ -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -def build_default_1d_annotation_model(args): - return build_reference_annotation_1d_model_from_args(args, - conv_width=7, - conv_layers=[256, 216, 128, 64, 32], - conv_dropout=0.1, - conv_batch_normalize=True, - spatial_dropout=True, - max_pools=[], - padding='same', - annotation_units=64, - annotation_shortcut=True, - fc_layers=[64, 64], - fc_dropout=0.2, - annotation_batch_normalize=True, - fc_batch_normalize=False) - - -def build_1d_annotation_model_from_args(args): - return build_reference_annotation_1d_model_from_args(args, - conv_width=args.conv_width, - conv_layers=args.conv_layers, - conv_dropout=args.conv_dropout, - conv_batch_normalize=args.conv_batch_normalize, - spatial_dropout=args.spatial_dropout, - max_pools=args.max_pools, - padding=args.padding, - annotation_units=args.annotation_units, - annotation_shortcut=args.annotation_shortcut, - fc_layers=args.fc_layers, - fc_dropout=args.fc_dropout, - fc_batch_normalize=args.fc_batch_normalize) - - -def build_2d_annotation_model_from_args(args): - return read_tensor_2d_annotation_model_from_args(args, - conv_width = args.conv_width, - conv_height = args.conv_height, - conv_layers = args.conv_layers, - conv_dropout = args.conv_dropout, - conv_batch_normalize = args.conv_batch_normalize, - spatial_dropout = args.spatial_dropout, - max_pools = args.max_pools, - padding = args.padding, - annotation_units = args.annotation_units, - annotation_shortcut = args.annotation_shortcut, - fc_layers = args.fc_layers, - fc_dropout = args.fc_dropout, - fc_batch_normalize = args.fc_batch_normalize) - - -def build_default_2d_annotation_model(args): - return read_tensor_2d_annotation_model_from_args(args, - conv_width = 25, - conv_height = 25, - conv_layers = [64, 48, 32, 24], - conv_dropout = 0.1, - 
conv_batch_normalize = False, - spatial_dropout = True, - max_pools = [(3,1),(3,1)], - padding='valid', - annotation_units = 64, - annotation_shortcut = False, - fc_layers = [24], - fc_dropout = 0.3, - fc_batch_normalize = False) - - -def read_tensor_2d_annotation_model_from_args(args, - conv_width = 6, - conv_height = 6, - conv_layers = [128, 128, 128, 128], - conv_dropout = 0.0, - conv_batch_normalize = False, - spatial_dropout = True, - residual_layers = [], - max_pools = [(3,1), (3,3)], - padding='valid', - annotation_units = 16, - annotation_shortcut = False, - annotation_batch_normalize = True, - fc_layers = [64], - fc_dropout = 0.0, - fc_batch_normalize = False, - kernel_initializer='glorot_normal', - kernel_single_channel=True, - fc_initializer='glorot_normal'): - '''Builds Read Tensor 2d CNN model with variant annotations mixed in for classifying variants. - - Arguments specify widths and depths of each layer. - 2d Convolutions followed by dense connection mixed with annotation values. - Dynamically sets input channels based on args via defines.total_input_channels_from_args(args) - Uses the functional API. Supports theano or tensorflow channel ordering. - Prints out model summary. - - Arguments - args.window_size: Length in base-pairs of sequence centered at the variant to use as input. - args.labels: The output labels (e.g. SNP, NOT_SNP, INDEL, NOT_INDEL) - args.weights_hd5: An existing model file to load weights from - args.channels_last: Theano->False or Tensorflow->True channel ordering flag - conv_layers: list of number of convolutional filters in each layer - batch_normalization: Boolean whether to apply batch normalization or not - Returns - The keras model - ''' - in_channels = tensor_maps.total_input_channels_from_args(args) - - if K.image_data_format() == 'channels_last': - in_shape = (args.read_limit, args.window_size, in_channels) - concat_axis = -1 - else: - in_shape = (in_channels, args.read_limit, args.window_size) - concat_axis = 1 - - x = read_tensor_in = Input(shape=in_shape, name=args.tensor_name) - - max_pool_diff = max(0, len(conv_layers)-len(max_pools)) - - # Add convolutional layers - for i,f in enumerate(conv_layers): - if kernel_single_channel and i%2 == 0: - cur_kernel = (conv_width, 1) - elif kernel_single_channel: - cur_kernel = (1, conv_height) - else: - cur_kernel = (conv_width, conv_height) - - if conv_batch_normalize: - x = Conv2D(int(f), cur_kernel, activation='linear', padding=padding, kernel_initializer=kernel_initializer)(x) - x = BatchNormalization(axis=concat_axis)(x) - x = Activation('relu')(x) - else: - x = Conv2D(int(f), cur_kernel, activation='relu', padding=padding, kernel_initializer=kernel_initializer)(x) - - if conv_dropout > 0 and spatial_dropout: - x = SpatialDropout2D(conv_dropout)(x) - elif conv_dropout > 0: - x = Dropout(conv_dropout)(x) - - if i >= max_pool_diff: - x = MaxPooling2D(max_pools[i-max_pool_diff])(x) - - for i,r in enumerate(residual_layers): - if kernel_single_channel and i%2 == 0: - cur_kernel = (conv_width, 1) - elif kernel_single_channel: - cur_kernel = (1, conv_height) - else: - cur_kernel = (conv_width, conv_height) - - y = Conv2D(r.filters[0], (1, 1), strides=r.strides)(x) - y = BatchNormalization(axis=concat_axis)(y) - y = Activation('relu')(y) - - y = Conv2D(r.filters[1], cur_kernel, padding='same')(y) - y = BatchNormalization(axis=concat_axis)(y) - y = Activation('relu')(y) - - y = Conv2D(r.filters[2], (1, 1))(y) - y = BatchNormalization(axis=concat_axis)(y) - - if r.identity: - x = layers.add([y, x]) - 
else: - shortcut = Conv2D(r.filters[2], (1, 1), strides=r.strides)(x) - shortcut = BatchNormalization(axis=concat_axis)(shortcut) - x = layers.add([y, shortcut]) - - x = Activation('relu')(x) - - x = Flatten()(x) - - # Mix the variant annotations in - annotations = annotations_in = Input(shape=(len(args.annotations),), name=args.annotation_set) - if annotation_batch_normalize: - annotations_in = BatchNormalization(axis=-1)(annotations) - - annotations_mlp = Dense(units=annotation_units, kernel_initializer=fc_initializer, activation='relu')(annotations_in) - x = layers.concatenate([x, annotations_mlp], axis=concat_axis) - - # Fully connected layers - for fc_units in fc_layers: - - if fc_batch_normalize: - x = Dense(units=fc_units, kernel_initializer=fc_initializer, activation='linear')(x) - x = BatchNormalization(axis=1)(x) - x = Activation('relu')(x) - else: - x = Dense(units=fc_units, kernel_initializer=fc_initializer, activation='relu')(x) - - if fc_dropout > 0: - x = Dropout(fc_dropout)(x) - - if annotation_shortcut: - x = layers.concatenate([x, annotations_in], axis=concat_axis) - - # Softmax output - prob_output = Dense(units=len(args.labels), kernel_initializer=fc_initializer, activation='softmax')(x) - - # Map inputs to outputs - model = Model(inputs=[read_tensor_in, annotations], outputs=[prob_output]) - - # adamo = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clipnorm=1.) - adamo = Adam(lr=args.optimizer_learning_rate, beta_1=args.optimizer_beta_1, - beta_2=args.optimizer_beta_1, epsilon=args.optimizer_epsilon, clipnorm=args.optimizer_clipnorm) - model.compile(loss='categorical_crossentropy', optimizer=adamo, metrics=get_metrics(args.labels)) - model.summary() - - if os.path.exists(args.weights_hd5): - model.load_weights(args.weights_hd5, by_name=True) - print('Loaded model weights from:', args.weights_hd5) - - return model - - -def build_reference_annotation_1d_model_from_args(args, - conv_width = 6, - conv_layers = [128, 128, 128, 128], - conv_dropout = 0.0, - conv_batch_normalize = False, - spatial_dropout = True, - max_pools = [], - padding='valid', - activation = 'relu', - annotation_units = 16, - annotation_shortcut = False, - annotation_batch_normalize = True, - fc_layers = [64], - fc_dropout = 0.0, - fc_batch_normalize = False, - fc_initializer = 'glorot_normal', - kernel_initializer = 'glorot_normal', - alpha_dropout = False - ): - '''Build Reference 1d CNN model for classifying variants. - - Architecture specified by parameters. - Dynamically sets input channels based on args via defines.total_input_channels_from_args(args) - Uses the functional API. - Prints out model summary. - - Arguments - args.annotations: The variant annotations, perhaps from a HaplotypeCaller VCF. - args.labels: The output labels (e.g. 
SNP, NOT_SNP, INDEL, NOT_INDEL) - - Returns - The keras model - ''' - in_channels = tensor_maps.total_input_channels_from_args(args) - concat_axis = -1 - x = reference = Input(shape=(args.window_size, in_channels), name=args.tensor_name) - - max_pool_diff = len(conv_layers)-len(max_pools) - for i,c in enumerate(conv_layers): - - if conv_batch_normalize: - x = Conv1D(filters=c, kernel_size=conv_width, activation='linear', padding=padding, kernel_initializer=kernel_initializer)(x) - x = BatchNormalization(axis=concat_axis)(x) - x = Activation(activation)(x) - else: - x = Conv1D(filters=c, kernel_size=conv_width, activation=activation, padding=padding, kernel_initializer=kernel_initializer)(x) - - if conv_dropout > 0 and alpha_dropout: - x = AlphaDropout(conv_dropout)(x) - elif conv_dropout > 0 and spatial_dropout: - x = SpatialDropout1D(conv_dropout)(x) - elif conv_dropout > 0: - x = Dropout(conv_dropout)(x) - - if i >= max_pool_diff: - x = MaxPooling1D(max_pools[i-max_pool_diff])(x) - - f = Flatten()(x) - - annotations = annotations_in = Input(shape=(len(args.annotations),), name=args.annotation_set) - if annotation_batch_normalize: - annotations_in = BatchNormalization(axis=concat_axis)(annotations_in) - annotation_mlp = Dense(units=annotation_units, kernel_initializer=fc_initializer, activation=activation)(annotations_in) - - x = layers.concatenate([f, annotation_mlp], axis=1) - for fc in fc_layers: - if fc_batch_normalize: - x = Dense(units=fc, activation='linear', kernel_initializer=fc_initializer)(x) - x = BatchNormalization(axis=1)(x) - x = Activation(activation)(x) - else: - x = Dense(units=fc, activation=activation, kernel_initializer=fc_initializer)(x) - - if fc_dropout > 0 and alpha_dropout: - x = AlphaDropout(fc_dropout)(x) - elif fc_dropout > 0: - x = Dropout(fc_dropout)(x) - - if annotation_shortcut: - x = layers.concatenate([x, annotations_in], axis=1) - - prob_output = Dense(units=len(args.labels), activation='softmax', name='softmax_predictions')(x) - - model = Model(inputs=[reference, annotations], outputs=[prob_output]) - - # adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clipnorm=1.) - adam = Adam(lr=args.optimizer_learning_rate, beta_1=args.optimizer_beta_1, - beta_2=args.optimizer_beta_1, epsilon=args.optimizer_epsilon, clipnorm=args.optimizer_clipnorm) - model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=get_metrics(args.labels)) - model.summary() - - if os.path.exists(args.weights_hd5): - model.load_weights(args.weights_hd5, by_name=True) - print('Loaded model weights from:', args.weights_hd5) - - return model - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ~~~~~~~ Optimizing ~~~~~~~~~~~~~~ -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -def train_model_from_generators(args, model, generate_train, generate_valid, save_weight_hd5): - '''Train an image model for classifying variants. - - Training data lives on disk, it will be loaded by generator functions. - Plots the metric history after training. Creates a directory to save weights at if necessary. - - Arguments - args.batch_size: size of the mini-batches - args.patience: Maximum number of epochs to run without validation loss improvement - args.epochs: Maximum number of epochs to run regardless of Early Stopping - args.training_steps: Number of mini-batches in each so-called epoch - args.validation_steps: Number of validation mini-batches to examine after each epoch. 
-        model: the model to optimize
-        generate_train: training data generator function
-        generate_valid: validation data generator function
-        save_weight_hd5: path to save the model weights at
-
-    Returns
-        The now optimized keras model
-    '''
-    if not os.path.exists(os.path.dirname(save_weight_hd5)):
-        os.makedirs(os.path.dirname(save_weight_hd5))
-    serialize_model_semantics(args, save_weight_hd5)
-
-    history = model.fit_generator(generate_train,
-                                  steps_per_epoch=args.training_steps, epochs=args.epochs, verbose=1,
-                                  validation_steps=args.validation_steps, validation_data=generate_valid,
-                                  callbacks=get_callbacks(args, save_weight_hd5))
-    print('Training complete, model weights saved at: %s' % save_weight_hd5)
-    if args.image_dir:
-        plots.plot_metric_history(history, plots.weight_path_to_title(save_weight_hd5), prefix=args.image_dir)
-
-    return model
-
-
-def get_callbacks(args, save_weight_hd5):
-    callbacks = []
-
-    callbacks.append(ModelCheckpoint(filepath=save_weight_hd5, verbose=1, save_best_only=True))
-    callbacks.append(EarlyStopping(monitor='val_loss', patience=args.patience*4, verbose=1))
-    callbacks.append(ReduceLROnPlateau(monitor='val_loss', patience=args.patience, verbose=1))
-
-    if args.tensor_board:
-        callbacks.append(TensorBoard())
-
-    return callbacks
-
-
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# ~~~~~~~ Metrics ~~~~~~~~~~~~~~~~~
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-def precision(y_true, y_pred):
-    '''Calculates the precision, a metric for multi-label classification of
-    how many selected items are relevant.
-    '''
-    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
-    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
-    precision = true_positives / (predicted_positives + K.epsilon())
-    return precision
-
-
-def recall(y_true, y_pred):
-    '''Calculates the recall, a metric for multi-label classification of
-    how many relevant items are selected.
- ''' - true_positives = K.sum(K.round(K.clip(y_true*y_pred, 0, 1))) - possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) - recall = true_positives / (possible_positives + K.epsilon()) - return recall - - -def per_class_recall(labels): - recall_fxns = [] - - for label_key in labels: - label_idx = labels[label_key] - fxn = 'def '+ label_key + '_recall(y_true, y_pred):\n' - fxn += '\ttrue_positives = K.sum(K.round(K.clip(y_true*y_pred, 0, 1)), axis=0)\n' - fxn += '\tpossible_positives = K.sum(K.round(K.clip(y_true, 0, 1)), axis=0)\n' - fxn += '\treturn true_positives['+str(label_idx)+'] / (possible_positives['+str(label_idx)+'] + K.epsilon())\n' - - exec(fxn) - recall_fxn = eval(label_key + '_recall') - recall_fxns.append(recall_fxn) - - return recall_fxns - - -def per_class_precision(labels): - precision_fxns = [] - - for label_key in labels: - label_idx = labels[label_key] - fxn = 'def '+ label_key + '_precision(y_true, y_pred):\n' - fxn += '\ttrue_positives = K.sum(K.round(K.clip(y_true*y_pred, 0, 1)), axis=0)\n' - fxn += '\tpredicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0)\n' - fxn += '\treturn true_positives['+str(label_idx)+'] / (predicted_positives['+str(label_idx)+'] + K.epsilon())\n' - - exec(fxn) - precision_fxn = eval(label_key + '_precision') - precision_fxns.append(precision_fxn) - return precision_fxns - - -def get_metric_dict(labels=defines.SNP_INDEL_LABELS): - metrics = {'precision':precision, 'recall':recall} - precision_fxns = per_class_precision(labels) - recall_fxns = per_class_recall(labels) - for i,label_key in enumerate(labels.keys()): - metrics[label_key+'_precision'] = precision_fxns[i] - metrics[label_key+'_recall'] = recall_fxns[i] - return metrics - - -def per_class_recall_3d(labels): - recall_fxns = [] - - for label_key in labels: - label_idx = labels[label_key] - fxn = 'def '+ label_key + '_recall(y_true, y_pred):\n' - fxn += '\ttrue_positives = K.sum(K.sum(K.round(K.clip(y_true*y_pred, 0, 1)), axis=0), axis=0)\n' - fxn += '\tpossible_positives = K.sum(K.sum(K.round(K.clip(y_true, 0, 1)), axis=0), axis=0)\n' - fxn += '\treturn true_positives['+str(label_idx)+'] / (possible_positives['+str(label_idx)+'] + K.epsilon())\n' - - exec(fxn) - recall_fxn = eval(label_key + '_recall') - recall_fxns.append(recall_fxn) - - return recall_fxns - - -def per_class_precision_3d(labels): - precision_fxns = [] - - for label_key in labels: - label_idx = labels[label_key] - fxn = 'def '+ label_key + '_precision(y_true, y_pred):\n' - fxn += '\ttrue_positives = K.sum(K.sum(K.round(K.clip(y_true*y_pred, 0, 1)), axis=0), axis=0)\n' - fxn += '\tpredicted_positives = K.sum(K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0), axis=0)\n' - fxn += '\treturn true_positives['+str(label_idx)+'] / (predicted_positives['+str(label_idx)+'] + K.epsilon())\n' - - exec(fxn) - precision_fxn = eval(label_key + '_precision') - precision_fxns.append(precision_fxn) - - return precision_fxns - - -def get_metrics(classes=None, dim=2): - if classes and dim == 2: - return [metrics.categorical_accuracy] + per_class_precision(classes) + per_class_recall(classes) - elif classes and dim == 3: - return [metrics.categorical_accuracy] + per_class_precision_3d(classes) + per_class_recall_3d(classes) - else: - return [metrics.categorical_accuracy, precision, recall] - - -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ~~~~~~~ Serialization ~~~~~~~~~~~ -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -def serialize_model_semantics(args, architecture_hd5): - """Save a json file specifying model semantics, I/O contract. 
- - Arguments - args.tensor_name: String which indicates tensor map to use (from defines.py) or None - args.window_size: sites included in the tensor map - args.read_limit: Maximum reads included in the tensor map - args.annotations: List of annotations or None - args.id: the id of the run will be the name of the semantics file - architecture_hd5: Keras model and weights hd5 file (created with save_model()) - """ - semantics = { - 'id': args.id, - 'output_labels': args.labels, - 'architecture': os.path.basename(architecture_hd5), - 'input_symbols': args.input_symbols, - 'model_version': args.model_version, - 'gatk_version': args.gatk_version, - } - - if args.tensor_name: - semantics['input_tensor_map_name'] = args.tensor_name - semantics['input_tensor_map'] = tensor_maps.get_tensor_channel_map_from_args(args) - semantics['window_size'] = args.window_size - semantics['read_limit'] = args.read_limit - - if args.annotation_set and args.annotation_set != '_': - semantics['input_annotations'] = args.annotations - semantics['input_annotation_set'] = args.annotation_set - - if args.data_dir: - semantics['data_dir'] = args.data_dir - - semantics['channels_last'] = args.channels_last - - json_file_name = args.output_dir + args.id + '.json' - with open(json_file_name, 'w') as outfile: - json.dump(semantics, outfile) - - print('Saved model semantics at:', json_file_name) \ No newline at end of file diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/plots.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/plots.py deleted file mode 100644 index 491fd385148..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/plots.py +++ /dev/null @@ -1,180 +0,0 @@ -# plots.py -# -# Plotting code for Variant Filtration with Neural Nets -# This includes evaluation plots like Precision and Recall curves, -# various flavors of Receiver Operating Characteristic (ROC curves), -# As well as graphs of the metrics that are watched during neural net training. -# -# December 2016 -# Sam Friedman -# sam@broadinstitute.org - -# Imports -import os -import math -import matplotlib -import numpy as np -matplotlib.use('Agg') # Need this to write images from the GSA servers. 
Order matters: -import matplotlib.pyplot as plt # First import matplotlib, then use Agg, then import plt -from sklearn.metrics import roc_curve, auc, roc_auc_score, precision_recall_curve, average_precision_score - -image_ext = '.png' - -color_array = ['red', 'indigo', 'cyan', 'pink', 'purple'] -key_colors = { - 'Neural Net':'green', 'CNN_SCORE':'green', 'CNN_2D':'green', - 'Heng Li Hard Filters':'lightblue', - 'GATK Hard Filters':'orange','GATK Signed Distance':'darksalmon', - 'VQSR gnomAD':'cornflowerblue', 'VQSR Single Sample':'blue', 'VQSLOD':'cornflowerblue', - 'Deep Variant':'magenta', 'QUAL':'magenta', 'DEEP_VARIANT_QUAL':'magenta', - 'Random Forest':'darkorange', - 'SNP':'cornflowerblue', 'NOT_SNP':'orange', 'INDEL':'green', 'NOT_INDEL':'red', - 'VQSLOD none':'cornflowerblue', 'VQSLOD strModel':'orange', 'VQSLOD default':'green', - 'REFERENCE':'green', 'HET_SNP':'cornflowerblue', 'HOM_SNP':'blue', 'HET_DELETION':'magenta', - 'HOM_DELETION':'violet', 'HET_INSERTION':'orange', 'HOM_INSERTION':'darkorange' -} - -precision_label = 'Precision | Positive Predictive Value | TP/(TP+FP)' -recall_label = 'Recall | Sensitivity | True Positive Rate | TP/(TP+FN)' -fallout_label = 'Fallout | 1 - Specificity | False Positive Rate | FP/(FP+TN)' - - -def get_fpr_tpr_roc(model, test_data, test_truth, labels, batch_size=32): - """Get false positive and true positive rates from a classification model. - - Arguments: - model: The model whose predictions to evaluate. - test_data: Input testing data in the shape the model expects. - test_truth: The true labels of the testing data - labels: dict specifying the class labels. - batch_size: Size of batches for prediction over the test data. - - Returns: - dict, dict, dict: false positive rate, true positive rate, and area under ROC curve. - The dicts all use label indices as keys. fpr and tpr dict's values are lists - (the x and y coordinates that defines the ROC curves) and for AUC the value is a float. - """ - y_pred = model.predict(test_data, batch_size=batch_size, verbose=0) - return get_fpr_tpr_roc_pred(y_pred, test_truth, labels) - - -def get_fpr_tpr_roc_pred(y_pred, test_truth, labels): - """Get false positive and true positive rates from predictions and true labels. - - Arguments: - y_pred: model predictions to evaluate. - test_truth: The true labels of the testing data - labels: dict specifying the class labels. - - Returns: - dict, dict, dict: false positive rate, true positive rate, and area under ROC curve. - The dicts all use label indices as keys. fpr and tpr dict's values are lists - (the x and y coordinates that defines the ROC curves) and for AUC the value is a float. - """ - fpr = dict() - tpr = dict() - roc_auc = dict() - - for k in labels.keys(): - cur_idx = labels[k] - fpr[labels[k]], tpr[labels[k]], _ = roc_curve(test_truth[:,cur_idx], y_pred[:,cur_idx]) - roc_auc[labels[k]] = auc(fpr[labels[k]], tpr[labels[k]]) - - return fpr, tpr, roc_auc - - -def plot_roc_per_class(model, test_data, test_truth, labels, title, batch_size=32, prefix='./figures/'): - """Plot a per class ROC curve. - - Arguments: - model: The model whose predictions to evaluate. - test_data: Input testing data in the shape the model expects. - test_truth: The true labels of the testing data - labels: dict specifying the class labels. - title: the title to display on the plot. - batch_size: Size of batches for prediction over the test data. - prefix: path specifying where to save the plot. 
- """ - fpr, tpr, roc_auc = get_fpr_tpr_roc(model, test_data, test_truth, labels, batch_size) - - lw = 3 - plt.figure(figsize=(28,22)) - matplotlib.rcParams.update({'font.size': 34}) - - for key in labels.keys(): - if key in key_colors: - color = key_colors[key] - else: - color = np.random.choice(color_array) - plt.plot(fpr[labels[key]], tpr[labels[key]], color=color, lw=lw, - label=str(key)+' area under ROC: %0.3f'%roc_auc[labels[key]]) - - plt.plot([0, 1], [0, 1], 'k:', lw=0.5) - plt.xlim([0.0, 1.0]) - plt.ylim([-0.02, 1.03]) - plt.xlabel(fallout_label) - plt.ylabel(recall_label) - plt.title('ROC:'+ title + '\n') - - matplotlib.rcParams.update({'font.size': 56}) - plt.legend(loc="lower right") - figure_path = prefix+"per_class_roc_"+title+image_ext - if not os.path.exists(os.path.dirname(figure_path)): - os.makedirs(os.path.dirname(figure_path)) - plt.savefig(figure_path) - print('Saved figure at:', figure_path) - - -def plot_metric_history(history, title, prefix='./figures/'): - """Plot metric history throughout training. - - Arguments: - history: History object returned by Keras fit function. - title: the title to display on the plot. - prefix: path specifying where to save the plot. - """ - num_plots = len([k for k in history.history.keys() if not 'val' in k]) - - row = 0 - col = 0 - rows = 4 - cols = max(2, int(math.ceil(num_plots/float(rows)))) - - f, axes = plt.subplots(rows, cols, sharex=True, figsize=(36, 24)) - for k in sorted(history.history.keys()): - if 'val' not in k: - axes[row, col].plot(history.history[k]) - axes[row, col].set_ylabel(str(k)) - axes[row, col].set_xlabel('epoch') - if 'val_'+k in history.history: - axes[row, col].plot(history.history['val_'+k]) - labels = ['train', 'valid'] - else: - labels = [k] - axes[row, col].legend(labels, loc='upper left') - - row += 1 - if row == rows: - row = 0 - col += 1 - if row*col >= rows*cols: - break - - axes[0, 1].set_title(title) - figure_path = prefix+"metric_history_"+title+image_ext - if not os.path.exists(os.path.dirname(figure_path)): - os.makedirs(os.path.dirname(figure_path)) - plt.savefig(figure_path) - - -def weight_path_to_title(wp): - """Get a title from a model's weight path - - Arguments: - wp: path to model's weights. - - Returns: - str: a reformatted string - """ - return wp.split('/')[-1].replace('__', '-').split('.')[0] - diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/tensor_maps.py b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/tensor_maps.py deleted file mode 100644 index 733d77fd0e0..00000000000 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/tensor_maps.py +++ /dev/null @@ -1,91 +0,0 @@ -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# ~~~~~~~ Tensor Maps ~~~~~~~~~~~~~ -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -import keras.backend as K -from . 
import defines - - -def get_tensor_channel_map_from_args(args): - '''Return tensor mapping dict given args.tensor_name''' - if args.tensor_name is None: - return None - - if 'read_tensor' == args.tensor_name: - return get_read_tensor_channel_map() - elif 'reference' == args.tensor_name: - return get_tensor_channel_map_1d_dna() - else: - raise ValueError('Unknown tensor mapping mode:', args.tensor_name) - - -def get_tensor_channel_map_from_tensor_type(tensor_type: str): - """Return tensor mapping dict given args.tensor_name""" - if 'read_tensor' == tensor_type: - return get_read_tensor_channel_map() - elif 'reference' == tensor_type: - return get_tensor_channel_map_1d_dna() - else: - raise ValueError('Unknown tensor mapping mode:', tensor_type) - - -def get_tensor_channel_map_1d_dna(): - '''1D Reference tensor with 4 channel DNA encoding.''' - tensor_map = {} - for k in defines.DNA_SYMBOLS.keys(): - tensor_map[k] = defines.DNA_SYMBOLS[k] - - return tensor_map - - -def get_tensor_channel_map_reference_reads(): - '''Read and reference tensor with 4 channel DNA encoding. - Plus insertions and deletions. - ''' - tensor_map = {} - for k in defines.INPUTS_INDEL.keys(): - tensor_map['read_'+k] = defines.INPUTS_INDEL[k] - for k in defines.INPUTS_INDEL.keys(): - tensor_map['reference_'+k] = len(defines.INPUTS_INDEL) + defines.INPUTS_INDEL[k] - - return tensor_map - - -def get_read_tensor_channel_map(): - '''Read and reference tensor with 4 channel DNA encoding. - Also includes read flags. - ''' - tensor_map = {} - for k in defines.INPUTS_INDEL.keys(): - tensor_map['read_'+k] = defines.INPUTS_INDEL[k] - for k in defines.INPUTS_INDEL.keys(): - tensor_map['reference_'+k] = len(defines.INPUTS_INDEL) + defines.INPUTS_INDEL[k] - tensor_map['flag_bit_4'] = 10 - tensor_map['flag_bit_5'] = 11 - tensor_map['flag_bit_6'] = 12 - tensor_map['flag_bit_7'] = 13 - tensor_map['mapping_quality'] = 14 - return tensor_map - - -def tensor_shape_from_args(args): - in_channels = len(get_tensor_channel_map_from_args(args)) - if K.image_data_format() == 'channels_last': - tensor_shape = (args.read_limit, args.window_size, in_channels) - else: - tensor_shape = (in_channels, args.read_limit, args.window_size) - return tensor_shape - - -def tensor_shape_from_tensor_type(tensor_type: str, window_size: int, read_limit: int): - in_channels = len(get_tensor_channel_map_from_tensor_type(tensor_type)) - if K.image_data_format() == 'channels_last': - tensor_shape = (read_limit, window_size, in_channels) - else: - tensor_shape = (in_channels, read_limit, window_size) - return tensor_shape - - -def total_input_channels_from_args(args): - '''Get the number of channels in the tensor map''' - return len(get_tensor_channel_map_from_args(args)) - diff --git a/src/main/resources/large/cnn_score_variants/1d_cnn_mix_train_full_bn.hd5 b/src/main/resources/large/cnn_score_variants/1d_cnn_mix_train_full_bn.hd5 deleted file mode 100644 index 48581550b03..00000000000 --- a/src/main/resources/large/cnn_score_variants/1d_cnn_mix_train_full_bn.hd5 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd17c3a98f7651b4e7ee54d875c47ec12e18b75daf79b3744a2590ddb0d6b44d -size 20227144 diff --git a/src/main/resources/large/cnn_score_variants/1d_cnn_mix_train_full_bn.json b/src/main/resources/large/cnn_score_variants/1d_cnn_mix_train_full_bn.json deleted file mode 100644 index 666a73dfb9a..00000000000 --- a/src/main/resources/large/cnn_score_variants/1d_cnn_mix_train_full_bn.json +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:eda2517817b23238c2b28f69a1fa39e9b85b45985854f0a5d5508280e76da39e -size 519 diff --git a/src/main/resources/large/cnn_score_variants/2d_cnn_mix_train.hd5 b/src/main/resources/large/cnn_score_variants/2d_cnn_mix_train.hd5 deleted file mode 100644 index c7d8916236d..00000000000 --- a/src/main/resources/large/cnn_score_variants/2d_cnn_mix_train.hd5 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d70940bd9d7c6c862304c66d64233726dc30342ae7032a4636939e8249cbf46 -size 24459852 diff --git a/src/main/resources/large/cnn_score_variants/2d_cnn_mix_train.json b/src/main/resources/large/cnn_score_variants/2d_cnn_mix_train.json deleted file mode 100644 index 8efe4242f46..00000000000 --- a/src/main/resources/large/cnn_score_variants/2d_cnn_mix_train.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ed7feb0343e9ac03135b1456b2c8d2edab1b359c4950908c4d44152c0634a89 -size 732 diff --git a/src/main/resources/large/cnn_score_variants/small_2d.hd5 b/src/main/resources/large/cnn_score_variants/small_2d.hd5 deleted file mode 100644 index deb36d22e04..00000000000 --- a/src/main/resources/large/cnn_score_variants/small_2d.hd5 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f663a2fdbcde0addc5cb755f7af5d4c19bed92dccfd20e25b2acf2bc8c2ca7c -size 2163096 diff --git a/src/main/resources/large/cnn_score_variants/small_2d.json b/src/main/resources/large/cnn_score_variants/small_2d.json deleted file mode 100644 index c35cfbdfcae..00000000000 --- a/src/main/resources/large/cnn_score_variants/small_2d.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e38e09cfe7b7ffbc80dce4972bc9c382148520147d46738a3f6f3235b2d876c6 -size 758 diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/training.py b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/training.py deleted file mode 100644 index 574a50ae97e..00000000000 --- a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/training.py +++ /dev/null @@ -1,1161 +0,0 @@ -# Imports -import os -import vcf -import math -import h5py -import pysam -import vqsr_cnn -import numpy as np -from Bio import Seq, SeqIO -from collections import Counter - -# Keras Imports -import keras.backend as K - - -def run(): - args = vqsr_cnn.parse_args() - if 'write_reference_and_annotation_tensors' == args.mode: - write_reference_and_annotation_tensors(args) - elif 'write_read_and_annotation_tensors' == args.mode: - write_read_and_annotation_tensors(args) - elif 'train_default_1d_model' == args.mode: - train_default_1d_model(args) - elif 'train_default_2d_model' == args.mode: - train_default_2d_model(args) - elif 'train_args_model_on_read_tensors_and_annotations' == args.mode: - train_args_model_on_read_tensors_and_annotations(args) - elif 'train_args_model_on_reference_and_annotations' == args.mode: - train_args_model_on_read_tensors_and_annotations(args) - else: - raise ValueError('Unknown training mode:', args.mode) - - -def write_reference_and_annotation_tensors(args, include_dna=True, include_annotations=True): - if not args.tensor_name in vqsr_cnn.TENSOR_MAPS_1D: - raise ValueError('Unknown tensor name:', args.tensor_name, '1d maps must be in:', str(vqsr_cnn.TENSOR_MAPS_1D)) - - record_dict = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta")) - - vcf_reader = get_vcf_reader(args.input_vcf) - vcf_ram = get_vcf_reader(args.train_vcf) - - 
bed_dict = bed_file_to_dict(args.bed_file) - stats = Counter() - - if args.chrom: - variants = vcf_reader.fetch(args.chrom, args.start_pos, args.end_pos) - else: - variants = vcf_reader - - for variant in variants: - for allele_idx, allele in enumerate(variant.ALT): - idx_offset, ref_start, ref_end = get_variant_window(args, variant) - contig = record_dict[variant.CHROM] - record = contig[variant.POS-idx_offset: variant.POS+idx_offset] - - cur_label_key = get_true_label(allele, variant, bed_dict, vcf_ram, stats) - if not cur_label_key or downsample(args, cur_label_key, stats): - continue - - if include_annotations: - if all(map( - lambda x: x not in variant.INFO and x not in variant.FORMAT and x != "QUAL", args.annotations)): - stats['Missing ALL annotations'] += 1 - continue # Require at least 1 annotation... - annotation_data = get_annotation_data(args, variant, stats) - - if include_dna: - dna_data = np.zeros( (args.window_size, len(vqsr_cnn.DNA_SYMBOLS)) ) - for i,b in enumerate(record.seq): - if b in vqsr_cnn.DNA_SYMBOLS: - dna_data[i, vqsr_cnn.DNA_SYMBOLS[b]] = 1.0 - elif b in vqsr_cnn.AMBIGUITY_CODES: - dna_data[i] = vqsr_cnn.AMBIGUITY_CODES[b] - else: - raise ValueError('Error! Unknown code:', b) - - tp = get_path_to_train_valid_or_test(args.data_dir) - tp += cur_label_key +'/'+ plain_name(args.input_vcf) +'_'+ plain_name(args.train_vcf) - tp += '_allele_' + str(allele_idx) +'-'+ variant.CHROM +'_'+ str(variant.POS) + vqsr_cnn.TENSOR_SUFFIX - if not os.path.exists(os.path.dirname(tp)): - os.makedirs(os.path.dirname(tp)) - - with h5py.File(tp, 'w') as hf: - if include_annotations: - hf.create_dataset(args.annotation_set, data=annotation_data, compression='gzip') - if include_dna: - hf.create_dataset(args.tensor_name, data=dna_data, compression='gzip') - - stats[cur_label_key] += 1 - stats['count'] += 1 - if stats['count']%500==0: - print('Wrote', stats['count'], 'out of:', args.samples, 'Last variant:', variant) - if args.samples <= stats['count']: - break - - print('Done Writing 1D Tensors. Tensor Map:', args.tensor_name, ' Annotation set:', args.annotation_set) - for k in stats.keys(): - print(k, ' has:', stats[k]) - - -def write_read_and_annotation_tensors(args, include_annotations=True, pileup=False): - '''Create tensors structured as tensor map of reads organized by labels in the data directory. - - Defines true variants as those in the args.train_vcf, defines false variants as - those called in args.input_vcf and in the args.bed_file high confidence intervals, - but not in args.train_vcf. - - Arguments - args.data_dir: directory where tensors will live. Created here and filled with - subdirectories of test, valid and train, each containing - subdirectories for each label with tensors stored as hd5 files. - args.bam_file: BAM or BAMout file where the aligned reads are stored - args.input_vcf: VCF file with annotation values from Haplotype caller or VQSR - args.train_vcf: VCF file with true variant (from NIST or Platinum genomes, etc.) 
- args.bed_file: High confidence intervals for the calls in args.train_vcf - args.window_size: Size of sequence window around variant (width of the tensor) - args.read_limit: Maximum number of reads to include (height of the tensor) - args.chrom: Only write tensors from this chromosome (optional, used for parallelization) - args.start_pos: Only write tensors after this position (optional, used for parallelization) - args.end_pos: Only write tensors before this position (optional, used for parallelization) - ''' - print('Writing tensors with:', args.tensor_name, 'channel map.') - stats = Counter() - - samfile = pysam.AlignmentFile(args.bam_file, "rb") - bed_dict = bed_file_to_dict(args.bed_file) - record_dict = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta")) - vcf_reader = get_vcf_reader(args.input_vcf) - vcf_ram = get_vcf_reader(args.train_vcf) - - if args.chrom: - variants = vcf_reader.fetch(args.chrom, args.start_pos, args.end_pos) - else: - variants = vcf_reader - - for variant in variants: - for allele_idx, allele in enumerate(variant.ALT): - idx_offset, ref_start, ref_end = get_variant_window(args, variant) - contig = record_dict[variant.CHROM] - record = contig[ ref_start : ref_end ] - - cur_label_key = get_true_label(allele, variant, bed_dict, vcf_ram, stats) - if not cur_label_key or downsample(args, cur_label_key, stats): - continue - - if include_annotations: - if all(map( - lambda x: x not in variant.INFO and x not in variant.FORMAT and x != "QUAL", args.annotations)): - stats['Missing ALL annotations'] += 1 - continue # Require at least 1 annotation... - annotation_data = get_annotation_data(args, variant, stats) - - good_reads, insert_dict = get_good_reads(args, samfile, variant) - if len(good_reads) >= args.read_limit: - stats['More reads than read_limit'] += 1 - if len(good_reads) == 0: - stats['No reads aligned'] += 1 - continue - - reference_seq = record.seq - for i in sorted(insert_dict.keys(), key=int, reverse=True): - if i < 0: - reference_seq = vqsr_cnn.INDEL_CHAR*insert_dict[i] + reference_seq - else: - reference_seq = reference_seq[:i] + vqsr_cnn.INDEL_CHAR*insert_dict[i] + reference_seq[i:] - - read_tensor = good_reads_to_tensor(args, good_reads, ref_start, insert_dict) - reference_sequence_into_tensor(args, reference_seq, read_tensor) - - tensor_path = get_path_to_train_valid_or_test(args.data_dir) - tensor_prefix = plain_name(args.input_vcf) +'_'+ plain_name(args.train_vcf) - tensor_prefix += '_allele_' + str(allele_idx) + '-' + cur_label_key - tensor_path += cur_label_key + '/' + tensor_prefix + '-' + variant.CHROM - tensor_path += '_' + str(variant.POS) + vqsr_cnn.TENSOR_SUFFIX - stats[cur_label_key] += 1 - - if not os.path.exists(os.path.dirname(tensor_path)): - os.makedirs(os.path.dirname(tensor_path)) - with h5py.File(tensor_path, 'w') as hf: - if pileup: - pileup_tensor = read_tensor_to_pileup(args, read_tensor) - hf.create_dataset('pileup_tensor', data=pileup_tensor, compression='gzip') - hf.create_dataset(args.tensor_name, data=read_tensor, compression='gzip') - if include_annotations: - hf.create_dataset(args.annotation_set, data=annotation_data, compression='gzip') - - stats['count'] += 1 - if stats['count']%100 == 0: - print('Wrote', stats['count'], 'tensors out of', args.samples, ' last variant:', str(variant)) - if stats['count'] >= args.samples: - break - - for s in stats.keys(): - print(s, 'has:', stats[s]) - if variant: - print('Done generating tensors. 
Last variant:', str(variant), 'from vcf:', args.input_vcf) - - -def train_default_1d_model(args): - '''Train a 1D Convolution plus reference tracks and MLP Annotation architecture. - - Arguments: - args.data_dir: must be set to an appropriate directory with - subdirectories of test, valid and train, each containing - subdirectories for each label with tensors stored as hd5 files. - - Reference and Annotation tensors must be generated by calling - write_reference_and_annotation_tensors() before this function is used. - Performance curves for CNN are plotted on the test dataset. - ''' - train_paths, valid_paths, test_paths = get_train_valid_test_paths(args) - - generate_train = dna_annotation_generator(args, train_paths) - generate_valid = dna_annotation_generator(args, valid_paths) - - weight_path = vqsr_cnn.weight_path_from_args(args) - model = vqsr_cnn.build_default_1d_annotation_model(args) - model = vqsr_cnn.train_model_from_generators(args, model, generate_train, generate_valid, weight_path) - - test = load_dna_annotations_positions_from_class_dirs(args, test_paths, per_class_max=args.samples) - if args.image_dir: - vqsr_cnn.plot_roc_per_class(model, [test[0], test[1]], test[2], args.labels, args.id, prefix=args.image_dir) - - -def train_default_2d_model(args): - '''Trains a reference, read, and annotation CNN architecture on tensors at the supplied data directory. - - This architecture looks at reads, read flags, reference sequence, and variant annotations. - Tensors must be generated by calling write_read_and_annotation_tensors() before this function is used. - After training with early stopping performance curves are plotted on the test dataset. - - Arguments: - args.data_dir: must be set to an appropriate directory with - subdirectories of test, valid and train, each containing - subdirectories for each label with tensors stored as hd5 files. - - ''' - train_paths, valid_paths, test_paths = get_train_valid_test_paths(args) - - generate_train = tensor_generator_from_label_dirs_and_args(args, train_paths) - generate_valid = tensor_generator_from_label_dirs_and_args(args, valid_paths) - - weight_path = vqsr_cnn.weight_path_from_args(args) - model = vqsr_cnn.build_default_2d_annotation_model(args) - model = vqsr_cnn.train_model_from_generators(args, model, generate_train, generate_valid, weight_path) - - test = load_tensors_and_annotations_from_class_dirs(args, test_paths, per_class_max=args.samples) - if args.image_dir: - vqsr_cnn.plot_roc_per_class(model, [test[0], test[1]], test[2], args.labels, args.id, - prefix=args.image_dir, batch_size=args.batch_size) - - -def train_args_model_on_read_tensors_and_annotations(args): - '''Trains a reference, read, and annotation CNN architecture on tensors at the supplied data directory. - - This architecture looks at reads, read flags, reference sequence, and variant annotations. - Tensors must be generated by calling write_read_and_annotation_tensors() before this function is used. - After training with early stopping performance curves are plotted on the test dataset. - - Arguments: - args.data_dir: must be set to an appropriate directory with - subdirectories of test, valid and train, each containing - subdirectories for each label with tensors stored as hd5 files. 
- - ''' - train_paths, valid_paths, test_paths = get_train_valid_test_paths(args) - - generate_train = tensor_generator_from_label_dirs_and_args(args, train_paths) - generate_valid = tensor_generator_from_label_dirs_and_args(args, valid_paths) - - weight_path = vqsr_cnn.weight_path_from_args(args) - model = vqsr_cnn.build_2d_annotation_model_from_args(args) - model = vqsr_cnn.train_model_from_generators(args, model, generate_train, generate_valid, weight_path) - - test = load_tensors_and_annotations_from_class_dirs(args, test_paths, per_class_max=args.samples) - if args.image_dir: - vqsr_cnn.plot_roc_per_class(model, [test[0], test[1]], test[2], args.labels, args.id, - prefix=args.image_dir, batch_size=args.batch_size) - - -def train_small_model_on_read_tensors_and_annotations(args): - '''Trains a reference, read, and annotation CNN architecture on tensors at the supplied data directory. - - This architecture looks at reads, read flags, reference sequence, and variant annotations. - Tensors must be generated by calling write_read_and_annotation_tensors() before this function is used. - After training with early stopping performance curves are plotted on the test dataset. - - Arguments: - args.data_dir: must be set to an appropriate directory with - subdirectories of test, valid and train, each containing - subdirectories for each label with tensors stored as hd5 files. - - ''' - train_paths, valid_paths, test_paths = get_train_valid_test_paths(args) - - generate_train = tensor_generator_from_label_dirs_and_args(args, train_paths) - generate_valid = tensor_generator_from_label_dirs_and_args(args, valid_paths) - - weight_path = vqsr_cnn.weight_path_from_args(args) - model = vqsr_cnn.build_small_2d_annotation_model(args) - model = vqsr_cnn.train_model_from_generators(args, model, generate_train, generate_valid, weight_path) - - test = load_tensors_and_annotations_from_class_dirs(args, test_paths, per_class_max=args.samples) - if args.image_dir: - vqsr_cnn.plot_roc_per_class(model, [test[0], test[1]], test[2], args.labels, args.id, - prefix=args.image_dir, batch_size=args.batch_size) - - -def get_annotation_data(args, annotation_variant, stats): - '''Return an array annotation data about the variant. 
- - Arguments: - args.annotations: List of variant annotations to use - annotation_variant: the variant with annotation - stats: Counter of run statistics - - Returns: - annotation_data: numpy array of annotation values - ''' - annotation_data = np.zeros((len(args.annotations),)) - - for i, a in enumerate(args.annotations): - if a == 'QUAL': - annotation_data[i] = annotation_variant.QUAL - elif a == 'AF': - annotation_data[i] = annotation_variant.INFO[a][0] - elif a in annotation_variant.INFO and not math.isnan(annotation_variant.INFO[a]): - annotation_data[i] = annotation_variant.INFO[a] - elif a == 'MBQ': - call = annotation_variant.genotype(args.sample_name) - annotation_data[i] = call.data.MBQ - elif a == 'MPOS': - call = annotation_variant.genotype(args.sample_name) - annotation_data[i] = call.data.MPOS - elif a == 'MMQ': - call = annotation_variant.genotype(args.sample_name) - annotation_data[i] = call.data.MMQ - elif a == 'MFRL_0': - call = annotation_variant.genotype(args.sample_name) - annotation_data[i] = call.data.MFRL[0] - elif a == 'MFRL_1': - call = annotation_variant.genotype(args.sample_name) - annotation_data[i] = call.data.MFRL[1] - elif a == 'AD_0': - call = annotation_variant.genotype(args.sample_name) - annotation_data[i] = call.data.AD[0] - elif a == 'AD_1': - call = annotation_variant.genotype(args.sample_name) - annotation_data[i] = call.data.AD[1] - else: - stats['Could not find annotation:' + a] += 1 - - return annotation_data - - -def get_good_reads(args, samfile, variant, sort_by='base'): - '''Return an array of usable reads centered at the variant. - - Ignores artificial haplotype read group. - Relies on pysam's cigartuples structure see: http://pysam.readthedocs.io/en/latest/api.html - Match, M -> 0 - Insert, I -> 1 - Deletion, D -> 2 - Ref Skip, N -> 3 - Soft Clip, S -> 4 - - Arguments: - args.read_limit: maximum number of reads to return - samfile: the BAM (or BAMout) file - variant: the variant around which reads will load - - Returns: - good_reads: array of usable reads sorted by reference start position - insert_dict: a dict mapping read indices to max insertions at that point - ''' - good_reads = [] - insert_dict = {} - - idx_offset, ref_start, ref_end = get_variant_window(args, variant) - - for read in samfile.fetch(variant.CHROM, variant.POS-1, variant.POS+1): - - if not read or not hasattr(read, 'cigarstring') or read.cigarstring is None: - continue - - read_group = read.get_tag('RG') - if 'artificial' in read_group.lower(): - continue - - index_dif = ref_start - read.reference_start - if abs(index_dif) >= args.window_size: - continue - - if 'I' in read.cigarstring: - cur_idx = 0 - for t in read.cigartuples: - if t[0] == vqsr_cnn.CIGAR_CODE['I']: - insert_idx = cur_idx - index_dif - if insert_idx not in insert_dict: - insert_dict[insert_idx] = t[1] - elif insert_dict[insert_idx] < t[1]: - insert_dict[insert_idx] = t[1] - - if t[0] in [vqsr_cnn.CIGAR_CODE['M'], vqsr_cnn.CIGAR_CODE['I'], - vqsr_cnn.CIGAR_CODE['S'], vqsr_cnn.CIGAR_CODE['D']]: - cur_idx += t[1] - - good_reads.append(read) - - if len(good_reads) > args.read_limit: - good_reads = np.random.choice(good_reads, size=args.read_limit, replace=False).tolist() - - good_reads.sort(key=lambda x: x.reference_start + x.query_alignment_start) - if sort_by == 'base': - good_reads.sort(key=lambda read: get_base_to_sort_by(read, variant)) - - return good_reads, insert_dict - - -def get_base_to_sort_by(read, variant): - if len(read.query_alignment_sequence) > 0: - max_idx = 
len(read.query_alignment_sequence)-1 - else: - return 'Z' - - if variant.is_snp: - return read.query_alignment_sequence[clamp((variant.POS-read.reference_start)-1, 0, max_idx)] - elif variant.is_indel: - var_idx = variant.POS-read.reference_start - cur_idx = 0 - for cur_op, length in read.cigartuples: - cur_idx += length - if cur_idx > var_idx: - if cur_op == vqsr_cnn.CIGAR_CODE['M']: - return read.query_alignment_sequence[clamp(var_idx, 0, max_idx)] - else: - return vqsr_cnn.CODE2CIGAR[cur_op] - return 'Y' - - -def clamp(n, minn, maxn): - return max(min(maxn, n), minn) - - -def good_reads_to_tensor(args, good_reads, ref_start, insert_dict): - '''Create a read tensor based on a tensor channel map. - - Assumes read pairs have the same name. - Only loads reads that might align inside the tensor. - - Arguments: - args.read_limit: maximum number of reads to return - good_reads: list of reads to make arrays from - ref_start: the beginning of the window in reference coordinates - insert_dict: a dict mapping read indices to max insertions at that point. - - Returns: - tensor: 3D read tensor. - ''' - channel_map = vqsr_cnn.get_tensor_channel_map_from_args(args) - tensor = np.zeros( vqsr_cnn.tensor_shape_from_args(args) ) - - for j,read in enumerate(good_reads): - - rseq, rqual = sequence_and_qualities_from_read(args, read, ref_start, insert_dict) - flag_start = -1 - flag_end = 0 - - for i,b in enumerate(rseq): - - if i == args.window_size: - break - - if b == vqsr_cnn.SKIP_CHAR: - continue - elif flag_start == -1: - flag_start = i - else: - flag_end = i - - if b in args.input_symbols: - if b == vqsr_cnn.INDEL_CHAR: - if K.image_data_format() == 'channels_last': - tensor[j, i, args.input_symbols[b]] = 1.0 - else: - tensor[args.input_symbols[b], j, i] = 1.0 - else: - hot_array = quality_from_mode(args, rqual[i], b, args.input_symbols) - if K.image_data_format() == 'channels_last': - tensor[j, i, :4] = hot_array - else: - tensor[:4, j, i] = hot_array - - elif b in vqsr_cnn.AMBIGUITY_CODES: - if K.image_data_format() == 'channels_last': - tensor[j, i, :4] = vqsr_cnn.AMBIGUITY_CODES[b] - else: - tensor[:4, j, i] = vqsr_cnn.AMBIGUITY_CODES[b] - - else: - print('Error! 
Unknown symbol in seq block:', b) - return - - flags = flag_to_array(read.flag) - for i in range(vqsr_cnn.READ_FLAGS): - flag_str = 'flag_bit_'+ str(i) - - if flags[i] and flag_str in channel_map: - if K.image_data_format() == 'channels_last': - tensor[j, flag_start:flag_end, channel_map[flag_str]] = 1.0 - else: - tensor[channel_map[flag_str], j, flag_start:flag_end] = 1.0 - - if 'mapping_quality' in channel_map: - mq = float(read.mapping_quality)/vqsr_cnn.MAPPING_QUALITY_MAX - if K.image_data_format() == 'channels_last': - tensor[j, flag_start:flag_end, channel_map['mapping_quality']] = mq - else: - tensor[channel_map['mapping_quality'], j, flag_start:flag_end] = mq - - return tensor - - -def sequence_and_qualities_from_read(args, read, ref_start, insert_dict): - cur_idx = 0 - my_indel_dict = {} - no_qual_filler = 0 - - index_dif = ref_start - read.reference_start - for t in read.cigartuples: - my_ref_idx = cur_idx - index_dif - if t[0] == vqsr_cnn.CIGAR_CODE['I'] and my_ref_idx in insert_dict: - my_indel_dict[my_ref_idx] = insert_dict[my_ref_idx] - t[1] - elif t[0] == vqsr_cnn.CIGAR_CODE['D']: - my_indel_dict[my_ref_idx] = t[1] - if t[0] in [vqsr_cnn.CIGAR_CODE['M'], vqsr_cnn.CIGAR_CODE['I'], - vqsr_cnn.CIGAR_CODE['S'], vqsr_cnn.CIGAR_CODE['D']]: - cur_idx += t[1] - - for k in insert_dict.keys(): - if k not in my_indel_dict: - my_indel_dict[k] = insert_dict[k] - - rseq = read.query_alignment_sequence[:args.window_size] - rqual = read.query_alignment_qualities[:args.window_size].tolist() - - if index_dif > 0: - rseq = rseq[index_dif:] - rqual = rqual[index_dif:] - elif index_dif < 0: - rseq = vqsr_cnn.SKIP_CHAR*(-index_dif) + rseq - rqual = [no_qual_filler]*(-index_dif) + rqual - - for j in sorted(my_indel_dict.keys(), key=int, reverse=True): - if j < 1: - rseq = (vqsr_cnn.INDEL_CHAR*my_indel_dict[j]) + rseq - rqual = ([no_qual_filler]*my_indel_dict[j]) + rqual - else: - rseq = rseq[:j] + (vqsr_cnn.INDEL_CHAR*my_indel_dict[j]) + rseq[j:] - rqual = rqual[:j] + ([no_qual_filler]*my_indel_dict[j]) + rqual[j:] - - return rseq, rqual - - -def read_tensor_to_pileup(args, read_tensor): - tensor_map = vqsr_cnn.get_tensor_channel_map_from_args(args) - channels = vqsr_cnn.get_reference_and_read_channels(args) - pileup_tensor = np.zeros((args.window_size, channels)) - - for i in range(args.window_size): - for key in tensor_map: - if 'read' not in key and 'reference' not in key: - continue - - if 'read' in key and K.image_data_format() == 'channels_last': - pileup_tensor[i, tensor_map[key]] = np.sum(read_tensor[:, i, tensor_map[key]]) / args.window_size - elif 'read' in key: - pileup_tensor[i, tensor_map[key]] = np.sum(read_tensor[tensor_map[key], :, i]) / args.window_size - elif 'reference' in key and K.image_data_format() == 'channels_last': - pileup_tensor[i, tensor_map[key]] = np.amax(read_tensor[:, i, tensor_map[key]]) - elif 'reference' in key: - pileup_tensor[i, tensor_map[key]] = np.amax(read_tensor[tensor_map[key], :, i]) - else: - raise ValueError('Error unexpected key:'+key) - - return pileup_tensor - - -def reference_sequence_into_tensor(args, reference_seq, tensor): - ref_offset = len(set(args.input_symbols.values())) - for i,b in enumerate(reference_seq): - if i == args.window_size: - break - if b in args.input_symbols: - if K.image_data_format() == 'channels_last': - tensor[:, i, ref_offset+args.input_symbols[b]] = 1.0 - else: - tensor[ref_offset+args.input_symbols[b], :, i] = 1.0 - elif b in vqsr_cnn.AMBIGUITY_CODES: - ambiguous_vector = np.tile(vqsr_cnn.AMBIGUITY_CODES[b], 
(args.read_limit, 1)) - if K.image_data_format() == 'channels_last': - tensor[:, i, ref_offset:ref_offset+4] = ambiguous_vector - else: - tensor[ref_offset:ref_offset+4, :, i] = np.transpose(ambiguous_vector) - - -def flag_to_array(flag): - flags = [] - - for i in range(vqsr_cnn.READ_FLAGS): - flags.append((flag>>i)&1) - - return np.array(flags) - - -def add_flags_to_read_tensor(args, tensor, tensor_channel_map, flags): - for k in tensor_channel_map.keys(): - if 'flag' in k: - flag_bit = int(k.split('_')[-1]) - for read_idx in range(flags.shape[1]): - if K.image_data_format() == 'channels_last': - tensor[read_idx, :, tensor_channel_map[k]] = flags[flag_bit, read_idx] - else: - tensor[tensor_channel_map[k], read_idx, :] = flags[flag_bit, read_idx] - - -def add_mq_to_read_tensor(args, tensor, tensor_channel_map, mapping_qualities): - if not 'mapping_quality' in tensor_channel_map: - return - - for read_idx, mq in enumerate(mapping_qualities): - if K.image_data_format() == 'channels_last': - tensor[read_idx, :, tensor_channel_map['mapping_quality']] = float(mq) / vqsr_cnn.MAPPING_QUALITY_MAX - else: - tensor[tensor_channel_map['mapping_quality'], read_idx, :] = float(mq) / vqsr_cnn.MAPPING_QUALITY_MAX - - -def base_quality_to_phred_array(base_quality, base, base_dict): - phred = np.zeros((4,)) - exponent = float(-base_quality) / 10.0 - p = 1.0-(10.0**exponent) # Convert to probability - not_p = (1.0-p) / 3.0 # Error could be any of the other 3 bases - not_base_quality = -10 * np.log10(not_p) # Back to Phred - - for b in base_dict.keys(): - if b == vqsr_cnn.INDEL_CHAR: - continue - elif b == base: - phred[base_dict[b]] = base_quality - else: - phred[base_dict[b]] = not_base_quality - return phred - - -def base_quality_to_p_hot_array(base_quality, base, base_dict): - phot = np.zeros((4,)) - exponent = float(-base_quality) / 10.0 - p = 1.0-(10.0**exponent) - not_p = (1.0-p)/3.0 - - for b in base_dict.keys(): - if b == base: - phot[base_dict[b]] = p - elif b == vqsr_cnn.INDEL_CHAR: - continue - else: - phot[base_dict[b]] = not_p - - return phot - - -def quality_from_mode(args, base_quality, base, base_dict): - if args.base_quality_mode == 'phot': - return base_quality_to_p_hot_array(base_quality, base, base_dict) - elif args.base_quality_mode == 'phred': - return base_quality_to_phred_array(base_quality, base, base_dict) - elif args.base_quality_mode == '1hot': - one_hot = np.zeros((4,)) - one_hot[base_dict[base]] = 1.0 - return one_hot - else: - raise ValueError('Error! Unknown base quality mode:', args.base_quality_mode) - - - -def get_true_label(allele, variant, bed_dict, truth_vcf, stats): - '''Defines the truth status of a variant allele given a truth vcf and confident region. - - Arguments: - allele: The allele to check - variant: the variant whose allele we will check - bed_dict: confident region dict defined by intervals e.g. from bed_file_to_dict() - truth_vcf: vcf of validated variants - stats: Counter dict used to keep track of the label distribution, etc. 
- - Returns: - None if outside the confident region - Otherwise a label string: - SNP if variant is snp and in truth vcf - INDEL if variant is indel and in truth vcf - NOT_SNP if variant is snp and not in truth vcf - NOT_INDEL if variant is indel and not in truth vcf - ''' - in_bed = in_bed_file(bed_dict, variant.CHROM, variant.POS) - - if allele_in_vcf(allele, variant, truth_vcf) and in_bed: - class_prefix = '' - elif in_bed: - class_prefix = 'NOT_' - else: - stats['Variant outside confident bed file'] += 1 - return None - - if variant.is_snp: - cur_label_key = class_prefix + 'SNP' - elif variant.is_indel: - cur_label_key = class_prefix + 'INDEL' - else: - stats['Not SNP or INDEL'] += 1 - return None - - return cur_label_key - - -def downsample(args, cur_label_key, stats): - '''Indicates whether or not to downsample a variant. - - Arguments: - args.skip_positive_class: Skip all positive examples - args.downsample_snps: fraction of SNPs to keep - args.downsample_indels: fraction of INDELs to keep - cur_label_key: truth label from get_true_label() - stats: Counter dict used to keep track of a run - - Returns: - Boolean: should we downsample this variant or not. - ''' - if args.skip_positive_class and cur_label_key in ['SNP', 'INDEL']: - stats['Downsampled positive examples'] += 1 - return True - - if args.downsample_snps < 1.0 and cur_label_key == 'SNP': - dice = np.random.rand() - if dice > args.downsample_snps: - stats['Downsampled SNPs'] += 1 - return True - elif args.downsample_indels < 1.0 and cur_label_key == 'INDEL': - dice = np.random.rand() - if dice > args.downsample_indels: - stats['Downsampled INDELs'] += 1 - return True - if args.downsample_not_snps < 1.0 and cur_label_key == 'NOT_SNP': - dice = np.random.rand() - if dice > args.downsample_not_snps: - stats['Downsampled NOT_SNPs'] += 1 - return True - elif args.downsample_not_indels < 1.0 and cur_label_key == 'NOT_INDEL': - dice = np.random.rand() - if dice > args.downsample_not_indels: - stats['Downsampled NOT_INDELs'] += 1 - return True - - return False - - -def interval_file_to_dict(interval_file, shift1=0, skip=['@']): - ''' Create a dict to store intervals from a interval list file. - - Arguments: - interval_file: the file to load either a bed file -> shift1 should be 1 - or a picard style interval_list file -> shift1 should be 0 - shift1: Shift the intervals 1 position over to align with 1-indexed VCFs - skip: Comment character to ignore - Returns: - intervals: dict where keys in the dict are contig ids - values are a tuple of arrays the first array - in the tuple contains the start positions - the second array contains the end positions. - ''' - intervals = {} - - with open(interval_file) as f: - for line in f: - if line[0] in skip: - continue - - parts = line.split() - contig = parts[0] - lower = int(parts[1])+shift1 - upper = int(parts[2])+shift1 - - if contig not in intervals: - intervals[contig] = ([], []) - - intervals[contig][0].append(lower) - intervals[contig][1].append(upper) - - for k in intervals.keys(): - intervals[k] = (np.array(intervals[k][0]), np.array(intervals[k][1])) - - return intervals - - -def bed_file_to_dict(bed_file): - return interval_file_to_dict(bed_file, shift1=1) - - -def in_bed_file(bed_dict, contig, pos): - - if not contig in bed_dict: - return False - - lows = bed_dict[contig][0] - ups = bed_dict[contig][1] - - # Half open interval [#,#) - return np.any((lows <= pos) & (pos < ups)) - - -def allele_in_vcf(allele, variant, vcf_ram): - ''' Check if variant's allele is in a VCF file. 
- - Arguments - allele: the allele from the provided variant that we are checking - variant: the variant whose allele we are looking for - vcf_ram: the VCF we look in, must have an index (tbi, or idx) - - Returns - variant if it is found otherwise None - ''' - if not variant.CHROM in vcf_ram.contigs: - return None - - try: - variants = vcf_ram.fetch(variant.CHROM, variant.POS-1, variant.POS) - except ValueError as e: - print('catching value error on fetch') - return None - - for v in variants: - if v.CHROM == variant.CHROM and v.POS == variant.POS and allele in v.ALT: - return v - - return None - - -def get_variant_window(args, variant): - index_offset = (args.window_size//2) - reference_start = variant.POS-(index_offset+1) - reference_end = variant.POS+index_offset - - return index_offset, reference_start, reference_end - - -def dna_annotation_generator(args, train_paths): - """Data generator of DNA and annotation tensors. - - Assumes train paths contains example in labelled directories. - Loops over all examples sampling args.batch_size examples - uniformly from each label. - - Arguments: - args: args object needed for batch_size, labels, and annotations - train_paths: array of label directories with hd5 tensors within each - - Returns: - A tuple with a dict of the input tensors - and a 1-Hot matrix (2D numpy array) of the labels. - """ - per_batch_per_label = (args.batch_size // len(args.labels)) - tensor_counts = Counter() - tensors = {} - - if args.window_size > 0: - channel_map = vqsr_cnn.get_tensor_channel_map_from_args(args) - tensor = np.zeros((args.batch_size, args.window_size, len(channel_map))) - - annotation_data = np.zeros((args.batch_size, len(args.annotations))) - label_matrix = np.zeros((args.batch_size, len(args.labels))) - - - for tp in train_paths: - label_key = os.path.basename(tp) - if label_key not in args.labels: - print('Skipping label directory:', label_key, ' which is not in args label set:', args.labels.keys()) - continue - label = args.labels[label_key] - - tensors[label] = [os.path.join(tp, t) for t in os.listdir(tp) - if os.path.splitext(t)[1] == vqsr_cnn.TENSOR_SUFFIX] - tensor_counts[label] = 0 - print('Found ', len(tensors[label]), 'examples of label:', label, 'in:', tp) - - while True: - cur_example = 0 - for label in tensors.keys(): - for i in range(per_batch_per_label): - tensor_path = tensors[label][tensor_counts[label]] - label_matrix[cur_example, label] = 1.0 - with h5py.File(tensor_path,'r') as hf: - annotation_data[cur_example,:] = np.array(hf.get(args.annotation_set)) - if args.window_size > 0: - tensor[cur_example,:,:] = np.array(hf.get(args.tensor_name)) - - tensor_counts[label] += 1 - if tensor_counts[label] == len(tensors[label]): - np.random.shuffle(tensors[label]) - print('\nGenerator shuffled & looped over:', tensor_counts[label], - 'examples of label:',label, '\nLast tensor was:', tensor_path) - tensor_counts[label] = 0 - cur_example += 1 - if cur_example == args.batch_size: - break - - if args.window_size > 0: - yield ({args.tensor_name:tensor, args.annotation_set:annotation_data}, label_matrix) - else: - yield (annotation_data, label_matrix) - - - -def tensor_generator_from_label_dirs_and_args(args, train_paths, with_positions=False): - """Data generator of tensors with reads, and annotations. - - Assumes train paths contains example in labelled directories. - Loops over all examples sampling args.batch_size examples - uniformly from each label. 
- - Arguments: - args: args object needed for batch_size, labels, and annotations - train_paths: array of label directories with hd5 tensors within each - with_positions: boolean if True will include a position string - (i.e. "1_1234_0" for tensor from contig one base 1234 and first allele) - as the last element in each tensor tuple. - Returns: - A tuple with a dict of the input tensors - and a 1-Hot matrix (2D numpy array) of the labels. - """ - batch = {} - tensors = {} - tensor_counts = Counter() - per_batch_per_label = (args.batch_size // len(args.labels) ) - - tm = vqsr_cnn.get_tensor_channel_map_from_args(args) - if tm: - tensor_shape = vqsr_cnn.tensor_shape_from_args(args) - batch[args.tensor_name] = np.zeros(((args.batch_size,)+tensor_shape)) - - if vqsr_cnn.annotations_from_args(args): - batch[args.annotation_set] = np.zeros((args.batch_size, len(args.annotations))) - - if with_positions: - positions = [] - - label_matrix = np.zeros((args.batch_size, len(args.labels))) - - for tp in train_paths: - label_key = os.path.basename(tp) - if label_key not in args.labels: - print('Skipping label directory:', label_key, ' which is not in args label set:', args.labels.keys()) - continue - label = args.labels[label_key] - tensors[label] = [os.path.join(tp, t) for t in os.listdir(tp) - if os.path.splitext(t)[1] == vqsr_cnn.TENSOR_SUFFIX] - tensor_counts[label] = 0 - print('Found ', len(tensors[label]), 'examples of label:', label, 'in:', tp) - - while True: - cur_example = 0 - for label in tensors.keys(): - for i in range(per_batch_per_label): - tensor_path = tensors[label][tensor_counts[label]] - - with h5py.File(tensor_path, 'r') as hf: - for key in batch.keys(): - batch[key][cur_example] = np.array(hf.get(key)) - - label_matrix[cur_example, label] = 1.0 - tensor_counts[label] += 1 - if tensor_counts[label] == len(tensors[label]): - np.random.shuffle(tensors[label]) - print('\nGenerator looped over:', tensor_counts[label], - 'examples of label:', label, '\nShuffled them. Last tensor was:', tensor_path) - tensor_counts[label] = 0 - - if with_positions: - positions.append(position_string_from_tensor_name(tensor_path)) - - cur_example += 1 - if cur_example == args.batch_size: - break - - if with_positions: - yield (batch, label_matrix, positions) - positions = [] - else: - yield (batch, label_matrix) - label_matrix = np.zeros((args.batch_size, len(args.labels))) - if with_positions and tm: - tensor_shape = vqsr_cnn.tensor_shape_from_args(args) - batch[args.tensor_name] = np.zeros(((args.batch_size,)+tensor_shape)) - - if with_positions and vqsr_cnn.annotations_from_args(args): - batch[args.annotation_set] = np.zeros((args.batch_size, len(args.annotations))) - - -def load_dna_annotations_positions_from_class_dirs(args, train_paths, - per_class_max=4000, include_dna=True, include_annotations=True): - count = 0 - - annotation_data = [] - reference_data = [] - labels_data = [] - positions = [] - - for tp in train_paths: - label_key = os.path.basename(tp) - if label_key not in args.labels: - print('Skipping label directory:', label_key, ' which is not in args label set:', args.labels.keys()) - continue - label = args.labels[label_key] - imgs = os.listdir(tp) - count += 1 - print(count, " dir out of:", len(train_paths), tp, "has:", len(imgs)) - this_t = 0 - for t in imgs: - this_t += 1 - if this_t > per_class_max: - print('Per class max reached. 
bailing at', this_t) - break - - fn, file_extension = os.path.splitext(t) - if not file_extension.lower() == vqsr_cnn.TENSOR_SUFFIX: - continue - - with h5py.File(tp+'/'+t, 'r') as hf: - if include_annotations: - annotation_data.append(np.array(hf.get(args.annotation_set))) - if include_dna: - reference_data.append(np.array(hf.get(args.tensor_name))) - - y_vector = np.zeros(len(args.labels)) # One hot Y vector of size labels, correct label is 1 others are 0 - y_vector[label] = 1.0 - labels_data.append(y_vector) - positions.append(position_string_from_tensor_name(t)) - - if include_dna and include_annotations: - return np.asarray(reference_data), np.asarray(annotation_data), np.asarray(labels_data), np.asarray(positions) - elif include_annotations: - return np.asarray(annotation_data), np.asarray(labels_data), np.asarray(positions) - elif include_dna: - return np.asarray(reference_data), np.asarray(labels_data), np.asarray(positions) - - -def load_tensors_and_annotations_from_class_dirs(args, train_paths, per_class_max=2500, position_dict=None): - annotations = [] - positions = [] - tensors = [] - labels = [] - count = 0 - - for tp in train_paths: - label_key = os.path.basename(tp) - if label_key not in args.labels: - print('Skipping label directory:', label_key, ' which is not in args label set:', args.labels.keys()) - continue - - label = args.labels[label_key] - imgs = os.listdir(tp) - count += 1 - this_t = 0 - for t in imgs: - if this_t > per_class_max: - print('Per class max reached. bailing at', this_t) - break - - fn, file_extension = os.path.splitext(t) - if not file_extension.lower() == vqsr_cnn.TENSOR_SUFFIX: - continue - - with h5py.File(tp+'/'+t, 'r') as hf: - tensors.append(np.array(hf.get(args.tensor_name))) - annotations.append(np.array(hf.get(args.annotation_set))) - - y_vector = np.zeros(len(args.labels)) # One hot Y vector of size labels, correct label is 1 all others are 0 - y_vector[label] = 1.0 - labels.append(y_vector) - positions.append(position_string_from_tensor_name(t)) - this_t += 1 - - print(count, " dir out of:", len(train_paths), tp, "has:", len(imgs), 'Loaded:', this_t) - - return np.asarray(tensors), np.asarray(annotations), np.asarray(labels), np.asarray(positions) - - -def position_string_from_tensor_name(tensor_name): - '''Genomic position as underscore delineated string from a filename. - - Includes an allele index if the filename includes _allele_ - This is ugly, we need file names ending with genomic position - (e.g. 
my_tensor-12_1234.h5 returns 12_1234 and a_tensor_allele_1-8_128.hd5 returns 8_128_1) - - Arguments: - tensor_name: the filename to parse - Returns: - Genomic position string Contig_Position or Contig_Position_AlleleIndex - ''' - slash_split = tensor_name.split('/') - dash_split = slash_split[-1].split('-') - gsplit = dash_split[0].split('_') - - gpos = dash_split[-1] - chrom = gpos.split('_')[0] - pos = os.path.splitext(gpos.split('_')[1])[0] - pos_str = chrom + '_' + pos - - for i,p in enumerate(gsplit): - if p == 'allele': - pos_str += '_'+str(gsplit[i+1]) - - return pos_str - - -def get_path_to_train_valid_or_test(path, valid_ratio=0.1, test_ratio=0.2, valid_contig='-19_', test_contig='-20_'): - dice = np.random.rand() - if dice < valid_ratio or valid_contig in path: - return os.path.join(path, 'valid/') - elif dice < valid_ratio+test_ratio or test_contig in path: - return os.path.join(path, 'test/') - else: - return os.path.join(path, 'train/') - - -def get_train_valid_test_paths(args): - train_dir = args.data_dir + 'train/' - valid_dir = args.data_dir + 'valid/' - test_dir = args.data_dir + 'test/' - train_paths = [train_dir + tp for tp in sorted(os.listdir(train_dir)) if os.path.isdir(train_dir + tp)] - valid_paths = [valid_dir + vp for vp in sorted(os.listdir(valid_dir)) if os.path.isdir(valid_dir + vp)] - test_paths = [test_dir + vp for vp in sorted(os.listdir(test_dir)) if os.path.isdir(test_dir + vp)] - - assert(len(train_paths) == len(valid_paths) == len(test_paths)) - - return train_paths, valid_paths, test_paths - - -def plain_name(full_name): - name = os.path.basename(full_name) - return name.split('.')[0] - -def get_vcf_reader(my_vcf): - if os.path.splitext(my_vcf)[-1].lower() == '.gz': - return vcf.Reader(open(my_vcf, 'rb')) - else: - return vcf.Reader(open(my_vcf, 'r')) - - -# Back to the top! -if "__main__" == __name__: - run() \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariantsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariantsIntegrationTest.java deleted file mode 100644 index 591a9db98e0..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariantsIntegrationTest.java +++ /dev/null @@ -1,303 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.vqsr; - -import htsjdk.variant.variantcontext.VariantContext; -import org.broadinstitute.hellbender.CommandLineProgramTest; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; -import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; -import org.broadinstitute.hellbender.utils.Utils; - -import org.broadinstitute.hellbender.utils.python.PythonScriptExecutorException; -import org.testng.Assert; -import org.testng.SkipException; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.IOException; - -import java.util.Collections; -import java.util.Iterator; -import java.util.stream.Collectors; - - -/** - * Integration tests for {@link CNNScoreVariants}. - * Created by sam on 1/8/18. 
- */ -public class CNNScoreVariantsIntegrationTest extends CommandLineProgramTest { - private static final String architecture1D = largeFileTestDir + "VQSR/cnn_ref_model/1d_cnn_mix_train_full_bn.json"; - private static final String weights1D = largeFileTestDir + "VQSR/cnn_ref_model/1d_cnn_mix_train_full_bn.hd5"; - private static final String architecture2D = largeFileTestDir + "VQSR/cnn_read_model/small_2d.json"; - private static final String weights2D = largeFileTestDir + "VQSR/cnn_read_model/small_2d.hd5"; - private static final String inputVCF = largeFileTestDir + "VQSR/recalibrated_chr20_start.vcf"; - private static final String bigInputVCF = largeFileTestDir + "VQSR/g94982_20_1m_10m_python_2dcnn.vcf.gz"; - private static final String inputBAM = largeFileTestDir + "VQSR/g94982_contig_20_start_bamout.bam"; - private static final String inputIntervals = largeFileTestDir + "VQSR/contig20_conf_1m_10m.interval_list"; - private static final double EPSILON = 0.01; - - @Test(expectedExceptions = RuntimeException.class) - public void testRequirePythonEnvironment() throws IOException { - // This test is deliberately left out of the "python" test group in order to ensure that - // it only executes when the Python environment has *NOT* been properly established. Also, - // skip this test if we're running on the Docker because the Python environment is always - // activated there. - if (isGATKDockerContainer()) { - throw new SkipException("Python environment validation test must be skipped when running on the Docker"); - } - - // Re-running the "testAllDefaultArgs" test should throw when run outside of the GATK Python environment - testAllDefaultArgs(); - } - /** - * Run the tool on a small test VCF. - */ - @Test(groups = {"python"}, enabled = false) - public void testAllDefaultArgs() { - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(largeFileTestDir + "VQSR/expected/cnn_1d_chr20_subset_expected.vcf"); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_1D_KEY); - } - - @Test(groups = {"python"}, expectedExceptions = PythonScriptExecutorException.class, enabled = false) - public void testExceptionDuringAsyncBatch() { - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - final File tempVcf = createTempFile("tester", ".vcf"); - // the last variant in this vcf has a value of "." 
for the float attributes in the default CNN - // annotation set MQ, MQRankSum, ReadPosRankSum, SOR, VQSLOD, and QD - //TODO: move this into the large resources dir - final File malformedVCF = new File("src/test/resources/cnn_1d_chr20_subset_expected.badAnnotations.vcf"); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, malformedVCF) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - runCommandLine(argsBuilder); - } - - @Test(groups = {"python"}, enabled = false) - public void testInferenceArchitecture() { - final boolean newExpectations = false; - final String expectedVCFName = largeFileTestDir + "VQSR/expected/cnn_1d_chr20_subset_expected.vcf"; - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("architecture", architecture1D) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - if (newExpectations) { - argsBuilder.add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, expectedVCFName); - runCommandLine(argsBuilder); - } else { - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(expectedVCFName); - argsBuilder.add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()); - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_1D_KEY); - } - } - - @Test(groups = {"python"}, enabled = false) - public void testInferenceWeights() { - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(largeFileTestDir + "VQSR/expected/cnn_1d_chr20_subset_expected.vcf"); - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("weights", weights1D) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_1D_KEY); - } - - @Test(groups = {"python"}, enabled = false) - public void testInferenceArchitectureAndWeights() { - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(largeFileTestDir + "VQSR/expected/cnn_1d_chr20_subset_expected.vcf"); - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("weights", weights1D) - .add("architecture", architecture1D) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_1D_KEY); - } - - @Test(groups = {"python"}, enabled = false) - public void testInferenceWithIntervals() { - final boolean newExpectations = false; - final String expectedVCFName = largeFileTestDir + "VQSR/expected/cnn_1d_contig20_1m_10m_expected.vcf"; - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - 
argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, bigInputVCF) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add(StandardArgumentDefinitions.INTERVALS_LONG_NAME, inputIntervals) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - if (newExpectations) { - argsBuilder.add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, expectedVCFName); - runCommandLine(argsBuilder); - } else { - final File expectedVcf = new File(expectedVCFName); - final File tempVcf = createTempFile("tester", ".vcf"); - argsBuilder.add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()); - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_1D_KEY); - } - } - - @Test(groups = {"python"}, enabled = false) - public void testSmallBatchInference() { - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(largeFileTestDir + "VQSR/expected/cnn_1d_chr20_subset_expected.vcf"); - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("inference-batch-size", "8") - .add("transfer-batch-size", "16") - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_1D_KEY); - } - - @Test(groups = {"python"}, enabled = false) - public void testOnContigEdge() { - final String edgeVcf = toolsTestDir + "walkers/VQSR/variantNearContigEdge.vcf"; - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(largeFileTestDir + "VQSR/expected/chrM.vcf"); - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, edgeVcf) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, hg19MiniReference) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_1D_KEY); - } - - /** - * Run the 2D Model on a small test VCF with the resource loaded weights and architecture. - */ - @Test(groups = {"python"}, enabled = false) - public void testInference2dResourceModel() { - // We reset the random number generator at the beginning of each test so that the random down-sampling of reads - // by the reservoir down-sampler does not cause slightly different scores. 
- Utils.resetRandomGenerator(); - TensorType tt = TensorType.read_tensor; - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(largeFileTestDir + "VQSR/expected/cnn_2d_chr20_subset_expected.vcf"); - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.INPUT_LONG_NAME, inputBAM) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("inference-batch-size", "2") - .add("transfer-batch-size", "2") - .add("tensor-type", tt.name()) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_2D_KEY); - } - - /** - * Run the 2D Model on a small test VCF. - */ - @Test(groups = {"python"}, enabled = false) - public void testInferenceArchitecture2d() { - Utils.resetRandomGenerator(); - final boolean newExpectations = false; - TensorType tt = TensorType.read_tensor; - final String expectedVCFName = largeFileTestDir + "VQSR/expected/cnn_2d_chr20_subset_expected.vcf"; - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.INPUT_LONG_NAME, inputBAM) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("architecture", architecture2D) - .add("tensor-type", tt.name()) - .add("inference-batch-size", "8") - .add("transfer-batch-size", "8") - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - if (newExpectations) { - argsBuilder.add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, expectedVCFName); - runCommandLine(argsBuilder); - } else { - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(expectedVCFName); - argsBuilder.add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()); - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_2D_KEY); - } - } - - @Test(groups = {"python"}, enabled = false) - public void testInferenceWeights2d() { - Utils.resetRandomGenerator(); - TensorType tt = TensorType.read_tensor; - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(largeFileTestDir + "VQSR/expected/cnn_2d_chr20_subset_expected.vcf"); - final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.INPUT_LONG_NAME, inputBAM) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("weights", weights2D) - .add("inference-batch-size", "4") - .add("transfer-batch-size", "4") - .add("tensor-type", tt.name()) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_2D_KEY); - } - - @Test(groups = {"python"}, enabled = false) - public void testInferenceArchitectureAndWeights2d() { - Utils.resetRandomGenerator(); - TensorType tt = TensorType.read_tensor; - final File tempVcf = createTempFile("tester", ".vcf"); - final File expectedVcf = new File(largeFileTestDir + "VQSR/expected/cnn_2d_chr20_subset_expected.vcf"); - final 
ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); - argsBuilder.add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.INPUT_LONG_NAME, inputBAM) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, tempVcf.getPath()) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("weights", weights2D) - .add("architecture", architecture2D) - .add("inference-batch-size", "4") - .add("transfer-batch-size", "4") - .add("tensor-type", tt.name()) - .add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"); - - runCommandLine(argsBuilder); - assertInfoFieldsAreClose(tempVcf, expectedVcf, GATKVCFConstants.CNN_2D_KEY); - } - - private void assertInfoFieldsAreClose(File actualVcf, File expectedVcf, String infoKey){ - Iterator expectedVi = VariantContextTestUtils.streamVcf(expectedVcf).collect(Collectors.toList()).iterator(); - Iterator actualVi = VariantContextTestUtils.streamVcf(actualVcf).collect(Collectors.toList()).iterator(); - while (expectedVi.hasNext() && actualVi.hasNext()) { - VariantContext expectedVc = expectedVi.next(); - VariantContext actualVc = actualVi.next(); - double expectedScore = expectedVc.getAttributeAsDouble(infoKey, 0.0); // Different defaults trigger failures on missing scores - double actualScore = actualVc.getAttributeAsDouble(infoKey, EPSILON+1.0); - double diff = Math.abs(expectedScore-actualScore); - Assert.assertTrue(diff < EPSILON); - VariantContextTestUtils.assertVariantContextsAreEqual(actualVc, expectedVc, Collections.singletonList(infoKey), Collections.emptyList()); - } - Assert.assertTrue(!expectedVi.hasNext() && !actualVi.hasNext()); - } -} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java deleted file mode 100644 index c5504b9bbc0..00000000000 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNVariantPipelineTest.java +++ /dev/null @@ -1,106 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.vqsr; - -import org.broadinstitute.hellbender.GATKBaseTest; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; -import org.broadinstitute.hellbender.Main; -import org.testng.annotations.Test; - -import java.io.File; - - -public class CNNVariantPipelineTest extends GATKBaseTest { - final private static String inputVCF = largeFileTestDir + "VQSR/g94982_b37_chr20_1m_10m.vcf"; - final private static String truthVCF = largeFileTestDir + "VQSR/giab_chr20_1m_10m.vcf.gz"; - final private static String truthBED = largeFileTestDir + "VQSR/giab_na12878_confident_chr20_1m_10m.bed"; - - private static File readTensorDir; - private static File referenceTensorDir; - - @Test(groups = {"python"}, enabled = false) - public static void makeTempDirectories() { - readTensorDir = createTempDir("readTensorDir"); - referenceTensorDir = createTempDir("referenceTensorDir"); - } - - @Test(groups = {"python"}, dependsOnMethods = {"makeTempDirectories"}, enabled = false) - public void testGenerateReferenceTensors() { - final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addRaw("CNNVariantWriteTensors") - .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("truth-vcf", truthVCF) - .add("truth-bed", truthBED) - .add("tensor-type", TensorType.reference.name()) 
- .add("output-tensor-dir", referenceTensorDir.toString()); - - new Main().instanceMain(args.getArgsArray()); - } - - @Test(groups = {"python"}, dependsOnMethods = {"makeTempDirectories"}, enabled = false) - public void testGenerateReadTensors() { - final String bamFile = largeFileTestDir + "VQSR/g94982_b37_chr20_1m_8m_bamout.bam"; - final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addRaw("CNNVariantWriteTensors") - .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, inputVCF) - .add(StandardArgumentDefinitions.REFERENCE_LONG_NAME, b37_reference_20_21) - .add("truth-vcf", truthVCF) - .add("truth-bed", truthBED) - .add("bam-file", bamFile) - .add("max-tensors", "4000") - .add("tensor-type", TensorType.read_tensor.name()) - .add("output-tensor-dir", readTensorDir.toString()) - .add("channels-last", "true"); - - new Main().instanceMain(args.getArgsArray()); - } - - @Test(groups = {"python"}, dependsOnMethods = {"testGenerateReferenceTensors"}, enabled = false) - public void testTrainingReferenceModel() { - final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addRaw("CNNVariantTrain") - .add("input-tensor-dir", referenceTensorDir.toString()+"/") - .add("tensor-type", TensorType.reference.name()) - .add("epochs", "1") - .add("training-steps", "30") - .add("model-name", "test_reference_model") - .add("output-dir", referenceTensorDir.toString()+"/"); - - new Main().instanceMain(args.getArgsArray()); - } - - @Test(groups = {"python"}, dependsOnMethods = {"testGenerateReadTensors"}, enabled = false) - public void testTrainingReadModel() { - final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addRaw("CNNVariantTrain") - .add("input-tensor-dir", readTensorDir.toString()+"/") - .add("tensor-type", TensorType.read_tensor.name()) - .add("epochs", "1") - .add("training-steps", "5") - .add("validation-steps", "2") - .add("model-name", "test_read_tensor_model") - .add("output-dir", readTensorDir.toString()+"/") - .add("channels-last", "true"); - - new Main().instanceMain(args.getArgsArray()); - } - - @Test(groups = {"python"}, enabled = false) - public void testTranches() { - final String trancheVCF = largeFileTestDir + "VQSR/g94982_b37_chr20_1m_10m.vcf.gz"; - final String snpTruthVCF = largeFileTestDir + "VQSR/giab_chr20_1m_10m.vcf.gz"; - final String indelTruthVCF = largeFileTestDir + "VQSR/giab_chr20_1m_10m.vcf.gz"; - final File outputVCF = createTempFile("variant_tranches_output", "vcf"); - final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addRaw("FilterVariantTranches") - .add(StandardArgumentDefinitions.VARIANT_LONG_NAME, trancheVCF) - .add(StandardArgumentDefinitions.OUTPUT_LONG_NAME, outputVCF.getAbsolutePath()) - .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME, snpTruthVCF) - .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME, indelTruthVCF) - .add("snp-tranche", "99.9") - .add("indel-tranche", "99.0") - .add("info-key", "VQSLOD"); - - new Main().instanceMain(args.getArgsArray()); - } -} diff --git a/src/test/resources/large/VQSR/cnn_read_model/small_2d.hd5 b/src/test/resources/large/VQSR/cnn_read_model/small_2d.hd5 deleted file mode 100644 index deb36d22e04..00000000000 --- a/src/test/resources/large/VQSR/cnn_read_model/small_2d.hd5 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f663a2fdbcde0addc5cb755f7af5d4c19bed92dccfd20e25b2acf2bc8c2ca7c -size 2163096 diff --git a/src/test/resources/large/VQSR/cnn_read_model/small_2d.json b/src/test/resources/large/VQSR/cnn_read_model/small_2d.json deleted file mode 100644 
index c35cfbdfcae..00000000000 --- a/src/test/resources/large/VQSR/cnn_read_model/small_2d.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e38e09cfe7b7ffbc80dce4972bc9c382148520147d46738a3f6f3235b2d876c6 -size 758 diff --git a/src/test/resources/large/VQSR/cnn_ref_model/1d_cnn_mix_train_full_bn.hd5 b/src/test/resources/large/VQSR/cnn_ref_model/1d_cnn_mix_train_full_bn.hd5 deleted file mode 100644 index 48581550b03..00000000000 --- a/src/test/resources/large/VQSR/cnn_ref_model/1d_cnn_mix_train_full_bn.hd5 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd17c3a98f7651b4e7ee54d875c47ec12e18b75daf79b3744a2590ddb0d6b44d -size 20227144 diff --git a/src/test/resources/large/VQSR/cnn_ref_model/1d_cnn_mix_train_full_bn.json b/src/test/resources/large/VQSR/cnn_ref_model/1d_cnn_mix_train_full_bn.json deleted file mode 100644 index 666a73dfb9a..00000000000 --- a/src/test/resources/large/VQSR/cnn_ref_model/1d_cnn_mix_train_full_bn.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eda2517817b23238c2b28f69a1fa39e9b85b45985854f0a5d5508280e76da39e -size 519 diff --git a/src/test/resources/large/VQSR/expected/cnn_1d_contig20_1m_10m_expected.vcf b/src/test/resources/large/VQSR/expected/cnn_1d_contig20_1m_10m_expected.vcf deleted file mode 100644 index cd0d76e99d3..00000000000 --- a/src/test/resources/large/VQSR/expected/cnn_1d_contig20_1m_10m_expected.vcf +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0948096cb6dc127cc08a8295f3d879992600c68f5f9e015bca22aa98643af5d -size 4036521 diff --git a/src/test/resources/large/VQSR/expected/cnn_1d_contig20_1m_10m_expected.vcf.idx b/src/test/resources/large/VQSR/expected/cnn_1d_contig20_1m_10m_expected.vcf.idx deleted file mode 100644 index 33ab8e404b9..00000000000 --- a/src/test/resources/large/VQSR/expected/cnn_1d_contig20_1m_10m_expected.vcf.idx +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f1376064a2f8dde0eb90b88b7f71e9a82e6c72067e95517d8cd3f7f5c1385b8 -size 12083
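
Editor's note — illustrative sketch only, not part of this patch: the deleted training.py above encodes read bases with Phred-derived probabilities (`base_quality_to_p_hot_array` / `base_quality_to_phred_array`), using p = 1 - 10^(-Q/10) for the called base and splitting the residual error mass evenly across the other three bases. The minimal, self-contained Python below restates that arithmetic under assumptions: the `DNA_SYMBOLS` mapping is a stand-in for the real one in the removed vqsr_cnn `defines` module, and the indel channel handled by the original code is omitted.

# Standalone sketch of the base-quality encodings used by the removed training.py.
# Assumptions: DNA_SYMBOLS mirrors the 4-channel DNA map from vqsr_cnn.defines;
# the original code's extra indel-character channel is left out for brevity.
import numpy as np

DNA_SYMBOLS = {'A': 0, 'C': 1, 'G': 2, 'T': 3}  # hypothetical stand-in for defines.DNA_SYMBOLS

def base_quality_to_p_hot(base_quality: float, base: str) -> np.ndarray:
    """'phot' mode: probability vector over A/C/G/T for one called base."""
    p_correct = 1.0 - 10.0 ** (-base_quality / 10.0)   # Phred Q -> probability correct
    p_error_each = (1.0 - p_correct) / 3.0             # error split over the other 3 bases
    phot = np.full(4, p_error_each)
    phot[DNA_SYMBOLS[base]] = p_correct
    return phot

def base_quality_to_phred(base_quality: float, base: str) -> np.ndarray:
    """'phred' mode: per-base Phred scores; non-called bases get the back-converted error quality."""
    p_correct = 1.0 - 10.0 ** (-base_quality / 10.0)
    p_error_each = (1.0 - p_correct) / 3.0
    not_base_quality = -10.0 * np.log10(p_error_each)  # back to the Phred scale
    phred = np.full(4, not_base_quality)
    phred[DNA_SYMBOLS[base]] = base_quality
    return phred

if __name__ == '__main__':
    # Q30 'A': roughly 0.999 on the A channel and ~0.000333 on each of C/G/T,
    # i.e. about Phred 34.8 for the non-called bases in 'phred' mode.
    print(base_quality_to_p_hot(30, 'A'))
    print(base_quality_to_phred(30, 'A'))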